From 6f3c5f1c23c25fa439a9f656afef0a0b3495ffac Mon Sep 17 00:00:00 2001 From: limm Date: Thu, 11 Jul 2024 17:25:35 +0800 Subject: [PATCH] support v1.4.0 --- CONTRIBUTING.md | 261 +-- CONTRIBUTING_zh-CN.md | 274 --- Dockerfile | 7 + LICENSES.md | 11 +- MANIFEST.in | 3 +- README.md | 15 +- README_ORIGIN.md | 253 +- README_zh-CN.md | 255 +- TERMINOLOGY.md | 48 +- docker/README.md | 70 - docker/dev/Dockerfile | 31 - docker/release/Dockerfile | 23 - docs/{en => }/Makefile | 0 docs/{en => }/_static/community/1.png | Bin docs/{en => }/_static/community/2.png | Bin docs/{en => }/_static/community/3.png | Bin docs/{en => }/_static/css/readthedocs.css | 4 - docs/{en => }/_static/flow_img2toimg1.png | Bin docs/{en => }/_static/flow_raw_images.png | Bin docs/{en => }/_static/flow_visualization.png | Bin docs/{en => }/_static/flow_warp.png | Bin docs/{en => }/_static/flow_warp_diff.png | Bin docs/{en => }/_static/image/mmcv-logo.png | Bin docs/{en => }/_static/parallel_progress.gif | Bin docs/{en => }/_static/parallel_progress.png | Bin docs/{en => }/_static/progress.gif | Bin docs/{en => }/_static/progress.png | Bin docs/_static/qq_group_qrcode.jpg | Bin 0 -> 71955 bytes docs/_static/zhihu_qrcode.jpg | Bin 0 -> 397245 bytes docs/api.rst | 44 + docs/community/contributing.md | 1 + docs/community/pr.md | 94 + docs/{en => }/compatibility.md | 0 docs/{zh_cn => }/conf.py | 128 +- .../deployment/mmcv_ops_definition.md | 130 +- docs/deployment/onnx.md | 19 + docs/deployment/onnxruntime_custom_ops.md | 378 +++ docs/deployment/onnxruntime_op.md | 126 + docs/deployment/tensorrt_custom_ops.md | 395 ++++ docs/deployment/tensorrt_plugin.md | 178 ++ docs/en/_static/version.json | 575 ----- docs/en/_templates/classtemplate.rst | 14 - docs/en/api/arraymisc.rst | 19 - docs/en/api/cnn.rst | 71 - docs/en/api/image.rst | 100 - docs/en/api/ops.rst | 135 -- docs/en/api/transforms.rst | 60 - docs/en/api/utils.rst | 23 - docs/en/api/video.rst | 56 - docs/en/api/visualization.rst | 50 - 
docs/en/community/contributing.md | 267 --- docs/en/community/pr.md | 3 - docs/en/docutils.conf | 2 - docs/en/faq.md | 93 - docs/en/get_started/build.md | 292 --- docs/en/get_started/installation.md | 348 --- docs/en/get_started/introduction.md | 36 - docs/en/switch_language.md | 3 - docs/en/understand_mmcv/cnn.md | 120 - docs/en/understand_mmcv/data_transform.md | 341 --- docs/en/understand_mmcv/ops.md | 66 - docs/faq.md | 42 + docs/get_started/build.md | 234 ++ docs/get_started/installation.md | 162 ++ docs/get_started/introduction.md | 29 + .../{en => }/get_started/previous_versions.md | 4 +- docs/{en => }/index.rst | 29 +- docs/{en => }/make.bat | 0 docs/{en => }/mmcv-logo.png | Bin docs/understand_mmcv/cnn.md | 538 +++++ docs/understand_mmcv/config.md | 200 ++ docs/{en => }/understand_mmcv/data_process.md | 24 +- docs/understand_mmcv/io.md | 247 ++ docs/understand_mmcv/ops.md | 37 + docs/understand_mmcv/registry.md | 155 ++ docs/understand_mmcv/runner.md | 163 ++ docs/understand_mmcv/utils.md | 74 + .../{en => }/understand_mmcv/visualization.md | 0 docs/zh_cn/_static/version.json | 575 ----- docs/zh_cn/_templates/classtemplate.rst | 14 - docs/zh_cn/api/arraymisc.rst | 19 - docs/zh_cn/api/cnn.rst | 71 - docs/zh_cn/api/image.rst | 100 - docs/zh_cn/api/ops.rst | 135 -- docs/zh_cn/api/transforms.rst | 60 - docs/zh_cn/api/utils.rst | 23 - docs/zh_cn/api/video.rst | 56 - docs/zh_cn/api/visualization.rst | 50 - docs/zh_cn/community/code_style.md | 609 ----- docs/zh_cn/community/contributing.md | 278 --- docs/zh_cn/community/pr.md | 3 - docs/zh_cn/docutils.conf | 2 - docs/zh_cn/faq.md | 91 - docs/zh_cn/get_started/article.md | 63 - docs/zh_cn/get_started/build.md | 300 --- docs/zh_cn/get_started/installation.md | 369 --- docs/zh_cn/get_started/introduction.md | 36 - docs/zh_cn/switch_language.md | 3 - docs/zh_cn/understand_mmcv/cnn.md | 114 - docs/zh_cn/understand_mmcv/data_transform.md | 341 --- docs/zh_cn/understand_mmcv/ops.md | 66 - {docs/zh_cn => 
docs_zh_CN}/Makefile | 0 .../_static/css/readthedocs.css | 4 - .../_static/image/mmcv-logo.png | Bin docs_zh_CN/api.rst | 44 + docs_zh_CN/community/contributing.md | 69 + docs_zh_CN/community/pr.md | 90 + {docs/zh_cn => docs_zh_CN}/compatibility.md | 0 {docs/en => docs_zh_CN}/conf.py | 132 +- docs_zh_CN/deployment/onnx.md | 19 + .../deployment/onnxruntime_custom_ops.md | 333 +++ docs_zh_CN/deployment/onnxruntime_op.md | 127 + docs_zh_CN/deployment/tensorrt_custom_ops.md | 391 ++++ docs_zh_CN/deployment/tensorrt_plugin.md | 177 ++ docs_zh_CN/faq.md | 37 + docs_zh_CN/get_started/build.md | 222 ++ docs_zh_CN/get_started/installation.md | 158 ++ docs_zh_CN/get_started/introduction.md | 30 + .../get_started/previous_versions.md | 5 +- {docs/zh_cn => docs_zh_CN}/index.rst | 30 +- {docs/zh_cn => docs_zh_CN}/make.bat | 0 {docs/zh_cn => docs_zh_CN}/mmcv-logo.png | 0 docs_zh_CN/understand_mmcv/cnn.md | 525 +++++ docs_zh_CN/understand_mmcv/config.md | 176 ++ .../understand_mmcv/data_process.md | 18 +- docs_zh_CN/understand_mmcv/io.md | 240 ++ docs_zh_CN/understand_mmcv/ops.md | 36 + docs_zh_CN/understand_mmcv/registry.md | 149 ++ docs_zh_CN/understand_mmcv/runner.md | 155 ++ docs_zh_CN/understand_mmcv/utils.md | 69 + .../understand_mmcv/visualization.md | 0 examples/train.py | 84 + mmcv/__init__.py | 6 +- mmcv/arraymisc/quantization.py | 22 +- mmcv/cnn/__init__.py | 35 +- mmcv/cnn/alexnet.py | 12 +- mmcv/cnn/bricks/__init__.py | 13 +- mmcv/cnn/bricks/activation.py | 50 +- mmcv/cnn/bricks/context_block.py | 25 +- mmcv/cnn/bricks/conv.py | 27 +- mmcv/cnn/bricks/conv2d_adaptive_padding.py | 25 +- mmcv/cnn/bricks/conv_module.py | 58 +- mmcv/cnn/bricks/conv_ws.py | 77 +- .../bricks/depthwise_separable_conv_module.py | 43 +- mmcv/cnn/bricks/drop.py | 26 +- mmcv/cnn/bricks/generalized_attention.py | 33 +- mmcv/cnn/bricks/hsigmoid.py | 34 +- mmcv/cnn/bricks/hswish.py | 22 +- mmcv/cnn/bricks/non_local.py | 66 +- mmcv/cnn/bricks/norm.py | 57 +- mmcv/cnn/bricks/padding.py | 27 +- 
mmcv/cnn/bricks/plugin.py | 40 +- mmcv/cnn/bricks/registry.py | 16 + mmcv/cnn/bricks/scale.py | 42 +- mmcv/cnn/bricks/swish.py | 9 +- mmcv/cnn/bricks/transformer.py | 432 +--- mmcv/cnn/bricks/upsample.py | 38 +- mmcv/cnn/bricks/wrappers.py | 37 +- mmcv/cnn/builder.py | 30 + mmcv/cnn/resnet.py | 103 +- mmcv/cnn/rfsearch/__init__.py | 5 - mmcv/cnn/rfsearch/operator.py | 169 -- mmcv/cnn/rfsearch/search.py | 239 -- mmcv/cnn/rfsearch/utils.py | 68 - mmcv/cnn/utils/__init__.py | 16 +- mmcv/cnn/utils/flops_counter.py | 175 +- mmcv/cnn/utils/fuse_conv_bn.py | 4 +- mmcv/cnn/utils/sync_bn.py | 59 + mmcv/cnn/utils/weight_init.py | 684 ++++++ mmcv/cnn/vgg.py | 53 +- mmcv/engine/__init__.py | 8 + mmcv/engine/test.py | 202 ++ mmcv/fileio/__init__.py | 11 + mmcv/fileio/file_client.py | 1148 +++++++++ mmcv/fileio/handlers/__init__.py | 7 + mmcv/fileio/handlers/base.py | 30 + mmcv/fileio/handlers/json_handler.py | 36 + mmcv/fileio/handlers/pickle_handler.py | 28 + mmcv/fileio/handlers/yaml_handler.py | 24 + mmcv/fileio/io.py | 151 ++ mmcv/fileio/parse.py | 97 + mmcv/image/__init__.py | 11 +- mmcv/image/colorspace.py | 37 +- mmcv/image/geometric.py | 242 +- mmcv/image/io.py | 202 +- mmcv/image/misc.py | 32 +- mmcv/image/photometric.py | 207 +- mmcv/model_zoo/deprecated.json | 6 + mmcv/model_zoo/mmcls.json | 59 + mmcv/model_zoo/open_mmlab.json | 50 + mmcv/onnx/__init__.py | 5 + mmcv/onnx/info.py | 21 + mmcv/onnx/onnx_utils/__init__.py | 1 + mmcv/onnx/onnx_utils/symbolic_helper.py | 331 +++ mmcv/onnx/symbolic.py | 496 ++++ mmcv/ops/__init__.py | 64 +- mmcv/ops/active_rotated_filter.py | 64 - mmcv/ops/assign_score_withk.py | 26 +- mmcv/ops/ball_query.py | 78 +- mmcv/ops/bbox.py | 70 +- mmcv/ops/bezier_align.py | 137 -- mmcv/ops/bias_act.py | 375 --- mmcv/ops/border_align.py | 33 +- mmcv/ops/box_iou_quadri.py | 49 - mmcv/ops/box_iou_rotated.py | 123 +- mmcv/ops/carafe.py | 77 +- mmcv/ops/cc_attention.py | 20 +- mmcv/ops/chamfer_distance.py | 93 - mmcv/ops/contour_expand.py | 17 +- 
mmcv/ops/conv2d_gradfix.py | 346 --- mmcv/ops/convex_iou.py | 52 - mmcv/ops/corner_pool.py | 148 +- mmcv/ops/correlation.py | 22 +- mmcv/ops/csrc/README.md | 216 +- .../ops/csrc/common/box_iou_rotated_utils.hpp | 83 - .../active_rotated_filter_cuda_kernel.cuh | 59 - .../cuda/assign_score_withk_cuda_kernel.cuh | 132 +- .../common/cuda/ball_query_cuda_kernel.cuh | 49 +- .../common/cuda/bbox_overlaps_cuda_kernel.cuh | 175 +- .../common/cuda/bezier_align_cuda_kernel.cuh | 230 -- .../csrc/common/cuda/box_iou_quadri_cuda.cuh | 91 - .../csrc/common/cuda/carafe_cuda_kernel.cuh | 18 +- .../cuda/chamfer_distance_cuda_kernel.cuh | 101 - .../csrc/common/cuda/common_cuda_helper.hpp | 16 +- .../common/cuda/convex_iou_cuda_kernel.cuh | 831 ------- .../ops/csrc/common/cuda/correlation_cuda.cuh | 202 +- .../cuda/diff_iou_rotated_cuda_kernel.cuh | 137 -- .../common/cuda/gather_points_cuda_kernel.cuh | 28 +- .../common/cuda/group_points_cuda_kernel.cuh | 36 +- .../csrc/common/cuda/iou3d_cuda_kernel.cuh | 290 +-- mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh | 61 +- .../common/cuda/min_area_polygons_cuda.cuh | 300 --- .../cuda/ms_deform_attn_cuda_kernel.cuh | 137 +- mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh | 111 +- mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh | 141 -- .../ops/csrc/common/cuda/nms_rotated_cuda.cuh | 14 +- .../cuda/points_in_boxes_cuda_kernel.cuh | 48 +- .../cuda/points_in_polygons_cuda_kernel.cuh | 79 - .../common/cuda/prroi_pool_cuda_kernel.cuh | 381 --- .../cuda/riroi_align_rotated_cuda_kernel.cuh | 242 -- .../cuda/roi_align_rotated_cuda_kernel.cuh | 16 +- .../cuda/roiaware_pool3d_cuda_kernel.cuh | 252 +- .../cuda/roipoint_pool3d_cuda_kernel.cuh | 116 +- .../rotated_feature_align_cuda_kernel.cuh | 129 -- .../cuda/scatter_points_cuda_kernel.cuh | 4 +- mmcv/ops/csrc/common/cuda/spconv/indice.cuh | 236 -- .../csrc/common/cuda/spconv/reordering.cuh | 160 -- .../cuda/stack_ball_query_cuda_kernel.cuh | 68 - .../cuda/stack_group_points_cuda_kernel.cuh | 97 - 
.../cuda/three_interpolate_cuda_kernel.cuh | 42 +- .../csrc/common/cuda/three_nn_cuda_kernel.cuh | 75 +- .../common/cuda/voxelization_cuda_kernel.cuh | 57 +- .../common/mlu/bbox_overlaps_mlu_kernel.mlu | 322 --- .../ops/csrc/common/mlu/carafe_mlu_kernel.mlu | 552 ----- mmcv/ops/csrc/common/mlu/carafe_utils.hpp | 95 - .../ops/csrc/common/mlu/common_mlu_helper.hpp | 398 ---- .../common/mlu/deform_roi_pool_mlu_kernel.mlu | 712 ------ .../mlu/focal_loss_sigmoid_mlu_kernel.mlu | 888 ------- mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu | 431 ---- mmcv/ops/csrc/common/mlu/iou3d_utils.hpp | 695 ------ .../common/mlu/masked_conv2d_mlu_kernel.mlu | 181 -- .../common/mlu/ms_deform_attn_mlu_kernel.mlu | 853 ------- mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu | 483 ---- mmcv/ops/csrc/common/mlu/nms_utils.hpp | 553 ----- .../csrc/common/mlu/psamask_mlu_kernel.mlu | 615 ----- mmcv/ops/csrc/common/mlu/psamask_utils.hpp | 55 - .../csrc/common/mlu/roi_align_mlu_kernel.mlu | 493 ---- .../mlu/roi_align_rotated_mlu_kernel.mlu | 490 ---- .../common/mlu/roi_align_rotated_utils.hpp | 24 - .../csrc/common/mlu/roi_pool_mlu_kernel.mlu | 747 ------ .../common/mlu/roiaware_pool3d_mlu_kernel.mlu | 747 ------ ...oint_pool3d_large_boxes_num_mlu_kernel.mlu | 536 ----- .../common/mlu/roipoint_pool3d_mlu_kernel.mlu | 544 ----- .../csrc/common/mlu/three_nn_mlu_kernel.mlu | 466 ---- .../csrc/common/mlu/tin_shift_mlu_kernel.mlu | 307 --- mmcv/ops/csrc/common/mps/MPSDevice.h | 64 - mmcv/ops/csrc/common/mps/MPSLibrary.h | 61 - mmcv/ops/csrc/common/mps/MPSLibrary.mm | 107 - mmcv/ops/csrc/common/mps/MPSStream.h | 132 -- mmcv/ops/csrc/common/mps/MPSUtils.h | 51 - mmcv/ops/csrc/common/pytorch_cpp_helper.hpp | 11 +- mmcv/ops/csrc/common/pytorch_cuda_helper.hpp | 1 - mmcv/ops/csrc/common/pytorch_mlu_helper.hpp | 61 - mmcv/ops/csrc/common/pytorch_npu_helper.hpp | 35 - .../ops/csrc/common/utils/spconv/paramsgrid.h | 70 - .../csrc/common/utils/spconv/prettyprint.h | 493 ---- 
.../csrc/common/utils/spconv/pybind11_utils.h | 60 - .../common/utils/spconv/spconv/geometry.h | 295 --- .../csrc/common/utils/spconv/spconv/indice.h | 78 - .../csrc/common/utils/spconv/spconv/maxpool.h | 37 - .../common/utils/spconv/spconv/mp_helper.h | 50 - .../common/utils/spconv/spconv/point2voxel.h | 385 ---- .../common/utils/spconv/spconv/reordering.h | 36 - .../utils/spconv/tensorview/helper_kernel.cuh | 75 - .../utils/spconv/tensorview/helper_launch.h | 19 - .../utils/spconv/tensorview/tensorview.h | 1119 --------- mmcv/ops/csrc/onnxruntime/corner_pool.h | 46 + mmcv/ops/csrc/onnxruntime/cpu/corner_pool.cpp | 123 + mmcv/ops/csrc/onnxruntime/cpu/deform_conv.cpp | 263 +++ mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp | 314 +++ .../onnxruntime/cpu/modulated_deform_conv.cpp | 292 +++ mmcv/ops/csrc/onnxruntime/cpu/nms.cpp | 108 + .../onnxruntime/cpu/onnxruntime_register.cpp | 81 + mmcv/ops/csrc/onnxruntime/cpu/reduce_ops.cpp | 188 ++ mmcv/ops/csrc/onnxruntime/cpu/roi_align.cpp | 265 +++ .../onnxruntime/cpu/roi_align_rotated.cpp | 247 ++ mmcv/ops/csrc/onnxruntime/cpu/soft_nms.cpp | 156 ++ mmcv/ops/csrc/onnxruntime/deform_conv.h | 57 + mmcv/ops/csrc/onnxruntime/grid_sample.h | 44 + .../csrc/onnxruntime/modulated_deform_conv.h | 61 + mmcv/ops/csrc/onnxruntime/nms.h | 45 + .../csrc/onnxruntime/onnxruntime_register.h | 16 + .../onnxruntime_session_options_config_keys.h | 44 + mmcv/ops/csrc/onnxruntime/ort_mmcv_utils.h | 15 + mmcv/ops/csrc/onnxruntime/reduce_ops.h | 95 + mmcv/ops/csrc/onnxruntime/roi_align.h | 62 + mmcv/ops/csrc/onnxruntime/roi_align_rotated.h | 62 + mmcv/ops/csrc/onnxruntime/soft_nms.h | 49 + .../csrc/parrots/active_rotated_filter.cpp | 28 - .../parrots/active_rotated_filter_parrots.cpp | 63 - .../parrots/active_rotated_filter_pytorch.h | 13 - mmcv/ops/csrc/parrots/assign_score_withk.cpp | 73 +- mmcv/ops/csrc/parrots/ball_query.cpp | 31 +- mmcv/ops/csrc/parrots/bbox_overlaps.cpp | 26 +- mmcv/ops/csrc/parrots/border_align.cpp | 58 +- 
.../ops/csrc/parrots/border_align_parrots.cpp | 2 - mmcv/ops/csrc/parrots/box_iou_rotated.cpp | 24 +- mmcv/ops/csrc/parrots/box_iou_rotated_cpu.cpp | 33 + mmcv/ops/csrc/parrots/carafe.cpp | 74 +- mmcv/ops/csrc/parrots/carafe_naive.cpp | 61 +- mmcv/ops/csrc/parrots/chamfer_distance.cpp | 35 - .../csrc/parrots/chamfer_distance_parrots.cpp | 51 - .../csrc/parrots/chamfer_distance_pytorch.h | 16 - mmcv/ops/csrc/parrots/contour_expand.cpp | 1 + mmcv/ops/csrc/parrots/convex_iou.cpp | 23 - mmcv/ops/csrc/parrots/convex_iou_parrots.cpp | 40 - mmcv/ops/csrc/parrots/convex_iou_pytorch.h | 11 - mmcv/ops/csrc/parrots/corner_pool.cpp | 240 ++ mmcv/ops/csrc/parrots/corner_pool_parrots.cpp | 234 ++ mmcv/ops/csrc/parrots/corner_pool_pytorch.h | 15 + mmcv/ops/csrc/parrots/correlation.cpp | 74 +- mmcv/ops/csrc/parrots/cudabind.cpp | 1677 -------------- mmcv/ops/csrc/parrots/deform_conv.cpp | 141 +- mmcv/ops/csrc/parrots/deform_conv_cpu.cpp | 377 +++ mmcv/ops/csrc/parrots/deform_roi_pool.cpp | 70 +- mmcv/ops/csrc/parrots/diff_iou_rotated.cpp | 14 - .../csrc/parrots/diff_iou_rotated_parrots.cpp | 28 - .../csrc/parrots/diff_iou_rotated_pytorch.h | 10 - mmcv/ops/csrc/parrots/focal_loss.cpp | 116 +- .../csrc/parrots/furthest_point_sample.cpp | 60 +- .../ops/csrc/parrots/fused_bias_leakyrelu.cpp | 132 +- mmcv/ops/csrc/parrots/gather_points.cpp | 51 +- mmcv/ops/csrc/parrots/group_points.cpp | 50 +- mmcv/ops/csrc/parrots/info.cpp | 65 - mmcv/ops/csrc/parrots/iou3d.cpp | 232 +- mmcv/ops/csrc/parrots/iou3d_parrots.cpp | 34 +- mmcv/ops/csrc/parrots/iou3d_pytorch.h | 12 +- mmcv/ops/csrc/parrots/knn.cpp | 27 +- mmcv/ops/csrc/parrots/masked_conv2d.cpp | 64 +- mmcv/ops/csrc/parrots/min_area_polygons.cpp | 11 - .../parrots/min_area_polygons_parrots.cpp | 26 - .../csrc/parrots/min_area_polygons_pytorch.h | 9 - .../csrc/parrots/modulated_deform_conv.cpp | 195 +- .../parrots/modulated_deform_conv_cpu.cpp | 403 ++++ mmcv/ops/csrc/parrots/ms_deform_attn.cpp | 65 +- mmcv/ops/csrc/parrots/nms.cpp | 260 ++- 
.../nms_rotated_cpu.cpp} | 28 +- mmcv/ops/csrc/parrots/pixel_group.cpp | 125 +- mmcv/ops/csrc/parrots/points_in_boxes.cpp | 88 +- mmcv/ops/csrc/parrots/points_in_boxes_cpu.cpp | 53 + mmcv/ops/csrc/parrots/points_in_polygons.cpp | 15 - .../parrots/points_in_polygons_parrots.cpp | 28 - .../csrc/parrots/points_in_polygons_pytorch.h | 9 - mmcv/ops/csrc/parrots/prroi_pool.cpp | 47 - mmcv/ops/csrc/parrots/prroi_pool_parrots.cpp | 97 - mmcv/ops/csrc/parrots/prroi_pool_pytorch.h | 19 - mmcv/ops/csrc/parrots/psamask.cpp | 241 +- mmcv/ops/csrc/parrots/riroi_align_rotated.cpp | 42 - .../parrots/riroi_align_rotated_parrots.cpp | 86 - .../parrots/riroi_align_rotated_pytorch.h | 18 - mmcv/ops/csrc/parrots/roi_align.cpp | 119 +- mmcv/ops/csrc/parrots/roi_align_cpu.cpp | 430 ++++ mmcv/ops/csrc/parrots/roi_align_rotated.cpp | 154 +- .../roi_align_rotated_cpu.cpp} | 280 +-- .../parrots/roi_align_rotated_parrots.cpp | 28 +- .../csrc/parrots/roi_align_rotated_pytorch.h | 12 +- mmcv/ops/csrc/parrots/roi_pool.cpp | 58 +- mmcv/ops/csrc/parrots/roiaware_pool3d.cpp | 111 +- mmcv/ops/csrc/parrots/roipoint_pool3d.cpp | 49 +- .../csrc/parrots/rotated_feature_align.cpp | 39 - .../parrots/rotated_feature_align_parrots.cpp | 99 - .../parrots/rotated_feature_align_pytorch.h | 17 - mmcv/ops/csrc/parrots/sync_bn.cpp | 136 +- mmcv/ops/csrc/parrots/three_interpolate.cpp | 54 +- mmcv/ops/csrc/parrots/three_nn.cpp | 26 +- mmcv/ops/csrc/parrots/tin_shift.cpp | 46 +- mmcv/ops/csrc/parrots/upfirdn2d.cpp | 130 +- mmcv/ops/csrc/parrots/voxelization.cpp | 83 +- mmcv/ops/csrc/parrots/voxelization_cpu.cpp | 152 ++ .../ops/csrc/parrots/voxelization_parrots.cpp | 9 +- mmcv/ops/csrc/parrots/voxelization_pytorch.h | 3 +- .../csrc/pytorch/active_rotated_filter.cpp | 28 - mmcv/ops/csrc/pytorch/ball_query.cpp | 18 - mmcv/ops/csrc/pytorch/bezier_align.cpp | 38 - mmcv/ops/csrc/pytorch/bias_act.cpp | 20 - mmcv/ops/csrc/pytorch/box_iou_quadri.cpp | 17 - mmcv/ops/csrc/pytorch/chamfer_distance.cpp | 35 - 
mmcv/ops/csrc/pytorch/contour_expand.cpp | 1 + mmcv/ops/csrc/pytorch/convex_iou.cpp | 23 - mmcv/ops/csrc/pytorch/corner_pool.cpp | 240 ++ .../pytorch/cpu/active_rotated_filter.cpp | 120 - .../csrc/pytorch/cpu/bbox_overlaps_cpu.cpp | 65 - mmcv/ops/csrc/pytorch/cpu/box_iou_quadri.cpp | 36 - mmcv/ops/csrc/pytorch/cpu/nms_rotated.cpp | 2 +- mmcv/ops/csrc/pytorch/cpu/pixel_group.cpp | 6 +- .../csrc/pytorch/cpu/roi_align_rotated.cpp | 9 +- .../pytorch/cpu/rotated_feature_align.cpp | 262 --- mmcv/ops/csrc/pytorch/cpu/sparse_indice.cpp | 84 - mmcv/ops/csrc/pytorch/cpu/sparse_maxpool.cpp | 82 - .../csrc/pytorch/cpu/sparse_reordering.cpp | 68 - mmcv/ops/csrc/pytorch/cpu/voxelization.cpp | 30 +- .../cuda/active_rotated_filter_cuda.cu | 58 - .../pytorch/cuda/assign_score_withk_cuda.cu | 6 +- mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.cu | 2 +- .../csrc/pytorch/cuda/bbox_overlaps_cuda.cu | 17 - .../csrc/pytorch/cuda/bezier_align_cuda.cu | 53 - mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu | 300 --- .../csrc/pytorch/cuda/box_iou_quadri_cuda.cu | 23 - .../pytorch/cuda/chamfer_distance_cuda.cu | 63 - mmcv/ops/csrc/pytorch/cuda/convex_iou.cu | 41 - .../ops/csrc/pytorch/cuda/correlation_cuda.cu | 43 +- mmcv/ops/csrc/pytorch/cuda/cudabind.cpp | 852 ++----- .../pytorch/cuda/diff_iou_rotated_cuda.cu | 35 - mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu | 2044 ----------------- .../pytorch/cuda/fused_spconv_ops_cuda.cu | 104 - .../csrc/pytorch/cuda/gather_points_cuda.cu | 4 +- .../csrc/pytorch/cuda/group_points_cuda.cu | 4 +- mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu | 90 +- mmcv/ops/csrc/pytorch/cuda/knn_cuda.cu | 2 +- .../csrc/pytorch/cuda/min_area_polygons.cu | 21 - .../csrc/pytorch/cuda/ms_deform_attn_cuda.cu | 22 +- mmcv/ops/csrc/pytorch/cuda/nms_cuda.cu | 39 +- mmcv/ops/csrc/pytorch/cuda/nms_quadri_cuda.cu | 60 - .../csrc/pytorch/cuda/points_in_boxes_cuda.cu | 4 +- .../pytorch/cuda/points_in_polygons_cuda.cu | 28 - mmcv/ops/csrc/pytorch/cuda/prroi_pool_cuda.cu | 65 - 
mmcv/ops/csrc/pytorch/cuda/psamask_cuda.cu | 3 + .../pytorch/cuda/riroi_align_rotated_cuda.cu | 53 - .../pytorch/cuda/roi_align_rotated_cuda.cu | 14 +- .../csrc/pytorch/cuda/roiaware_pool3d_cuda.cu | 10 +- .../csrc/pytorch/cuda/roipoint_pool3d_cuda.cu | 6 +- .../cuda/rotated_feature_align_cuda.cu | 53 - .../csrc/pytorch/cuda/scatter_points_cuda.cu | 13 +- mmcv/ops/csrc/pytorch/cuda/sparse_indice.cu | 159 -- mmcv/ops/csrc/pytorch/cuda/sparse_maxpool.cu | 486 ---- .../csrc/pytorch/cuda/sparse_pool_ops_cuda.cu | 91 - .../csrc/pytorch/cuda/sparse_reordering.cu | 160 -- mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu | 477 ---- .../pytorch/cuda/stack_ball_query_cuda.cu | 45 - .../pytorch/cuda/stack_group_points_cuda.cu | 62 - .../pytorch/cuda/three_interpolate_cuda.cu | 4 +- mmcv/ops/csrc/pytorch/cuda/three_nn_cuda.cu | 2 +- .../ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu | 1015 +++----- .../csrc/pytorch/cuda/voxelization_cuda.cu | 98 - mmcv/ops/csrc/pytorch/diff_iou_rotated.cpp | 14 - mmcv/ops/csrc/pytorch/filtered_lrelu.cpp | 37 - mmcv/ops/csrc/pytorch/fused_spconv_ops.cpp | 34 - mmcv/ops/csrc/pytorch/group_points.cpp | 42 - mmcv/ops/csrc/pytorch/info.cpp | 15 +- mmcv/ops/csrc/pytorch/iou3d.cpp | 117 +- mmcv/ops/csrc/pytorch/min_area_polygons.cpp | 11 - .../csrc/pytorch/mlu/bbox_overlaps_mlu.cpp | 100 - mmcv/ops/csrc/pytorch/mlu/carafe_mlu.cpp | 429 ---- .../csrc/pytorch/mlu/deform_roi_pool_mlu.cpp | 343 --- .../pytorch/mlu/focal_loss_sigmoid_mlu.cpp | 332 --- mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp | 144 -- .../csrc/pytorch/mlu/masked_conv2d_mlu.cpp | 226 -- .../csrc/pytorch/mlu/ms_deform_attn_mlu.cpp | 420 ---- mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp | 156 -- mmcv/ops/csrc/pytorch/mlu/psamask_mlu.cpp | 308 --- mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp | 206 -- .../pytorch/mlu/roi_align_rotated_mlu.cpp | 232 -- mmcv/ops/csrc/pytorch/mlu/roi_pool_mlu.cpp | 275 --- .../csrc/pytorch/mlu/roiaware_pool3d_mlu.cpp | 399 ---- .../csrc/pytorch/mlu/roipoint_pool3d_mlu.cpp | 166 -- 
mmcv/ops/csrc/pytorch/mlu/three_nn_mlu.cpp | 100 - mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp | 203 -- .../ops/csrc/pytorch/mps/bbox_overlaps_mps.mm | 99 - mmcv/ops/csrc/pytorch/nms_quadri.cpp | 30 - mmcv/ops/csrc/pytorch/nms_rotated.cpp | 21 +- .../csrc/pytorch/npu/bbox_overlaps_npu.cpp | 34 - mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp | 63 - mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp | 162 -- .../pytorch/npu/fused_bias_leakyrelu_npu.cpp | 54 - .../csrc/pytorch/npu/gather_points_npu.cpp | 29 - mmcv/ops/csrc/pytorch/npu/nms_npu.cpp | 45 - mmcv/ops/csrc/pytorch/npu/nms_rotated_npu.cpp | 32 - mmcv/ops/csrc/pytorch/npu/psa_mask_npu.cpp | 75 - mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp | 63 - .../ops/csrc/pytorch/npu/voxelization_npu.cpp | 59 - mmcv/ops/csrc/pytorch/points_in_polygons.cpp | 15 - mmcv/ops/csrc/pytorch/prroi_pool.cpp | 47 - mmcv/ops/csrc/pytorch/pybind.cpp | 397 +--- mmcv/ops/csrc/pytorch/riroi_align_rotated.cpp | 42 - mmcv/ops/csrc/pytorch/roi_align_rotated.cpp | 12 +- .../csrc/pytorch/rotated_feature_align.cpp | 39 - mmcv/ops/csrc/pytorch/sparse_pool_ops.cpp | 48 - mmcv/ops/csrc/pytorch/spconv_ops.cpp | 171 -- mmcv/ops/csrc/pytorch/spconv_utils.h | 79 - mmcv/ops/csrc/pytorch/upfirdn2d.cpp | 22 +- mmcv/ops/csrc/pytorch/voxelization.cpp | 26 +- .../csrc/tensorrt/plugins/trt_corner_pool.cpp | 217 ++ .../plugins/trt_corner_pool_kernel.cu | 110 + .../csrc/tensorrt/plugins/trt_cuda_helper.cu | 91 + .../csrc/tensorrt/plugins/trt_cummaxmin.cpp | 242 ++ .../tensorrt/plugins/trt_cummaxmin_kernel.cu | 90 + .../csrc/tensorrt/plugins/trt_deform_conv.cpp | 318 +++ .../plugins/trt_deform_conv_kernel.cu | 129 ++ .../tensorrt/plugins/trt_grid_sampler.cpp | 256 +++ .../plugins/trt_grid_sampler_kernel.cu | 441 ++++ .../tensorrt/plugins/trt_instance_norm.cpp | 246 ++ .../plugins/trt_modulated_deform_conv.cpp | 308 +++ .../trt_modulated_deform_conv_kernel.cu | 134 ++ mmcv/ops/csrc/tensorrt/plugins/trt_nms.cpp | 279 +++ .../csrc/tensorrt/plugins/trt_nms_kernel.cu | 
272 +++ mmcv/ops/csrc/tensorrt/plugins/trt_plugin.cpp | 27 + .../csrc/tensorrt/plugins/trt_roi_align.cpp | 294 +++ .../tensorrt/plugins/trt_roi_align_kernel.cu | 28 + .../csrc/tensorrt/plugins/trt_scatternd.cpp | 207 ++ .../tensorrt/plugins/trt_scatternd_kernel.cu | 93 + mmcv/ops/csrc/tensorrt/trt_corner_pool.hpp | 111 + mmcv/ops/csrc/tensorrt/trt_cuda_helper.cuh | 41 + mmcv/ops/csrc/tensorrt/trt_cummaxmin.hpp | 122 + mmcv/ops/csrc/tensorrt/trt_deform_conv.hpp | 118 + mmcv/ops/csrc/tensorrt/trt_grid_sampler.hpp | 108 + mmcv/ops/csrc/tensorrt/trt_instance_norm.hpp | 120 + .../tensorrt/trt_modulated_deform_conv.hpp | 120 + mmcv/ops/csrc/tensorrt/trt_nms.hpp | 107 + mmcv/ops/csrc/tensorrt/trt_plugin.hpp | 7 + mmcv/ops/csrc/tensorrt/trt_plugin_helper.hpp | 41 + mmcv/ops/csrc/tensorrt/trt_roi_align.hpp | 108 + mmcv/ops/csrc/tensorrt/trt_scatternd.hpp | 98 + mmcv/ops/csrc/tensorrt/trt_serialize.hpp | 105 + mmcv/ops/deform_conv.py | 78 +- mmcv/ops/deform_roi_pool.py | 81 +- mmcv/ops/deprecated_wrappers.py | 11 +- mmcv/ops/diff_iou_rotated.py | 301 --- mmcv/ops/filtered_lrelu.py | 414 ---- mmcv/ops/focal_loss.py | 88 +- mmcv/ops/furthest_point_sample.py | 9 +- mmcv/ops/fused_bias_leakyrelu.py | 50 +- mmcv/ops/gather_points.py | 14 +- mmcv/ops/group_points.py | 178 +- mmcv/ops/info.py | 15 + mmcv/ops/iou3d.py | 215 +- mmcv/ops/knn.py | 17 +- mmcv/ops/masked_conv.py | 71 +- mmcv/ops/merge_cells.py | 65 +- mmcv/ops/min_area_polygons.py | 20 - mmcv/ops/modulated_deform_conv.py | 141 +- mmcv/ops/multi_scale_deform_attn.py | 159 +- mmcv/ops/nms.py | 322 +-- mmcv/ops/pixel_group.py | 29 +- mmcv/ops/point_sample.py | 186 +- mmcv/ops/points_in_boxes.py | 20 +- mmcv/ops/points_in_polygons.py | 38 - mmcv/ops/points_sampler.py | 65 +- mmcv/ops/prroi_pool.py | 152 -- mmcv/ops/psa_mask.py | 16 +- mmcv/ops/riroi_align_rotated.py | 140 -- mmcv/ops/roi_align.py | 120 +- mmcv/ops/roi_align_rotated.py | 140 +- mmcv/ops/roi_pool.py | 20 +- mmcv/ops/roiaware_pool3d.py | 30 +- 
mmcv/ops/roipoint_pool3d.py | 30 +- mmcv/ops/rotated_feature_align.py | 95 - mmcv/ops/saconv.py | 14 +- mmcv/ops/scatter_points.py | 47 +- mmcv/ops/sparse_conv.py | 455 ---- mmcv/ops/sparse_functional.py | 156 -- mmcv/ops/sparse_modules.py | 203 -- mmcv/ops/sparse_ops.py | 174 -- mmcv/ops/sparse_pool.py | 86 - mmcv/ops/sparse_structure.py | 66 - mmcv/ops/sync_bn.py | 32 +- mmcv/ops/three_interpolate.py | 25 +- mmcv/ops/three_nn.py | 16 +- mmcv/ops/tin_shift.py | 13 +- mmcv/ops/upfirdn2d.py | 754 +++--- mmcv/ops/voxelize.py | 86 +- mmcv/parallel/__init__.py | 13 + mmcv/parallel/_functions.py | 79 + mmcv/parallel/collate.py | 84 + mmcv/parallel/data_container.py | 89 + mmcv/parallel/data_parallel.py | 97 + mmcv/parallel/distributed.py | 112 + mmcv/parallel/distributed_deprecated.py | 70 + mmcv/parallel/registry.py | 8 + mmcv/parallel/scatter_gather.py | 59 + mmcv/parallel/utils.py | 20 + mmcv/runner/__init__.py | 47 + mmcv/runner/base_module.py | 195 ++ mmcv/runner/base_runner.py | 542 +++++ mmcv/runner/builder.py | 24 + mmcv/runner/checkpoint.py | 710 ++++++ mmcv/runner/default_constructor.py | 44 + mmcv/runner/dist_utils.py | 164 ++ mmcv/runner/epoch_based_runner.py | 187 ++ mmcv/runner/fp16_utils.py | 410 ++++ mmcv/runner/hooks/__init__.py | 29 + mmcv/runner/hooks/checkpoint.py | 167 ++ mmcv/runner/hooks/closure.py | 11 + mmcv/runner/hooks/ema.py | 89 + mmcv/runner/hooks/evaluation.py | 509 ++++ mmcv/runner/hooks/hook.py | 92 + mmcv/runner/hooks/iter_timer.py | 18 + mmcv/runner/hooks/logger/__init__.py | 15 + mmcv/runner/hooks/logger/base.py | 166 ++ mmcv/runner/hooks/logger/dvclive.py | 58 + mmcv/runner/hooks/logger/mlflow.py | 78 + mmcv/runner/hooks/logger/neptune.py | 82 + mmcv/runner/hooks/logger/pavi.py | 117 + mmcv/runner/hooks/logger/tensorboard.py | 57 + mmcv/runner/hooks/logger/text.py | 256 +++ mmcv/runner/hooks/logger/wandb.py | 56 + mmcv/runner/hooks/lr_updater.py | 670 ++++++ mmcv/runner/hooks/memory.py | 25 + mmcv/runner/hooks/momentum_updater.py | 
493 ++++ mmcv/runner/hooks/optimizer.py | 508 ++++ mmcv/runner/hooks/profiler.py | 180 ++ mmcv/runner/hooks/sampler_seed.py | 20 + mmcv/runner/hooks/sync_buffer.py | 22 + mmcv/runner/iter_based_runner.py | 273 +++ mmcv/runner/log_buffer.py | 41 + mmcv/runner/optimizer/__init__.py | 9 + mmcv/runner/optimizer/builder.py | 44 + mmcv/runner/optimizer/default_constructor.py | 249 ++ mmcv/runner/priority.py | 60 + mmcv/runner/utils.py | 93 + mmcv/tensorrt/__init__.py | 30 + mmcv/tensorrt/init_plugins.py | 37 + mmcv/tensorrt/preprocess.py | 120 + mmcv/tensorrt/tensorrt_utils.py | 235 ++ mmcv/transforms/__init__.py | 30 - mmcv/transforms/base.py | 30 - mmcv/transforms/builder.py | 2 - mmcv/transforms/formatting.py | 127 - mmcv/transforms/loading.py | 360 --- mmcv/transforms/processing.py | 1562 ------------- mmcv/transforms/utils.py | 249 -- mmcv/transforms/wrappers.py | 649 ------ mmcv/utils/__init__.py | 76 +- mmcv/utils/config.py | 688 ++++++ mmcv/utils/device_type.py | 8 - mmcv/utils/env.py | 75 +- mmcv/utils/ext_loader.py | 3 +- mmcv/utils/hub.py | 127 + mmcv/utils/logging.py | 110 + mmcv/utils/misc.py | 377 +++ mmcv/utils/parrots_jit.py | 2 +- mmcv/utils/parrots_wrapper.py | 107 + mmcv/utils/path.py | 101 + mmcv/utils/progressbar.py | 208 ++ mmcv/utils/registry.py | 315 +++ mmcv/utils/testing.py | 140 ++ mmcv/utils/timer.py | 118 + mmcv/utils/trace.py | 23 + mmcv/utils/version_utils.py | 90 + mmcv/version.py | 10 +- mmcv/video/io.py | 42 +- mmcv/video/optflow.py | 50 +- mmcv/video/processing.py | 59 +- mmcv/visualization/color.py | 8 +- mmcv/visualization/image.py | 63 +- mmcv/visualization/optflow.py | 14 +- requirements.txt | 9 +- requirements/build.txt | 1 - requirements/docs.txt | 4 +- requirements/optional.txt | 2 - requirements/runtime.txt | 1 - requirements/test.txt | 6 +- setup.cfg | 10 +- setup.py | 275 ++- tests/data/config/a.b.py | 1 - tests/data/config/a.py | 1 - tests/data/config/base.py | 1 - tests/data/config/code.py | 1 - tests/data/config/d.py | 1 - 
tests/data/config/delete.py | 4 +- tests/data/config/deprecated.py | 1 - tests/data/config/deprecated_as_base.py | 1 - tests/data/config/e.py | 1 - tests/data/config/expected.py | 1 - tests/data/config/f.py | 1 - tests/data/config/g.py | 1 - tests/data/config/h.py | 1 - tests/data/config/i_base.py | 1 - tests/data/config/i_child.py | 1 - tests/data/config/l.py | 7 - tests/data/config/l1.py | 1 - tests/data/config/l4.py | 1 - tests/data/config/m.py | 1 - tests/data/config/n.py | 7 - tests/data/config/q.py | 1 - tests/data/config/r.py | 1 - tests/data/config/s.py | 1 - tests/data/config/t.py | 1 - tests/data/config/u.py | 1 - tests/data/config/v.py | 1 - tests/data/for_carafe/carafe_feat.bin | Bin 4608 -> 0 bytes tests/data/for_carafe/carafe_feat_grad.bin | 33 - tests/data/for_carafe/carafe_mask.bin | Bin 28800 -> 0 bytes tests/data/for_carafe/carafe_mask_grad.bin | Bin 28800 -> 0 bytes tests/data/for_carafe/carafe_output.bin | Bin 18432 -> 0 bytes .../masked_conv2d_for_bias.npy | Bin 140 -> 0 bytes .../masked_conv2d_for_input.npy | Bin 3200 -> 0 bytes .../masked_conv2d_for_mask.npy | Bin 1152 -> 0 bytes .../masked_conv2d_for_output.npy | Bin 3200 -> 0 bytes .../masked_conv2d_for_weight.npy | Bin 452 -> 0 bytes tests/data/scripts/hello.py | 1 - tests/test_arraymisc.py | 1 + tests/test_cnn/test_build_layers.py | 81 +- tests/test_cnn/test_context_block.py | 1 - .../test_cnn/test_conv2d_adaptive_padding.py | 1 - tests/test_cnn/test_conv_module.py | 17 +- .../test_depthwise_seperable_conv_module.py | 1 - tests/test_cnn/test_flops_counter.py | 1 - tests/test_cnn/test_fuse_conv_bn.py | 1 - tests/test_cnn/test_generalized_attention.py | 1 - tests/test_cnn/test_hsigmoid.py | 7 +- tests/test_cnn/test_hswish.py | 1 - tests/test_cnn/test_model_registry.py | 63 + tests/test_cnn/test_non_local.py | 1 - tests/test_cnn/test_revert_syncbn.py | 58 + tests/test_cnn/test_rfsearch/test_operator.py | 325 --- tests/test_cnn/test_rfsearch/test_search.py | 128 -- 
tests/test_cnn/test_scale.py | 59 +- tests/test_cnn/test_silu.py | 28 - tests/test_cnn/test_swish.py | 5 +- tests/test_cnn/test_transformer.py | 495 +--- tests/test_cnn/test_weight_init.py | 559 +++++ tests/test_cnn/test_wrappers.py | 1 - tests/test_fileclient.py | 860 +++++++ tests/test_fileio.py | 211 ++ tests/test_image/test_geometric.py | 7 - tests/test_image/test_image_misc.py | 21 - tests/test_image/test_io.py | 115 +- tests/test_image/test_photometric.py | 102 +- tests/test_load_model_zoo.py | 146 ++ tests/test_ops/output.pkl | Bin 2168 -> 0 bytes tests/test_ops/test_active_rotated_filter.py | 258 --- tests/test_ops/test_assign_score_withk.py | 1 - tests/test_ops/test_ball_query.py | 48 - tests/test_ops/test_bbox.py | 71 +- tests/test_ops/test_bezier_align.py | 54 - tests/test_ops/test_bias_act.py | 144 -- tests/test_ops/test_bilinear_grid_sample.py | 7 +- tests/test_ops/test_border_align.py | 3 +- tests/test_ops/test_box_iou_quadri.py | 77 - tests/test_ops/test_box_iou_rotated.py | 48 +- tests/test_ops/test_carafe.py | 60 +- tests/test_ops/test_cc_attention.py | 3 +- tests/test_ops/test_chamfer_distance.py | 57 - tests/test_ops/test_contour_expand.py | 1 - tests/test_ops/test_conv_gradfix.py | 43 - tests/test_ops/test_convex_iou.py | 56 - tests/test_ops/test_corner_pool.py | 1 - tests/test_ops/test_correlation.py | 11 +- tests/test_ops/test_deform_conv.py | 7 +- tests/test_ops/test_deform_roi_pool.py | 60 +- tests/test_ops/test_diff_iou_rotated.py | 49 - tests/test_ops/test_filtered_lrelu.py | 224 -- tests/test_ops/test_focal_loss.py | 52 +- tests/test_ops/test_furthest_point_sample.py | 1 - tests/test_ops/test_fused_bias_leakyrelu.py | 60 +- tests/test_ops/test_gather_points.py | 95 +- tests/test_ops/test_group_points.py | 168 +- tests/test_ops/test_info.py | 3 +- tests/test_ops/test_iou3d.py | 153 +- tests/test_ops/test_knn.py | 1 - tests/test_ops/test_masked_conv2d.py | 43 +- tests/test_ops/test_merge_cells.py | 80 +- 
tests/test_ops/test_min_area_polygons.py | 30 - tests/test_ops/test_modulated_deform_conv.py | 7 +- tests/test_ops/test_ms_deformable_attn.py | 206 +- tests/test_ops/test_nms.py | 39 +- tests/test_ops/test_nms_quadri.py | 119 - tests/test_ops/test_nms_rotated.py | 99 +- tests/test_ops/test_onnx.py | 737 +++++- tests/test_ops/test_pixel_group.py | 1 - tests/test_ops/test_points_in_polygons.py | 23 - tests/test_ops/test_prroi_pool.py | 98 - tests/test_ops/test_psa_mask.py | 60 +- tests/test_ops/test_riroi_align_rotated.py | 84 - tests/test_ops/test_roi_align.py | 31 +- tests/test_ops/test_roi_align_rotated.py | 53 +- tests/test_ops/test_roi_pool.py | 44 +- tests/test_ops/test_roiaware_pool3d.py | 44 +- tests/test_ops/test_roipoint_pool3d.py | 51 +- tests/test_ops/test_rotated_feature_align.py | 131 -- tests/test_ops/test_saconv.py | 1 - tests/test_ops/test_scatter_points.py | 50 +- tests/test_ops/test_spconv.py | 133 -- tests/test_ops/test_syncbn.py | 3 +- tests/test_ops/test_tensorrt.py | 807 +++++++ tests/test_ops/test_tensorrt_preprocess.py | 75 + tests/test_ops/test_three_interpolate.py | 36 +- tests/test_ops/test_three_nn.py | 118 +- tests/test_ops/test_tin_shift.py | 57 +- tests/test_ops/test_upfirdn2d.py | 29 +- tests/test_ops/test_voxelization.py | 116 +- tests/test_parallel.py | 66 + tests/test_runner/test_basemodule.py | 557 +++++ tests/test_runner/test_checkpoint.py | 432 ++++ tests/test_runner/test_dist_utils.py | 52 + tests/test_runner/test_eval_hook.py | 482 ++++ tests/test_runner/test_fp16.py | 300 +++ tests/test_runner/test_hooks.py | 1488 ++++++++++++ tests/test_runner/test_optimizer.py | 639 ++++++ tests/test_runner/test_runner.py | 289 +++ tests/test_runner/test_utils.py | 38 + .../test_transforms_formatting.py | 101 - .../test_transforms_loading.py | 151 -- .../test_transforms_processing.py | 1014 -------- .../test_transforms_wrapper.py | 585 ----- tests/test_utils/test_config.py | 534 +++++ tests/test_utils/test_env.py | 7 +- 
tests/test_utils/test_hub.py | 32 + tests/test_utils/test_logging.py | 117 + tests/test_utils/test_misc.py | 225 ++ tests/test_utils/test_parrots_jit.py | 8 +- tests/test_utils/test_path.py | 73 + tests/test_utils/test_progressbar.py | 171 ++ tests/test_utils/test_registry.py | 282 +++ tests/test_utils/test_testing.py | 194 ++ tests/test_utils/test_timer.py | 40 + tests/test_utils/test_trace.py | 24 + tests/test_utils/test_version_utils.py | 57 + tests/test_video/test_reader.py | 2 +- tests/test_visualization.py | 2 +- 840 files changed, 47681 insertions(+), 60178 deletions(-) delete mode 100644 CONTRIBUTING_zh-CN.md create mode 100644 Dockerfile delete mode 100644 docker/README.md delete mode 100644 docker/dev/Dockerfile delete mode 100644 docker/release/Dockerfile rename docs/{en => }/Makefile (100%) rename docs/{en => }/_static/community/1.png (100%) rename docs/{en => }/_static/community/2.png (100%) rename docs/{en => }/_static/community/3.png (100%) rename docs/{en => }/_static/css/readthedocs.css (75%) rename docs/{en => }/_static/flow_img2toimg1.png (100%) rename docs/{en => }/_static/flow_raw_images.png (100%) rename docs/{en => }/_static/flow_visualization.png (100%) rename docs/{en => }/_static/flow_warp.png (100%) rename docs/{en => }/_static/flow_warp_diff.png (100%) rename docs/{en => }/_static/image/mmcv-logo.png (100%) rename docs/{en => }/_static/parallel_progress.gif (100%) rename docs/{en => }/_static/parallel_progress.png (100%) rename docs/{en => }/_static/progress.gif (100%) rename docs/{en => }/_static/progress.png (100%) create mode 100644 docs/_static/qq_group_qrcode.jpg create mode 100644 docs/_static/zhihu_qrcode.jpg create mode 100644 docs/api.rst create mode 120000 docs/community/contributing.md create mode 100644 docs/community/pr.md rename docs/{en => }/compatibility.md (100%) rename docs/{zh_cn => }/conf.py (62%) rename docs/{en => }/deployment/mmcv_ops_definition.md (80%) create mode 100644 docs/deployment/onnx.md create mode 100644 
docs/deployment/onnxruntime_custom_ops.md create mode 100644 docs/deployment/onnxruntime_op.md create mode 100644 docs/deployment/tensorrt_custom_ops.md create mode 100644 docs/deployment/tensorrt_plugin.md delete mode 100644 docs/en/_static/version.json delete mode 100644 docs/en/_templates/classtemplate.rst delete mode 100644 docs/en/api/arraymisc.rst delete mode 100644 docs/en/api/cnn.rst delete mode 100644 docs/en/api/image.rst delete mode 100644 docs/en/api/ops.rst delete mode 100644 docs/en/api/transforms.rst delete mode 100644 docs/en/api/utils.rst delete mode 100644 docs/en/api/video.rst delete mode 100644 docs/en/api/visualization.rst delete mode 100644 docs/en/community/contributing.md delete mode 100644 docs/en/community/pr.md delete mode 100644 docs/en/docutils.conf delete mode 100644 docs/en/faq.md delete mode 100644 docs/en/get_started/build.md delete mode 100644 docs/en/get_started/installation.md delete mode 100644 docs/en/get_started/introduction.md delete mode 100644 docs/en/switch_language.md delete mode 100644 docs/en/understand_mmcv/cnn.md delete mode 100644 docs/en/understand_mmcv/data_transform.md delete mode 100644 docs/en/understand_mmcv/ops.md create mode 100644 docs/faq.md create mode 100644 docs/get_started/build.md create mode 100644 docs/get_started/installation.md create mode 100644 docs/get_started/introduction.md rename docs/{en => }/get_started/previous_versions.md (93%) rename docs/{en => }/index.rst (71%) rename docs/{en => }/make.bat (100%) rename docs/{en => }/mmcv-logo.png (100%) create mode 100644 docs/understand_mmcv/cnn.md create mode 100644 docs/understand_mmcv/config.md rename docs/{en => }/understand_mmcv/data_process.md (90%) create mode 100644 docs/understand_mmcv/io.md create mode 100644 docs/understand_mmcv/ops.md create mode 100644 docs/understand_mmcv/registry.md create mode 100644 docs/understand_mmcv/runner.md create mode 100644 docs/understand_mmcv/utils.md rename docs/{en => }/understand_mmcv/visualization.md 
(100%) delete mode 100644 docs/zh_cn/_static/version.json delete mode 100644 docs/zh_cn/_templates/classtemplate.rst delete mode 100644 docs/zh_cn/api/arraymisc.rst delete mode 100644 docs/zh_cn/api/cnn.rst delete mode 100644 docs/zh_cn/api/image.rst delete mode 100644 docs/zh_cn/api/ops.rst delete mode 100644 docs/zh_cn/api/transforms.rst delete mode 100644 docs/zh_cn/api/utils.rst delete mode 100644 docs/zh_cn/api/video.rst delete mode 100644 docs/zh_cn/api/visualization.rst delete mode 100644 docs/zh_cn/community/code_style.md delete mode 100644 docs/zh_cn/community/contributing.md delete mode 100644 docs/zh_cn/community/pr.md delete mode 100644 docs/zh_cn/docutils.conf delete mode 100644 docs/zh_cn/faq.md delete mode 100644 docs/zh_cn/get_started/article.md delete mode 100644 docs/zh_cn/get_started/build.md delete mode 100644 docs/zh_cn/get_started/installation.md delete mode 100644 docs/zh_cn/get_started/introduction.md delete mode 100644 docs/zh_cn/switch_language.md delete mode 100644 docs/zh_cn/understand_mmcv/cnn.md delete mode 100644 docs/zh_cn/understand_mmcv/data_transform.md delete mode 100644 docs/zh_cn/understand_mmcv/ops.md rename {docs/zh_cn => docs_zh_CN}/Makefile (100%) rename {docs/zh_cn => docs_zh_CN}/_static/css/readthedocs.css (75%) rename {docs/zh_cn => docs_zh_CN}/_static/image/mmcv-logo.png (100%) create mode 100644 docs_zh_CN/api.rst create mode 100644 docs_zh_CN/community/contributing.md create mode 100644 docs_zh_CN/community/pr.md rename {docs/zh_cn => docs_zh_CN}/compatibility.md (100%) rename {docs/en => docs_zh_CN}/conf.py (61%) create mode 100644 docs_zh_CN/deployment/onnx.md create mode 100644 docs_zh_CN/deployment/onnxruntime_custom_ops.md create mode 100644 docs_zh_CN/deployment/onnxruntime_op.md create mode 100644 docs_zh_CN/deployment/tensorrt_custom_ops.md create mode 100644 docs_zh_CN/deployment/tensorrt_plugin.md create mode 100644 docs_zh_CN/faq.md create mode 100644 docs_zh_CN/get_started/build.md create mode 100644 
docs_zh_CN/get_started/installation.md create mode 100644 docs_zh_CN/get_started/introduction.md rename {docs/zh_cn => docs_zh_CN}/get_started/previous_versions.md (93%) rename {docs/zh_cn => docs_zh_CN}/index.rst (65%) rename {docs/zh_cn => docs_zh_CN}/make.bat (100%) rename {docs/zh_cn => docs_zh_CN}/mmcv-logo.png (100%) create mode 100644 docs_zh_CN/understand_mmcv/cnn.md create mode 100644 docs_zh_CN/understand_mmcv/config.md rename {docs/zh_cn => docs_zh_CN}/understand_mmcv/data_process.md (93%) create mode 100644 docs_zh_CN/understand_mmcv/io.md create mode 100644 docs_zh_CN/understand_mmcv/ops.md create mode 100644 docs_zh_CN/understand_mmcv/registry.md create mode 100644 docs_zh_CN/understand_mmcv/runner.md create mode 100644 docs_zh_CN/understand_mmcv/utils.md rename {docs/zh_cn => docs_zh_CN}/understand_mmcv/visualization.md (100%) create mode 100644 examples/train.py create mode 100644 mmcv/cnn/bricks/registry.py create mode 100644 mmcv/cnn/builder.py delete mode 100644 mmcv/cnn/rfsearch/__init__.py delete mode 100644 mmcv/cnn/rfsearch/operator.py delete mode 100644 mmcv/cnn/rfsearch/search.py delete mode 100644 mmcv/cnn/rfsearch/utils.py create mode 100644 mmcv/cnn/utils/sync_bn.py create mode 100644 mmcv/cnn/utils/weight_init.py create mode 100644 mmcv/engine/__init__.py create mode 100644 mmcv/engine/test.py create mode 100644 mmcv/fileio/__init__.py create mode 100644 mmcv/fileio/file_client.py create mode 100644 mmcv/fileio/handlers/__init__.py create mode 100644 mmcv/fileio/handlers/base.py create mode 100644 mmcv/fileio/handlers/json_handler.py create mode 100644 mmcv/fileio/handlers/pickle_handler.py create mode 100644 mmcv/fileio/handlers/yaml_handler.py create mode 100644 mmcv/fileio/io.py create mode 100644 mmcv/fileio/parse.py create mode 100644 mmcv/model_zoo/deprecated.json create mode 100644 mmcv/model_zoo/mmcls.json create mode 100644 mmcv/model_zoo/open_mmlab.json create mode 100644 mmcv/onnx/__init__.py create mode 100644 
mmcv/onnx/info.py create mode 100644 mmcv/onnx/onnx_utils/__init__.py create mode 100644 mmcv/onnx/onnx_utils/symbolic_helper.py create mode 100644 mmcv/onnx/symbolic.py mode change 100755 => 100644 mmcv/ops/__init__.py delete mode 100644 mmcv/ops/active_rotated_filter.py delete mode 100644 mmcv/ops/bezier_align.py delete mode 100644 mmcv/ops/bias_act.py delete mode 100644 mmcv/ops/box_iou_quadri.py delete mode 100644 mmcv/ops/chamfer_distance.py delete mode 100644 mmcv/ops/conv2d_gradfix.py delete mode 100644 mmcv/ops/convex_iou.py delete mode 100644 mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/spconv/indice.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/spconv/reordering.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/carafe_utils.hpp delete mode 100644 mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp 
delete mode 100644 mmcv/ops/csrc/common/mlu/deform_roi_pool_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/iou3d_utils.hpp delete mode 100755 mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/nms_utils.hpp delete mode 100644 mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/psamask_utils.hpp delete mode 100644 mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp delete mode 100644 mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/roiaware_pool3d_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/three_nn_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu delete mode 100644 mmcv/ops/csrc/common/mps/MPSDevice.h delete mode 100644 mmcv/ops/csrc/common/mps/MPSLibrary.h delete mode 100644 mmcv/ops/csrc/common/mps/MPSLibrary.mm delete mode 100644 mmcv/ops/csrc/common/mps/MPSStream.h delete mode 100644 mmcv/ops/csrc/common/mps/MPSUtils.h delete mode 100644 mmcv/ops/csrc/common/pytorch_mlu_helper.hpp delete mode 100644 mmcv/ops/csrc/common/pytorch_npu_helper.hpp delete mode 100644 mmcv/ops/csrc/common/utils/spconv/paramsgrid.h delete mode 100644 mmcv/ops/csrc/common/utils/spconv/prettyprint.h delete mode 100644 mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h delete mode 100644 
mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h delete mode 100644 mmcv/ops/csrc/common/utils/spconv/spconv/indice.h delete mode 100644 mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h delete mode 100644 mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h delete mode 100644 mmcv/ops/csrc/common/utils/spconv/spconv/point2voxel.h delete mode 100644 mmcv/ops/csrc/common/utils/spconv/spconv/reordering.h delete mode 100644 mmcv/ops/csrc/common/utils/spconv/tensorview/helper_kernel.cuh delete mode 100644 mmcv/ops/csrc/common/utils/spconv/tensorview/helper_launch.h delete mode 100644 mmcv/ops/csrc/common/utils/spconv/tensorview/tensorview.h create mode 100644 mmcv/ops/csrc/onnxruntime/corner_pool.h create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/corner_pool.cpp create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/deform_conv.cpp create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/modulated_deform_conv.cpp create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/nms.cpp create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/reduce_ops.cpp create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/roi_align.cpp create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/roi_align_rotated.cpp create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/soft_nms.cpp create mode 100644 mmcv/ops/csrc/onnxruntime/deform_conv.h create mode 100644 mmcv/ops/csrc/onnxruntime/grid_sample.h create mode 100644 mmcv/ops/csrc/onnxruntime/modulated_deform_conv.h create mode 100644 mmcv/ops/csrc/onnxruntime/nms.h create mode 100644 mmcv/ops/csrc/onnxruntime/onnxruntime_register.h create mode 100644 mmcv/ops/csrc/onnxruntime/onnxruntime_session_options_config_keys.h create mode 100644 mmcv/ops/csrc/onnxruntime/ort_mmcv_utils.h create mode 100644 mmcv/ops/csrc/onnxruntime/reduce_ops.h create mode 100644 mmcv/ops/csrc/onnxruntime/roi_align.h create mode 100644 
mmcv/ops/csrc/onnxruntime/roi_align_rotated.h create mode 100644 mmcv/ops/csrc/onnxruntime/soft_nms.h delete mode 100644 mmcv/ops/csrc/parrots/active_rotated_filter.cpp delete mode 100644 mmcv/ops/csrc/parrots/active_rotated_filter_parrots.cpp delete mode 100644 mmcv/ops/csrc/parrots/active_rotated_filter_pytorch.h create mode 100644 mmcv/ops/csrc/parrots/box_iou_rotated_cpu.cpp delete mode 100644 mmcv/ops/csrc/parrots/chamfer_distance.cpp delete mode 100644 mmcv/ops/csrc/parrots/chamfer_distance_parrots.cpp delete mode 100644 mmcv/ops/csrc/parrots/chamfer_distance_pytorch.h delete mode 100644 mmcv/ops/csrc/parrots/convex_iou.cpp delete mode 100644 mmcv/ops/csrc/parrots/convex_iou_parrots.cpp delete mode 100644 mmcv/ops/csrc/parrots/convex_iou_pytorch.h create mode 100644 mmcv/ops/csrc/parrots/corner_pool.cpp create mode 100644 mmcv/ops/csrc/parrots/corner_pool_parrots.cpp create mode 100644 mmcv/ops/csrc/parrots/corner_pool_pytorch.h delete mode 100644 mmcv/ops/csrc/parrots/cudabind.cpp create mode 100644 mmcv/ops/csrc/parrots/deform_conv_cpu.cpp delete mode 100644 mmcv/ops/csrc/parrots/diff_iou_rotated.cpp delete mode 100644 mmcv/ops/csrc/parrots/diff_iou_rotated_parrots.cpp delete mode 100644 mmcv/ops/csrc/parrots/diff_iou_rotated_pytorch.h delete mode 100644 mmcv/ops/csrc/parrots/info.cpp delete mode 100644 mmcv/ops/csrc/parrots/min_area_polygons.cpp delete mode 100644 mmcv/ops/csrc/parrots/min_area_polygons_parrots.cpp delete mode 100644 mmcv/ops/csrc/parrots/min_area_polygons_pytorch.h create mode 100644 mmcv/ops/csrc/parrots/modulated_deform_conv_cpu.cpp rename mmcv/ops/csrc/{pytorch/cpu/nms_quadri.cpp => parrots/nms_rotated_cpu.cpp} (59%) create mode 100644 mmcv/ops/csrc/parrots/points_in_boxes_cpu.cpp delete mode 100644 mmcv/ops/csrc/parrots/points_in_polygons.cpp delete mode 100644 mmcv/ops/csrc/parrots/points_in_polygons_parrots.cpp delete mode 100644 mmcv/ops/csrc/parrots/points_in_polygons_pytorch.h delete mode 100644 
mmcv/ops/csrc/parrots/prroi_pool.cpp delete mode 100644 mmcv/ops/csrc/parrots/prroi_pool_parrots.cpp delete mode 100644 mmcv/ops/csrc/parrots/prroi_pool_pytorch.h delete mode 100644 mmcv/ops/csrc/parrots/riroi_align_rotated.cpp delete mode 100644 mmcv/ops/csrc/parrots/riroi_align_rotated_parrots.cpp delete mode 100644 mmcv/ops/csrc/parrots/riroi_align_rotated_pytorch.h create mode 100644 mmcv/ops/csrc/parrots/roi_align_cpu.cpp rename mmcv/ops/csrc/{pytorch/cpu/bezier_align.cpp => parrots/roi_align_rotated_cpu.cpp} (54%) delete mode 100644 mmcv/ops/csrc/parrots/rotated_feature_align.cpp delete mode 100644 mmcv/ops/csrc/parrots/rotated_feature_align_parrots.cpp delete mode 100644 mmcv/ops/csrc/parrots/rotated_feature_align_pytorch.h create mode 100644 mmcv/ops/csrc/parrots/voxelization_cpu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/active_rotated_filter.cpp delete mode 100644 mmcv/ops/csrc/pytorch/bezier_align.cpp delete mode 100644 mmcv/ops/csrc/pytorch/bias_act.cpp delete mode 100644 mmcv/ops/csrc/pytorch/box_iou_quadri.cpp delete mode 100644 mmcv/ops/csrc/pytorch/chamfer_distance.cpp delete mode 100644 mmcv/ops/csrc/pytorch/convex_iou.cpp create mode 100644 mmcv/ops/csrc/pytorch/corner_pool.cpp delete mode 100644 mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp delete mode 100644 mmcv/ops/csrc/pytorch/cpu/bbox_overlaps_cpu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/cpu/box_iou_quadri.cpp delete mode 100644 mmcv/ops/csrc/pytorch/cpu/rotated_feature_align.cpp delete mode 100644 mmcv/ops/csrc/pytorch/cpu/sparse_indice.cpp delete mode 100644 mmcv/ops/csrc/pytorch/cpu/sparse_maxpool.cpp delete mode 100644 mmcv/ops/csrc/pytorch/cpu/sparse_reordering.cpp delete mode 100644 mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/bezier_align_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/box_iou_quadri_cuda.cu delete mode 100644 
mmcv/ops/csrc/pytorch/cuda/chamfer_distance_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/convex_iou.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/diff_iou_rotated_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/fused_spconv_ops_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/min_area_polygons.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/nms_quadri_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/points_in_polygons_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/prroi_pool_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/riroi_align_rotated_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/rotated_feature_align_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/sparse_indice.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/sparse_maxpool.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/sparse_pool_ops_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/sparse_reordering.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/stack_ball_query_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/cuda/stack_group_points_cuda.cu delete mode 100644 mmcv/ops/csrc/pytorch/diff_iou_rotated.cpp delete mode 100644 mmcv/ops/csrc/pytorch/filtered_lrelu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/fused_spconv_ops.cpp delete mode 100644 mmcv/ops/csrc/pytorch/min_area_polygons.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/bbox_overlaps_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/carafe_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/deform_roi_pool_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/focal_loss_sigmoid_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp delete mode 100755 mmcv/ops/csrc/pytorch/mlu/masked_conv2d_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/ms_deform_attn_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp delete mode 100644 
mmcv/ops/csrc/pytorch/mlu/psamask_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp delete mode 100755 mmcv/ops/csrc/pytorch/mlu/roi_align_rotated_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/roi_pool_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/roiaware_pool3d_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/roipoint_pool3d_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/three_nn_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/mps/bbox_overlaps_mps.mm delete mode 100644 mmcv/ops/csrc/pytorch/nms_quadri.cpp delete mode 100644 mmcv/ops/csrc/pytorch/npu/bbox_overlaps_npu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp delete mode 100644 mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/npu/fused_bias_leakyrelu_npu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/npu/nms_npu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/npu/nms_rotated_npu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/npu/psa_mask_npu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp delete mode 100644 mmcv/ops/csrc/pytorch/points_in_polygons.cpp delete mode 100644 mmcv/ops/csrc/pytorch/prroi_pool.cpp delete mode 100644 mmcv/ops/csrc/pytorch/riroi_align_rotated.cpp delete mode 100644 mmcv/ops/csrc/pytorch/rotated_feature_align.cpp delete mode 100644 mmcv/ops/csrc/pytorch/sparse_pool_ops.cpp delete mode 100644 mmcv/ops/csrc/pytorch/spconv_ops.cpp delete mode 100644 mmcv/ops/csrc/pytorch/spconv_utils.h create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_corner_pool.cpp create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_corner_pool_kernel.cu create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_cuda_helper.cu create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_cummaxmin.cpp create mode 100644 
mmcv/ops/csrc/tensorrt/plugins/trt_cummaxmin_kernel.cu create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_deform_conv.cpp create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_deform_conv_kernel.cu create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_grid_sampler.cpp create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_grid_sampler_kernel.cu create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_instance_norm.cpp create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_modulated_deform_conv.cpp create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_modulated_deform_conv_kernel.cu create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_nms.cpp create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_nms_kernel.cu create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_plugin.cpp create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_roi_align.cpp create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_roi_align_kernel.cu create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_scatternd.cpp create mode 100644 mmcv/ops/csrc/tensorrt/plugins/trt_scatternd_kernel.cu create mode 100644 mmcv/ops/csrc/tensorrt/trt_corner_pool.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_cuda_helper.cuh create mode 100644 mmcv/ops/csrc/tensorrt/trt_cummaxmin.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_deform_conv.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_grid_sampler.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_instance_norm.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_modulated_deform_conv.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_nms.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_plugin.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_plugin_helper.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_roi_align.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_scatternd.hpp create mode 100644 mmcv/ops/csrc/tensorrt/trt_serialize.hpp delete mode 100644 mmcv/ops/diff_iou_rotated.py delete mode 100644 mmcv/ops/filtered_lrelu.py mode change 100755 => 100644 
mmcv/ops/iou3d.py delete mode 100644 mmcv/ops/min_area_polygons.py delete mode 100644 mmcv/ops/points_in_polygons.py delete mode 100644 mmcv/ops/prroi_pool.py delete mode 100644 mmcv/ops/riroi_align_rotated.py delete mode 100644 mmcv/ops/rotated_feature_align.py delete mode 100644 mmcv/ops/sparse_conv.py delete mode 100644 mmcv/ops/sparse_functional.py delete mode 100644 mmcv/ops/sparse_modules.py delete mode 100644 mmcv/ops/sparse_ops.py delete mode 100644 mmcv/ops/sparse_pool.py delete mode 100644 mmcv/ops/sparse_structure.py mode change 100755 => 100644 mmcv/ops/tin_shift.py create mode 100644 mmcv/parallel/__init__.py create mode 100644 mmcv/parallel/_functions.py create mode 100644 mmcv/parallel/collate.py create mode 100644 mmcv/parallel/data_container.py create mode 100644 mmcv/parallel/data_parallel.py create mode 100644 mmcv/parallel/distributed.py create mode 100644 mmcv/parallel/distributed_deprecated.py create mode 100644 mmcv/parallel/registry.py create mode 100644 mmcv/parallel/scatter_gather.py create mode 100644 mmcv/parallel/utils.py create mode 100644 mmcv/runner/__init__.py create mode 100644 mmcv/runner/base_module.py create mode 100644 mmcv/runner/base_runner.py create mode 100644 mmcv/runner/builder.py create mode 100644 mmcv/runner/checkpoint.py create mode 100644 mmcv/runner/default_constructor.py create mode 100644 mmcv/runner/dist_utils.py create mode 100644 mmcv/runner/epoch_based_runner.py create mode 100644 mmcv/runner/fp16_utils.py create mode 100644 mmcv/runner/hooks/__init__.py create mode 100644 mmcv/runner/hooks/checkpoint.py create mode 100644 mmcv/runner/hooks/closure.py create mode 100644 mmcv/runner/hooks/ema.py create mode 100644 mmcv/runner/hooks/evaluation.py create mode 100644 mmcv/runner/hooks/hook.py create mode 100644 mmcv/runner/hooks/iter_timer.py create mode 100644 mmcv/runner/hooks/logger/__init__.py create mode 100644 mmcv/runner/hooks/logger/base.py create mode 100644 mmcv/runner/hooks/logger/dvclive.py create mode 
100644 mmcv/runner/hooks/logger/mlflow.py create mode 100644 mmcv/runner/hooks/logger/neptune.py create mode 100644 mmcv/runner/hooks/logger/pavi.py create mode 100644 mmcv/runner/hooks/logger/tensorboard.py create mode 100644 mmcv/runner/hooks/logger/text.py create mode 100644 mmcv/runner/hooks/logger/wandb.py create mode 100644 mmcv/runner/hooks/lr_updater.py create mode 100644 mmcv/runner/hooks/memory.py create mode 100644 mmcv/runner/hooks/momentum_updater.py create mode 100644 mmcv/runner/hooks/optimizer.py create mode 100644 mmcv/runner/hooks/profiler.py create mode 100644 mmcv/runner/hooks/sampler_seed.py create mode 100644 mmcv/runner/hooks/sync_buffer.py create mode 100644 mmcv/runner/iter_based_runner.py create mode 100644 mmcv/runner/log_buffer.py create mode 100644 mmcv/runner/optimizer/__init__.py create mode 100644 mmcv/runner/optimizer/builder.py create mode 100644 mmcv/runner/optimizer/default_constructor.py create mode 100644 mmcv/runner/priority.py create mode 100644 mmcv/runner/utils.py create mode 100644 mmcv/tensorrt/__init__.py create mode 100644 mmcv/tensorrt/init_plugins.py create mode 100644 mmcv/tensorrt/preprocess.py create mode 100644 mmcv/tensorrt/tensorrt_utils.py delete mode 100644 mmcv/transforms/__init__.py delete mode 100644 mmcv/transforms/base.py delete mode 100644 mmcv/transforms/builder.py delete mode 100644 mmcv/transforms/formatting.py delete mode 100644 mmcv/transforms/loading.py delete mode 100644 mmcv/transforms/processing.py delete mode 100644 mmcv/transforms/utils.py delete mode 100644 mmcv/transforms/wrappers.py create mode 100644 mmcv/utils/config.py delete mode 100644 mmcv/utils/device_type.py create mode 100644 mmcv/utils/hub.py create mode 100644 mmcv/utils/logging.py create mode 100644 mmcv/utils/misc.py create mode 100644 mmcv/utils/parrots_wrapper.py create mode 100644 mmcv/utils/path.py create mode 100644 mmcv/utils/progressbar.py create mode 100644 mmcv/utils/registry.py create mode 100644 mmcv/utils/testing.py 
create mode 100644 mmcv/utils/timer.py create mode 100644 mmcv/utils/trace.py create mode 100644 mmcv/utils/version_utils.py delete mode 100644 requirements/build.txt delete mode 100644 requirements/optional.txt delete mode 100755 tests/data/for_carafe/carafe_feat.bin delete mode 100755 tests/data/for_carafe/carafe_feat_grad.bin delete mode 100755 tests/data/for_carafe/carafe_mask.bin delete mode 100755 tests/data/for_carafe/carafe_mask_grad.bin delete mode 100755 tests/data/for_carafe/carafe_output.bin delete mode 100644 tests/data/for_masked_conv2d/masked_conv2d_for_bias.npy delete mode 100644 tests/data/for_masked_conv2d/masked_conv2d_for_input.npy delete mode 100644 tests/data/for_masked_conv2d/masked_conv2d_for_mask.npy delete mode 100644 tests/data/for_masked_conv2d/masked_conv2d_for_output.npy delete mode 100644 tests/data/for_masked_conv2d/masked_conv2d_for_weight.npy create mode 100644 tests/test_cnn/test_model_registry.py create mode 100644 tests/test_cnn/test_revert_syncbn.py delete mode 100644 tests/test_cnn/test_rfsearch/test_operator.py delete mode 100644 tests/test_cnn/test_rfsearch/test_search.py delete mode 100644 tests/test_cnn/test_silu.py create mode 100644 tests/test_cnn/test_weight_init.py create mode 100644 tests/test_fileclient.py create mode 100644 tests/test_fileio.py create mode 100644 tests/test_load_model_zoo.py delete mode 100644 tests/test_ops/output.pkl delete mode 100644 tests/test_ops/test_active_rotated_filter.py delete mode 100644 tests/test_ops/test_bezier_align.py delete mode 100644 tests/test_ops/test_bias_act.py delete mode 100644 tests/test_ops/test_box_iou_quadri.py delete mode 100644 tests/test_ops/test_chamfer_distance.py delete mode 100644 tests/test_ops/test_conv_gradfix.py delete mode 100644 tests/test_ops/test_convex_iou.py delete mode 100644 tests/test_ops/test_diff_iou_rotated.py delete mode 100644 tests/test_ops/test_filtered_lrelu.py delete mode 100644 tests/test_ops/test_min_area_polygons.py delete mode 100644 
tests/test_ops/test_nms_quadri.py delete mode 100644 tests/test_ops/test_points_in_polygons.py delete mode 100644 tests/test_ops/test_prroi_pool.py delete mode 100644 tests/test_ops/test_riroi_align_rotated.py delete mode 100644 tests/test_ops/test_rotated_feature_align.py delete mode 100644 tests/test_ops/test_spconv.py create mode 100644 tests/test_ops/test_tensorrt.py create mode 100644 tests/test_ops/test_tensorrt_preprocess.py mode change 100755 => 100644 tests/test_ops/test_tin_shift.py create mode 100644 tests/test_parallel.py create mode 100644 tests/test_runner/test_basemodule.py create mode 100644 tests/test_runner/test_checkpoint.py create mode 100644 tests/test_runner/test_dist_utils.py create mode 100644 tests/test_runner/test_eval_hook.py create mode 100644 tests/test_runner/test_fp16.py create mode 100644 tests/test_runner/test_hooks.py create mode 100644 tests/test_runner/test_optimizer.py create mode 100644 tests/test_runner/test_runner.py create mode 100644 tests/test_runner/test_utils.py delete mode 100644 tests/test_transforms/test_transforms_formatting.py delete mode 100644 tests/test_transforms/test_transforms_loading.py delete mode 100644 tests/test_transforms/test_transforms_processing.py delete mode 100644 tests/test_transforms/test_transforms_wrapper.py create mode 100644 tests/test_utils/test_config.py create mode 100644 tests/test_utils/test_hub.py create mode 100644 tests/test_utils/test_logging.py create mode 100644 tests/test_utils/test_misc.py create mode 100644 tests/test_utils/test_path.py create mode 100644 tests/test_utils/test_progressbar.py create mode 100644 tests/test_utils/test_registry.py create mode 100644 tests/test_utils/test_testing.py create mode 100644 tests/test_utils/test_timer.py create mode 100644 tests/test_utils/test_trace.py create mode 100644 tests/test_utils/test_version_utils.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a60cd99..2fdf8a2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,258 
+1,71 @@ ## Contributing to OpenMMLab -Welcome to the MMCV community, we are committed to building a cutting-edge computer vision foundational library and all kinds of contributions are welcomed, including but not limited to +All kinds of contributions are welcome, including but not limited to the following. -**Fix bug** +- Fix typo or bugs +- Add documentation or translate the documentation into other languages +- Add new features and components -You can directly post a Pull Request to fix typo in code or documents +### Workflow -The steps to fix the bug of code implementation are as follows. +1. fork and pull the latest OpenMMLab repository +2. checkout a new branch (do not use master branch for PRs) +3. commit your changes +4. create a PR -1. If the modification involve significant changes, you should create an issue first and describe the error information and how to trigger the bug. Other developers will discuss with you and propose an proper solution. - -2. Posting a pull request after fixing the bug and adding corresponding unit test. - -**New Feature or Enhancement** - -1. If the modification involve significant changes, you should create an issue to discuss with our developers to propose an proper design. -2. Post a Pull Request after implementing the new feature or enhancement and add corresponding unit test. - -**Document** - -You can directly post a pull request to fix documents. If you want to add a document, you should first create an issue to check if it is reasonable. - -### Pull Request Workflow - -If you're not familiar with Pull Request, don't worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the develop mode of Pull Request, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) - -#### 1. 
Fork and clone - -If you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile. - - - -Then, you can clone the repositories to local: - -```shell -git clone git@github.com:{username}/mmcv.git +```{note} +If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first. ``` +### Code style -After that, you should ddd official repository as the upstream repository +#### Python -```bash -git remote add upstream git@github.com:open-mmlab/mmcv -``` +We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. -Check whether remote repository has been added successfully by `git remote -v` +We use the following tools for linting and formatting: -```bash -origin git@github.com:{username}/mmcv.git (fetch) -origin git@github.com:{username}/mmcv.git (push) -upstream git@github.com:open-mmlab/mmcv (fetch) -upstream git@github.com:open-mmlab/mmcv (push) -``` +- [flake8](http://flake8.pycqa.org/en/latest/): A wrapper around some linter tools. +- [yapf](https://github.com/google/yapf): A formatter for Python files. +- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports. +- [markdownlint](https://github.com/markdownlint/markdownlint): A linter to check markdown files and flag style issues. +- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. -> Here's a brief introduction to origin and upstream. When we use "git clone", we create an "origin" remote by default, which points to the repository cloned from. As for "upstream", we add it ourselves to point to the target repository. Of course, if you don't like the name "upstream", you could name it as you wish. Usually, we'll push the code to "origin". 
If the pushed code conflicts with the latest code in official("upstream"), we should pull the latest code from upstream to resolve the conflicts, and then push to "origin" again. The posted Pull Request will be updated automatically. +Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg). -#### 2. Configure pre-commit +We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, +fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirments.txt` automatically on every commit. +The config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml). -You should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of OpenMMLab. **Note**: The following code should be executed under the MMCV directory. +After you clone the repository, you will need to install initialize pre-commit hook. ```shell pip install -U pre-commit -pre-commit install -``` - -Check that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`. - -```shell -pre-commit run --all-files -``` - - - - - -If the installation process is interrupted, you can repeatedly run `pre-commit run ... ` to continue the installation. - -If the code does not conform to the code style specification, pre-commit will raise a warning and fixes some of the errors automatically. - - - -If we want to commit our code bypassing the pre-commit hook, we can use the `--no-verify` option(**only for temporarily commit**). - -```shell -git commit -m "xxx" --no-verify -``` - -#### 3. Create a development branch - -After configuring the pre-commit, we should create a branch based on the master branch to develop the new feature or fix the bug. 
The proposed branch name is `username/pr_name` - -```shell -git checkout -b yhc/refactor_contributing_doc -``` - -In subsequent development, if the master branch of the local repository is behind the master branch of "upstream", we need to pull the upstream for synchronization, and then execute the above command: - -```shell -git pull upstream master -``` - -#### 4. Commit the code and pass the unit test - -- MMCV introduces mypy to do static type checking to increase the robustness of the code. Therefore, we need to add Type Hints to our code and pass the mypy check. If you are not familiar with Type Hints, you can refer to [this tutorial](https://docs.python.org/3/library/typing.html). - -- The committed code should pass through the unit test - - ```shell - # Pass all unit tests - pytest tests - - # Pass the unit test of runner - pytest tests/test_runner/test_runner.py - ``` - - If the unit test fails for lack of dependencies, you can install the dependencies referring to the [guidance](#unit-test) - -- If the documents are modified/added, we should check the rendering result referring to [guidance](#document-rendering) - -#### 5. Push the code to remote - -We could push the local commits to remote after passing through the check of unit test and pre-commit. You can associate the local branch with remote branch by adding `-u` option. - -```shell -git push -u origin {branch_name} -``` - -This will allow you to use the `git push` command to push code directly next time, without having to specify a branch or the remote repository. - -#### 6. Create a Pull Request - -(1) Create a pull request in GitHub's Pull request interface - - - -(2) Modify the PR description according to the guidelines so that other developers can better understand your changes - - - -Find more details about Pull Request description in [pull request guidelines](#pr-specs). 
- -**note** - -(a) The Pull Request description should contain the reason for the change, the content of the change, and the impact of the change, and be associated with the relevant Issue (see [documentation](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) - -(b) If it is your first contribution, please sign the CLA - - - -(c) Check whether the Pull Request pass through the CI - - - -MMCV will run unit test for the posted Pull Request on different platforms (Linux, Window, Mac), based on different versions of Python, PyTorch, CUDA to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code. - -(3) If the Pull Request passes the CI, then you can wait for the review from other developers. You'll modify the code based on the reviewer's comments, and repeat the steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. Then, we will merge it ASAP. - - - -#### 7. Resolve conflicts - -If your local branch conflicts with the latest master branch of "upstream", you'll need to resolove them. There are two ways to do this: - -```shell -git fetch --all --prune -git rebase upstream/master ``` -or - -```shell -git fetch --all --prune -git merge upstream/master -``` - -If you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. If you are not familiar with `rebase`, then you can use `merge` to resolve conflicts. 
- -### Guidance - -#### Unit test - -If you cannot run the unit test of some modules for lacking of some dependencies, such as [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) module, you can try to install the following dependencies: +From the repository folder ```shell -# Linux -sudo apt-get update -y -sudo apt-get install -y libturbojpeg -sudo apt-get install -y ffmpeg - -# Windows -conda install ffmpeg +pre-commit install ``` -We should also make sure the committed code will not decrease the coverage of unit test, we could run the following command to check the coverage of unit test: +Try the following steps to install ruby when you encounter an issue on installing markdownlint ```shell -python -m coverage run -m pytest /path/to/test_file -python -m coverage html -# check file in htmlcov/index.html -``` - -#### Document rendering +# install rvm +curl -L https://get.rvm.io | bash -s -- --autolibs=read-fail +[[ -s "$HOME/.rvm/scripts/rvm" ]] && source "$HOME/.rvm/scripts/rvm" +rvm autolibs disable -If the documents are modified/added, we should check the rendering result. We could install the dependencies and run the following command to render the documents and check the results: - -```shell -pip install -r requirements/docs.txt -cd docs/zh_cn/ -# or docs/en -make html -# check file in ./docs/zh_cn/_build/html/index.html +# install ruby +rvm install 2.7.1 ``` -### Code style +Or refer to [this repo](https://github.com/innerlee/setup) and take [`zzruby.sh`](https://github.com/innerlee/setup/blob/master/zzruby.sh) according its instruction. -#### Python +After this on every commit check code linters and formatter will be enforced. -We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. - -We use the following tools for linting and formatting: - -- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools. -- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports. 
-- [yapf](https://github.com/google/yapf): A formatter for Python files. -- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files. -- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. -- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. - -Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg). - -We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, -fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirments.txt` automatically on every commit. -The config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml). +>Before you create a PR, make sure that your code lints and is formatted by yapf. #### C++ and CUDA We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). - -### PR Specs - -1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style - -2. One short-time branch should be matched with only one PR - -3. Accomplish a detailed change in one PR. Avoid large PR - - - Bad: Support Faster R-CNN - - Acceptable: Add a box head to Faster R-CNN - - Good: Add a parameter to box head to support custom conv-layer number - -4. Provide clear and significant commit message - -5. Provide clear and meaningful PR description - - - Task name should be clarified in title. 
The general format is: \[Prefix\] Short description of the PR (Suffix) - - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily) - - Introduce main changes, results and influences on other modules in short description - - Associate related issues and pull requests with a milestone diff --git a/CONTRIBUTING_zh-CN.md b/CONTRIBUTING_zh-CN.md deleted file mode 100644 index 0062203..0000000 --- a/CONTRIBUTING_zh-CN.md +++ /dev/null @@ -1,274 +0,0 @@ -## è´¡çŒ®ä»£ç  - -欢迎加入 MMCV ç¤¾åŒºï¼Œæˆ‘ä»¬è‡´åŠ›äºŽæ‰“é€ æœ€å‰æ²¿çš„计算机视觉基础库,我们欢迎任何类型的贡献,包括但ä¸é™äºŽ - -**ä¿®å¤é”™è¯¯** - -ä¿®å¤ä»£ç å®žçŽ°é”™è¯¯çš„æ­¥éª¤å¦‚ä¸‹ï¼š - -1. 如果æäº¤çš„ä»£ç æ”¹åŠ¨è¾ƒå¤§ï¼Œå»ºè®®å…ˆæäº¤ issue,并正确æè¿° issue 的现象ã€åŽŸå› å’Œå¤çŽ°æ–¹å¼ï¼Œè®¨è®ºåŽç¡®è®¤ä¿®å¤æ–¹æ¡ˆã€‚ -2. ä¿®å¤é”™è¯¯å¹¶è¡¥å……相应的å•元测试,æäº¤æ‹‰å–请求。 - -**新增功能或组件** - -1. å¦‚æžœæ–°åŠŸèƒ½æˆ–æ¨¡å—æ¶‰åŠè¾ƒå¤§çš„ä»£ç æ”¹åŠ¨ï¼Œå»ºè®®å…ˆæäº¤ issueï¼Œç¡®è®¤åŠŸèƒ½çš„å¿…è¦æ€§ã€‚ -2. 实现新增功能并添å•元测试,æäº¤æ‹‰å–请求。 - -**文档补充** - -ä¿®å¤æ–‡æ¡£å¯ä»¥ç›´æŽ¥æäº¤æ‹‰å–请求 - -添加文档或将文档翻译æˆå…¶ä»–语言步骤如下 - -1. æäº¤ issueï¼Œç¡®è®¤æ·»åŠ æ–‡æ¡£çš„å¿…è¦æ€§ã€‚ -2. 添加文档,æäº¤æ‹‰å–请求。 - -### 拉å–è¯·æ±‚å·¥ä½œæµ - -如果你对拉å–请求ä¸äº†è§£ï¼Œæ²¡å…³ç³»ï¼ŒæŽ¥ä¸‹æ¥çš„内容将会从零开始,一步一步地指引你如何创建一个拉å–请求。如果你想深入了解拉å–è¯·æ±‚çš„å¼€å‘æ¨¡å¼ï¼Œå¯ä»¥å‚考 github [官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) - -#### 1. 
å¤åˆ»ä»“库 - -当你第一次æäº¤æ‹‰å–请求时,先å¤åˆ» OpenMMLab 原代ç åº“,点击 GitHub 页é¢å³ä¸Šè§’çš„ **Fork** 按钮,å¤åˆ»åŽçš„代ç åº“将会出现在你的 GitHub 个人主页下。 - - - -将代ç å…‹éš†åˆ°æœ¬åœ° - -```shell -git clone git@github.com:{username}/mmcv.git -``` - -添加原代ç åº“为上游代ç åº“ - -```bash -git remote add upstream git@github.com:open-mmlab/mmcv -``` - -检查 remote æ˜¯å¦æ·»åŠ æˆåŠŸï¼Œåœ¨ç»ˆç«¯è¾“å…¥ `git remote -v` - -```bash -origin git@github.com:{username}/mmcv.git (fetch) -origin git@github.com:{username}/mmcv.git (push) -upstream git@github.com:open-mmlab/mmcv (fetch) -upstream git@github.com:open-mmlab/mmcv (push) -``` - -> 这里对 origin å’Œ upstream 进行一个简å•的介ç»ï¼Œå½“我们使用 git clone æ¥å…‹éš†ä»£ç æ—¶ï¼Œä¼šé»˜è®¤åˆ›å»ºä¸€ä¸ª origin çš„ remoteï¼Œå®ƒæŒ‡å‘æˆ‘们克隆的代ç åº“地å€ï¼Œè€Œ upstream åˆ™æ˜¯æˆ‘ä»¬è‡ªå·±æ·»åŠ çš„ï¼Œç”¨æ¥æŒ‡å‘原始代ç åº“地å€ã€‚当然如果你ä¸å–œæ¬¢ä»–å« upstream,也å¯ä»¥è‡ªå·±ä¿®æ”¹ï¼Œæ¯”å¦‚å« open-mmlabã€‚æˆ‘ä»¬é€šå¸¸å‘ origin æäº¤ä»£ç ï¼ˆå³ fork 下æ¥çš„远程仓库),然åŽå‘ upstream æäº¤ä¸€ä¸ª pull request。如果æäº¤çš„代ç å’Œæœ€æ–°çš„代ç å‘生冲çªï¼Œå†ä»Ž upstream æ‹‰å–æœ€æ–°çš„代ç ï¼Œå’Œæœ¬åœ°åˆ†æ”¯è§£å†³å†²çªï¼Œå†æäº¤åˆ° origin。 - -#### 2. 
é…ç½® pre-commit - -在本地开å‘环境中,我们使用 [pre-commit](https://pre-commit.com/#intro) æ¥æ£€æŸ¥ä»£ç é£Žæ ¼ï¼Œä»¥ç¡®ä¿ä»£ç é£Žæ ¼çš„统一。在æäº¤ä»£ç ï¼Œéœ€è¦å…ˆå®‰è£… pre-commit(需è¦åœ¨ MMCV 目录下执行): - -```shell -pip install -U pre-commit -pre-commit install -``` - -检查 pre-commit 是å¦é…ç½®æˆåŠŸï¼Œå¹¶å®‰è£… `.pre-commit-config.yaml` 中的钩å­ï¼š - -```shell -pre-commit run --all-files -``` - - - - - -> 如果你是中国用户,由于网络原因,å¯èƒ½ä¼šå‡ºçŽ°å®‰è£…å¤±è´¥çš„æƒ…å†µï¼Œè¿™æ—¶å¯ä»¥ä½¿ç”¨å›½å†…æº - -> pre-commit install -c .pre-commit-config-zh-cn.yaml - -> pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml - -如果安装过程被中断,å¯ä»¥é‡å¤æ‰§è¡Œ `pre-commit run ...` 继续安装。 - -如果æäº¤çš„代ç ä¸ç¬¦åˆä»£ç é£Žæ ¼è§„范,pre-commit 会å‘出警告,并自动修å¤éƒ¨åˆ†é”™è¯¯ã€‚ - - - -如果我们想临时绕开 pre-commit 的检查æäº¤ä¸€æ¬¡ä»£ç ï¼Œå¯ä»¥åœ¨ `git commit` 时加上 `--no-verify`(需è¦ä¿è¯æœ€å޿ލé€è‡³è¿œç¨‹ä»“库的代ç èƒ½å¤Ÿé€šè¿‡ pre-commit 检查)。 - -```shell -git commit -m "xxx" --no-verify -``` - -#### 3. 创建开å‘分支 - -安装完 pre-commit 之åŽï¼Œæˆ‘们需è¦åŸºäºŽ master 创建开å‘分支,建议的分支命å规则为 `username/pr_name`。 - -```shell -git checkout -b yhc/refactor_contributing_doc -``` - -在åŽç»­çš„å¼€å‘中,如果本地仓库的 master 分支è½åŽäºŽ upstream çš„ master 分支,我们需è¦å…ˆæ‹‰å– upstream 的代ç è¿›è¡ŒåŒæ­¥ï¼Œå†æ‰§è¡Œä¸Šé¢çš„命令 - -```shell -git pull upstream master -``` - -#### 4. æäº¤ä»£ç å¹¶åœ¨æœ¬åœ°é€šè¿‡å•元测试 - -- MMCV 引入了 mypy æ¥åšé™æ€ç±»åž‹æ£€æŸ¥ï¼Œä»¥å¢žåР代ç çš„鲿£’性。因此我们在æäº¤ä»£ç æ—¶ï¼Œéœ€è¦è¡¥å…… Type Hints。具体规则å¯ä»¥å‚考[教程](https://zhuanlan.zhihu.com/p/519335398)。 - -- æäº¤çš„代ç åŒæ ·éœ€è¦é€šè¿‡å•元测试 - - ```shell - # 通过全é‡å•元测试 - pytest tests - - # 我们需è¦ä¿è¯æäº¤çš„代ç èƒ½å¤Ÿé€šè¿‡ä¿®æ”¹æ¨¡å—çš„å•元测试,以 runner 为例 - pytest tests/test_runner/test_runner.py - ``` - - 如果你由于缺少ä¾èµ–无法è¿è¡Œä¿®æ”¹æ¨¡å—çš„å•元测试,å¯ä»¥å‚考[指引-å•元测试](#å•元测试) - -- 如果修改/添加了文档,å‚考[指引](#文档渲染)确认文档渲染正常。 - -#### 5. 
推é€ä»£ç åˆ°è¿œç¨‹ - -代ç é€šè¿‡å•元测试和 pre-commit 检查åŽï¼Œå°†ä»£ç æŽ¨é€åˆ°è¿œç¨‹ä»“库,如果是第一次推é€ï¼Œå¯ä»¥åœ¨ `git push` åŽåŠ ä¸Š `-u` 傿•°ä»¥å…³è”远程分支 - -```shell -git push -u origin {branch_name} -``` - -这样下次就å¯ä»¥ç›´æŽ¥ä½¿ç”¨ `git push` 命令推é€ä»£ç äº†ï¼Œè€Œæ— éœ€æŒ‡å®šåˆ†æ”¯å’Œè¿œç¨‹ä»“库。 - -#### 6. æäº¤æ‹‰å–请求(PR) - -(1) 在 GitHub çš„ Pull request 界é¢åˆ›å»ºæ‹‰å–请求 - - -(2) æ ¹æ®æŒ‡å¼•修改 PR æè¿°ï¼Œä»¥ä¾¿äºŽå…¶ä»–å¼€å‘者更好地ç†è§£ä½ çš„修改 - - - -æè¿°è§„范详è§[拉å–请求规范](#拉å–请求规范) - -  - -**注æ„事项** - -(a) PR æè¿°åº”该包å«ä¿®æ”¹ç†ç”±ã€ä¿®æ”¹å†…容以åŠä¿®æ”¹åŽå¸¦æ¥çš„å½±å“,并关è”相关 Issue(具体方å¼è§[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)) - -(b) 如果是第一次为 OpenMMLab åšè´¡çŒ®ï¼Œéœ€è¦ç­¾ç½² CLA - - - -(c) 检查æäº¤çš„ PR 是å¦é€šè¿‡ CIï¼ˆé›†æˆæµ‹è¯•) - - - -MMCV 会在ä¸åŒçš„å¹³å°ï¼ˆLinuxã€Windowã€Mac),基于ä¸åŒç‰ˆæœ¬çš„ Pythonã€PyTorchã€CUDA 对æäº¤çš„代ç è¿›è¡Œå•元测试,以ä¿è¯ä»£ç çš„æ­£ç¡®æ€§ï¼Œå¦‚果有任何一个没有通过,我们å¯ç‚¹å‡»ä¸Šå›¾ä¸­çš„ `Details` æ¥æŸ¥çœ‹å…·ä½“的测试信æ¯ï¼Œä»¥ä¾¿äºŽæˆ‘们修改代ç ã€‚ - -(3) 如果 PR 通过了 CI,那么就å¯ä»¥ç­‰å¾…å…¶ä»–å¼€å‘者的 reviewï¼Œå¹¶æ ¹æ® reviewer çš„æ„è§ï¼Œä¿®æ”¹ä»£ç ï¼Œå¹¶é‡å¤ [4](#4-æäº¤ä»£ç å¹¶æœ¬åœ°é€šè¿‡å•元测试)-[5](#5-推é€ä»£ç åˆ°è¿œç¨‹) 步骤,直到 reviewer åŒæ„åˆå…¥ PR。 - - - -所有 reviewer åŒæ„åˆå…¥ PR åŽï¼Œæˆ‘们会尽快将 PR åˆå¹¶åˆ°ä¸»åˆ†æ”¯ã€‚ - -#### 7. 
è§£å†³å†²çª - -éšç€æ—¶é—´çš„æŽ¨ç§»ï¼Œæˆ‘们的代ç åº“ä¼šä¸æ–­æ›´æ–°ï¼Œè¿™æ—¶å€™ï¼Œå¦‚果你的 PR 与主分支存在冲çªï¼Œä½ éœ€è¦è§£å†³å†²çªï¼Œè§£å†³å†²çªçš„æ–¹å¼æœ‰ä¸¤ç§ï¼š - -```shell -git fetch --all --prune -git rebase upstream/master -``` - -或者 - -```shell -git fetch --all --prune -git merge upstream/master -``` - -如果你éžå¸¸å–„于处ç†å†²çªï¼Œé‚£ä¹ˆå¯ä»¥ä½¿ç”¨ rebase çš„æ–¹å¼æ¥è§£å†³å†²çªï¼Œå› ä¸ºè¿™èƒ½å¤Ÿä¿è¯ä½ çš„ commit log 的整æ´ã€‚如果你ä¸å¤ªç†Ÿæ‚‰ `rebase` 的使用,那么å¯ä»¥ä½¿ç”¨ `merge` çš„æ–¹å¼æ¥è§£å†³å†²çªã€‚ - -### 指引 - -#### å•元测试 - -如果你无法正常执行部分模å—çš„å•元测试,例如 [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) 模å—,å¯èƒ½æ˜¯ä½ çš„当å‰çŽ¯å¢ƒæ²¡æœ‰å®‰è£…ä»¥ä¸‹ä¾èµ– - -```shell -# Linux -sudo apt-get update -y -sudo apt-get install -y libturbojpeg -sudo apt-get install -y ffmpeg - -# Windows -conda install ffmpeg -``` - -在æäº¤ä¿®å¤ä»£ç é”™è¯¯æˆ–新增特性的拉å–请求时,我们应该尽å¯èƒ½çš„让å•元测试覆盖所有æäº¤çš„代ç ï¼Œè®¡ç®—å•元测试覆盖率的方法如下 - -```shell -python -m coverage run -m pytest /path/to/test_file -python -m coverage html -# check file in htmlcov/index.html -``` - -#### 文档渲染 - -在æäº¤ä¿®å¤ä»£ç é”™è¯¯æˆ–新增特性的拉å–请求时,å¯èƒ½ä¼šéœ€è¦ä¿®æ”¹/新增模å—çš„ docstring。我们需è¦ç¡®è®¤æ¸²æŸ“åŽçš„æ–‡æ¡£æ ·å¼æ˜¯æ­£ç¡®çš„。 -æœ¬åœ°ç”Ÿæˆæ¸²æŸ“åŽçš„æ–‡æ¡£çš„æ–¹æ³•如下 - -```shell -pip install -r requirements/docs.txt -cd docs/zh_cn/ -# or docs/en -make html -# check file in ./docs/zh_cn/_build/html/index.html -``` - -### 代ç é£Žæ ¼ - -#### Python - -[PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代ç è§„范,我们使用以下工具检查和格å¼åŒ–ä»£ç  - -- [flake8](https://github.com/PyCQA/flake8): Python 官方å‘布的代ç è§„范检查工具,是多个检查工具的å°è£… -- [isort](https://github.com/timothycrosley/isort): 自动调整模å—导入顺åºçš„工具 -- [yapf](https://github.com/google/yapf): Google å‘布的代ç è§„范检查工具 -- [codespell](https://github.com/codespell-project/codespell): 检查å•è¯æ‹¼å†™æ˜¯å¦æœ‰è¯¯ -- [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具 -- [docformatter](https://github.com/myint/docformatter): æ ¼å¼åŒ– docstring 的工具 - -yapf å’Œ isort 
çš„é…ç½®å¯ä»¥åœ¨ [setup.cfg](./setup.cfg) 找到 - -通过é…ç½® [pre-commit hook](https://pre-commit.com/) ,我们å¯ä»¥åœ¨æäº¤ä»£ç æ—¶è‡ªåŠ¨æ£€æŸ¥å’Œæ ¼å¼åŒ– `flake8`ã€`yapf`ã€`isort`ã€`trailing whitespaces`ã€`markdown files`, -ä¿®å¤ `end-of-files`ã€`double-quoted-strings`ã€`python-encoding-pragma`ã€`mixed-line-ending`,调整 `requirments.txt` 的包顺åºã€‚ -pre-commit é’©å­çš„é…ç½®å¯ä»¥åœ¨ [.pre-commit-config](./.pre-commit-config.yaml) 找到。 - -pre-commit 具体的安装使用方å¼è§[拉å–请求](#2-é…ç½®-pre-commit)。 - -更具体的规范请å‚考 [OpenMMLab 代ç è§„范](code_style.md)。 - -#### C++ and CUDA - -C++ å’Œ CUDA 的代ç è§„范éµä»Ž [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) - -### 拉å–请求规范 - -1. 使用 [pre-commit hook](https://pre-commit.com),尽é‡å‡å°‘代ç é£Žæ ¼ç›¸å…³é—®é¢˜ - -2. 一个`拉å–请求`对应一个短期分支 - -3. 粒度è¦ç»†ï¼Œä¸€ä¸ª`拉å–请求`åªåšä¸€ä»¶äº‹æƒ…,é¿å…超大的`拉å–请求` - - - Bad:实现 Faster R-CNN - - Acceptable:给 Faster R-CNN 添加一个 box head - - Good:给 box head å¢žåŠ ä¸€ä¸ªå‚æ•°æ¥æ”¯æŒè‡ªå®šä¹‰çš„ conv 层数 - -4. æ¯æ¬¡ Commit æ—¶éœ€è¦æä¾›æ¸…æ™°ä¸”æœ‰æ„义 commit ä¿¡æ¯ - -5. æä¾›æ¸…晰且有æ„义的`拉å–请求`æè¿° - - - 标题写明白任务å称,一般格å¼:\[Prefix\] Short description of the pull request (Suffix) - - prefix: 新增功能 \[Feature\], ä¿® bug \[Fix\], 文档相关 \[Docs\], å¼€å‘中 \[WIP\] (暂时ä¸ä¼šè¢«review) - - æè¿°é‡Œä»‹ç»`拉å–请求`的主è¦ä¿®æ”¹å†…容,结果,以åŠå¯¹å…¶ä»–部分的影å“, å‚考`拉å–请求`æ¨¡æ¿ - - å…³è”相关的`议题` (issue) 和其他`拉å–请求` - -6. 如果引入了其他三方库,或借鉴了三方库的代ç ï¼Œè¯·ç¡®è®¤ä»–们的许å¯è¯å’Œ mmcv 兼容,并在借鉴的代ç ä¸Šè¡¥å…… `This code is inspired from http://` diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e163b31 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.7 + +WORKDIR /mmcv + +COPY . /mmcv + +RUN pip install -e . diff --git a/LICENSES.md b/LICENSES.md index 3cdeddf..9bb0c8c 100644 --- a/LICENSES.md +++ b/LICENSES.md @@ -2,10 +2,7 @@ In this file, we list the operations with other licenses instead of Apache 2.0. Users should be careful about adopting these operations in any commercial matters. 
-| Operation | Files | License | -| :--------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: | -| upfirdn2d | [mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu) | NVIDIA License | -| fused_leaky_relu | [mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu) | NVIDIA License | -| bias_act | [mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu) | NVIDIA License | -| filtered_lrelu | [mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu) | NVIDIA License | -| conv2d_gradfix | [mmcv/ops/conv2d_gradfix.py](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/conv2d_gradfix.py) | NVIDIA License | +| Operation | Files | License | +| :--------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: | +| upfirdn2d | [mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu) | NVIDIA License | +| fused_leaky_relu | [mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu) | NVIDIA License | diff --git a/MANIFEST.in b/MANIFEST.in index 622635c..65f232e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,5 @@ include requirements/runtime.txt +include mmcv/model_zoo/open_mmlab.json mmcv/model_zoo/deprecated.json mmcv/model_zoo/mmcls.json include mmcv/ops/csrc/common/cuda/*.cuh 
mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp -include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm -recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm diff --git a/README.md b/README.md index 098cf65..9b64100 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ MMCVæ˜¯è®¡ç®—æœºè§†è§‰ç ”ç©¶çš„åŸºç¡€åº“ï¼Œä¸»è¦æä¾›ä»¥ä¸‹åŠŸèƒ½ï¼šå›¾åƒå¤„ + Python 3.7ã€3.8ã€3.9 ### 1ã€ä½¿ç”¨pipæ–¹å¼å®‰è£… -mmcv whl包下载目录:[https://cancon.hpccube.com:65024/4/main/mmcv/dtk23.04](https://cancon.hpccube.com:65024/4/main/mmcv/dtk23.04),选择对应的pytorch版本和python版本下载对应mmcvçš„whl包 +mmcv whl包下载目录:[https://cancon.hpccube.com:65024/4/main/mmcv](https://cancon.hpccube.com:65024/4/main/mmcv),选择对应的pytorch版本和python版本下载对应mmcvçš„whl包 ```shell pip install mmcv* (下载的mmcvçš„whl包) ``` @@ -18,7 +18,7 @@ pip install mmcv* (下载的mmcvçš„whl包) 1. 基于光æºpytorch基础镜åƒçŽ¯å¢ƒï¼šé•œåƒä¸‹è½½åœ°å€ï¼š[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根æ®pytorchã€pythonã€dtkåŠç³»ç»Ÿä¸‹è½½å¯¹åº”的镜åƒç‰ˆæœ¬ã€‚ -2. 基于现有python环境:安装pytorch,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch/dtk23.04](https://cancon.hpccube.com:65024/4/main/pytorch/dtk23.04),根æ®pythonã€dtk版本,下载对应pytorchçš„whl包。安装命令如下: +2. 基于现有python环境:安装pytorch,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch/dtk24.04.1](https://cancon.hpccube.com:65024/4/main/pytorch/dtk24.04.1),根æ®pythonã€dtk版本,下载对应pytorchçš„whl包。安装命令如下: ```shell pip install torch* (下载的torchçš„whl包) pip install setuptools==59.5.0 wheel @@ -32,11 +32,17 @@ git clone https://developer.hpccube.com/codes/aicomponent/mmcv # æ ¹æ®ç¼–译需 - æä¾›2ç§æºç ç¼–译方å¼ï¼ˆè¿›å…¥mmcv目录): ``` 1. 
编译whl包并安装 -MMCV_WITH_OPS=1 ROCM_HOME=${ROCM_PATH} python3 setup.py -v bdist_wheel +MMCV_WITH_OPS=1 python3 setup.py -v bdist_wheel pip install dist/mmcv* 2. æºç ç¼–译安装 -MMCV_WITH_OPS=1 ROCM_HOME=${ROCM_PATH} python3 setup.py install +MMCV_WITH_OPS=1 python3 setup.py install +``` +3. æµ‹è¯•éªŒè¯ +``` +cd test +pytest -s ./test_arraymisc.py +pytest -s ./test_ops ``` #### 注æ„事项 + 若使用pip installä¸‹è½½å®‰è£…è¿‡æ…¢ï¼Œå¯æ·»åŠ pypiæ¸…åŽæºï¼š-i https://pypi.tuna.tsinghua.edu.cn/simple/ @@ -52,3 +58,4 @@ MMCV_WITH_OPS=1 ROCM_HOME=${ROCM_PATH} python3 setup.py install - [README_ORIGIN](README_ORIGIN.md) - [README_zh-CN](README_zh-CN.md) - [https://github.com/open-mmlab/mmcv](https://github.com/open-mmlab/mmcv) + diff --git a/README_ORIGIN.md b/README_ORIGIN.md index 25d290f..e9e3f8e 100644 --- a/README_ORIGIN.md +++ b/README_ORIGIN.md @@ -1,119 +1,204 @@
- -
 
-
- OpenMMLab website - - - HOT - - -      - OpenMMLab platform - - - TRY IT OUT - - -
-
 
+
-[![docs](https://img.shields.io/badge/docs-2.x-blue)](https://mmcv.readthedocs.io/en/2.x/) -[![platform](https://img.shields.io/badge/platform-Linux%7CWindows%7CmacOS-blue)](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) -[![pytorch](https://img.shields.io/badge/pytorch-1.6~1.13-orange)](https://pytorch.org/get-started/previous-versions/) -[![cuda](https://img.shields.io/badge/cuda-9.2~11.7-green)](https://developer.nvidia.com/cuda-downloads) -[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) -[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) -[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) -[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) [![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) [![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) [![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) English | [简体中文](README_zh-CN.md) ## Introduction -MMCV is a foundational library for computer vision research and it provides the following functionalities: +MMCV is a foundational library for computer vision research and supports many +research projects as below: -- [Image/Video processing](https://mmcv.readthedocs.io/en/2.x/understand_mmcv/data_process.html) -- [Image and annotation 
visualization](https://mmcv.readthedocs.io/en/2.x/understand_mmcv/visualization.html) -- [Image transformation](https://mmcv.readthedocs.io/en/2.x/understand_mmcv/data_transform.html) -- [Various CNN architectures](https://mmcv.readthedocs.io/en/2.x/understand_mmcv/cnn.html) -- [High-quality implementation of common CPU and CUDA ops](https://mmcv.readthedocs.io/en/2.x/understand_mmcv/ops.html) +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. +- [MIM](https://github.com/open-mmlab/mim): MIM Installs OpenMMLab Packages. +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMOCR](https://github.com/open-mmlab/mmocr): A Comprehensive Toolbox for Text Detection, Recognition and Understanding. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab FewShot Learning Toolbox and Benchmark. 
-It supports the following systems: +It provides the following functionalities. -- Linux -- Windows -- macOS +- Universal IO APIs +- Image/Video processing +- Image and annotation visualization +- Useful utilities (progress bar, timer, ...) +- PyTorch runner with hooking mechanism +- Various CNN architectures +- High-quality implementation of common CUDA ops -See the [documentation](http://mmcv.readthedocs.io/en/2.x) for more features and usage. +See the [documentation](http://mmcv.readthedocs.io/en/latest) for more features and usage. -Note: MMCV requires Python 3.7+. +Note: MMCV requires Python 3.6+. ## Installation There are two versions of MMCV: -- **mmcv**: comprehensive, with full features and various CUDA ops out of the box. It takes longer time to build. -- **mmcv-lite**: lite, without CUDA ops but all other features, similar to mmcv\<1.0.0. It is useful when you do not need those CUDA ops. +- **mmcv-full**: comprehensive, with full features and various CUDA ops out of box. It takes longer time to build. +- **mmcv**: lite, without CUDA ops but all other features, similar to mmcv<1.0.0. It is useful when you do not need those CUDA ops. **Note**: Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`. -### Install mmcv +a. Install the full version. + +Before installing mmcv-full, make sure that PyTorch has been successfully installed following the [official guide](https://pytorch.org/). -Before installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation). For apple silicon users, please use PyTorch 1.13+. +We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building. 
In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. -The command to install mmcv: +i. Install the latest version. -```bash -pip install -U openmim -mim install "mmcv>=2.0.0rc1" +The rule for installing the latest ``mmcv-full`` is as follows: + +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -If you need to specify the version of mmcv, you can use the following command: +Please replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired one. For example, +to install the latest ``mmcv-full`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: -```bash -mim install mmcv==2.0.0rc3 +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html ``` -If you find that the above installation command does not use a pre-built package ending with `.whl` but a source package ending with `.tar.gz`, you may not have a pre-build package corresponding to the PyTorch or CUDA or mmcv version, in which case you can [build mmcv from source](https://mmcv.readthedocs.io/en/2.x/get_started/build.html). +**Note**: mmcv-full is only compiled on PyTorch 1.x.0 because the compatibility usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you can install mmcv-full compiled with PyTorch 1.x.0 and it usually works well. For example, if your PyTorch version is 1.8.1 and CUDA version is 11.1, you can use the following command to install mmcv-full. -
-Installation log using pre-built packages +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html +``` -Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
-Collecting mmcv
-Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0rc3-cp38-cp38-manylinux1_x86_64.whl +For more details, please refer to the following tables and delete ``=={mmcv_version}``. -
+ii. Install a specified version. -
-Installation log using source packages +The rule for installing a specified ``mmcv-full`` is as follows: -Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
-Collecting mmcv==2.0.0rc3
-Downloading mmcv-2.0.0rc3.tar.gz +```shell +pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html +``` + +First of all, please refer to the Releases and replace ``{mmcv_version}`` with a specified one, e.g. ``1.3.9``. +Then replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired versions. For example, +to install ``mmcv-full==1.3.9`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: -
+```shell +pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html +``` -For more installation methods, please refer to the [Installation documentation](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html). +For more details, please refer the the following tables. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUDA torch1.10torch1.9torch1.8torch1.7torch1.6torch1.5
11.3
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
11.1
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
11.0
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.7.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.6.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.5.0/index.html
10.1
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.5.0/index.html
9.2
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.5.0/index.html
+ +**Note**: The pre-built packages provided above do not include all versions of mmcv-full, you can click on the corresponding links to see the supported versions. For example, you can click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html) and you can see that `cu102-torch1.8.0` only provides 1.3.0 and above versions of mmcv-full. In addition, we no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4` since v1.3.17. You can find previous versions that were compiled with PyTorch 1.3 & 1.4 [here](./docs/get_started/previous_versions.md). The compatibility is still ensured in our CI, but we will discard the support of PyTorch 1.3 & 1.4 next year. + +Another way is to compile locally by running + +```python +pip install mmcv-full +``` -### Install mmcv-lite +Note that the local compiling may take up to 10 mins. -If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation). +b. Install the lite version. -```bash -pip install -U openmim -mim install "mmcv-lite>=2.0.0rc1" +```python +pip install mmcv ``` +c. Install full version with custom operators for onnxruntime + +- Check [here](docs/deployment/onnxruntime_op.md) for detailed instruction. + +If you would like to build MMCV from source, please refer to the [guide](https://mmcv.readthedocs.io/en/latest/get_started/build.html). + ## FAQ If you face some installation issues, CUDA related issues or RuntimeErrors, -you may first refer to this [Frequently Asked Questions](https://mmcv.readthedocs.io/en/2.x/faq.html). - -If you face installation problems or runtime issues, you may first refer to this [Frequently Asked Questions](https://mmcv.readthedocs.io/en/2.x/faq.html) to see if there is a solution.
If the problem is still not solved, feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues). +you may first refer to this [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html). ## Citation @@ -135,27 +220,3 @@ We appreciate all contributions to improve MMCV. Please refer to [CONTRIBUTING.m ## License MMCV is released under the Apache 2.0 license, while some specific operations in this library are with other licenses. Please refer to [LICENSES.md](LICENSES.md) for the careful check, if you are using our code for commercial matters. - -## Projects in OpenMMLab - -- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. -- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. -- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. -- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. -- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. -- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. -- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark. -- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. -- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. -- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. 
-- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. -- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. -- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. -- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. -- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. -- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. -- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. -- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. -- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. diff --git a/README_zh-CN.md b/README_zh-CN.md index d9a81eb..e3288ee 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -1,116 +1,200 @@
- -
 
-
- OpenMMLab 官网 - - - HOT - - -      - OpenMMLab å¼€æ”¾å¹³å° - - - TRY IT OUT - - -
-
 
+
-[![docs](https://img.shields.io/badge/docs-2.x-blue)](https://mmcv.readthedocs.io/zh_CN/2.x/) -[![platform](https://img.shields.io/badge/platform-Linux%7CWindows%7CmacOS-blue)](https://mmcv.readthedocs.io/zh_CN/2.x/get_started/installation.html) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) -[![pytorch](https://img.shields.io/badge/pytorch-1.6~1.13-orange)](https://pytorch.org/get-started/previous-versions/) -[![cuda](https://img.shields.io/badge/cuda-9.2~11.7-green)](https://developer.nvidia.com/cuda-downloads) -[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) -[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) -[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) -[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) [![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) [![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) [![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) [English](README.md) | 简体中文 ## 简介 -MMCV 是一个é¢å‘计算机视觉的基础库,它æä¾›äº†ä»¥ä¸‹åŠŸèƒ½ï¼š +MMCV 是一个é¢å‘计算机视觉的基础库,它支æŒäº†å¾ˆå¤šå¼€æºé¡¹ç›®ï¼Œä¾‹å¦‚: -- [图åƒå’Œè§†é¢‘处ç†](https://mmcv.readthedocs.io/zh_CN/2.x/understand_mmcv/data_process.html) -- [图åƒå’Œæ ‡æ³¨ç»“æžœå¯è§†åŒ–](https://mmcv.readthedocs.io/zh_CN/2.x/understand_mmcv/visualization.html) -- [图åƒå˜æ¢](https://mmcv.readthedocs.io/zh_CN/2.x/understand_mmcv/data_transform.html) 
-- [å¤šç§ CNN 网络结构](https://mmcv.readthedocs.io/zh_CN/2.x/understand_mmcv/cnn.html) -- [高质é‡å®žçŽ°çš„å¸¸è§ CUDA ç®—å­](https://mmcv.readthedocs.io/zh_CN/2.x/understand_mmcv/ops.html) +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 +- [MIM](https://github.com/open-mmlab/mim): OpenMMLab 项目ã€ç®—æ³•ã€æ¨¡åž‹çš„ç»Ÿä¸€å…¥å£ +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图åƒåˆ†ç±»å·¥å…·ç®±ä¸Žæµ‹è¯•基准 +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 检测工具箱与测试基准 +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用3Dç›®æ ‡æ£€æµ‹å¹³å° +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱与测试基准 +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频ç†è§£å·¥å…·ç®±ä¸Žæµ‹è¯•基准 +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab ä¸€ä½“åŒ–è§†é¢‘ç›®æ ‡æ„ŸçŸ¥å¹³å° +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab å§¿æ€ä¼°è®¡å·¥å…·ç®±ä¸Žæµ‹è¯•基准 +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图åƒè§†é¢‘编辑工具箱 +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab å…¨æµç¨‹æ–‡å­—检测识别ç†è§£å·¥å…·åŒ… +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab æ–°ä¸€ä»£ç”Ÿæˆæ¨¡åž‹å·¥å…·ç®± +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab å…‰æµä¼°è®¡å·¥å…·ç®±ä¸Žæµ‹è¯•基准 +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 -MMCV 支æŒå¤šç§å¹³å°ï¼ŒåŒ…括: +MMCV æä¾›äº†å¦‚下众多功能: -- Linux -- Windows -- macOS +- 通用的 IO æŽ¥å£ +- 图åƒå’Œè§†é¢‘å¤„ç† +- 图åƒå’Œæ ‡æ³¨ç»“æžœå¯è§†åŒ– +- 常用å°å·¥å…·ï¼ˆè¿›åº¦æ¡ï¼Œè®¡æ—¶å™¨ç­‰ï¼‰ +- 基于 PyTorch 的通用训练框架 +- å¤šç§ CNN 网络结构 +- 高质é‡å®žçŽ°çš„å¸¸è§ CUDA ç®—å­ -如想了解更多特性和使用,请å‚考[文档](http://mmcv.readthedocs.io/zh_CN/2.x)。 +如想了解更多特性和使用,请å‚考[文档](http://mmcv.readthedocs.io/en/latest)。 -æç¤º: MMCV éœ€è¦ Python 3.7 以上版本。 +æç¤º: MMCV éœ€è¦ Python 3.6 以上版本。 ## 安装 MMCV 有两个版本: -- **mmcv**: 
å®Œæ•´ç‰ˆï¼ŒåŒ…å«æ‰€æœ‰çš„特性以åŠä¸°å¯Œçš„开箱å³ç”¨çš„ CUDA ç®—å­ã€‚注æ„完整版本å¯èƒ½éœ€è¦æ›´é•¿æ—¶é—´æ¥ç¼–译。 -- **mmcv-lite**: 精简版,ä¸åŒ…å« CUDA ç®—å­ä½†åŒ…å«å…¶ä½™æ‰€æœ‰ç‰¹æ€§å’ŒåŠŸèƒ½ï¼Œç±»ä¼¼ MMCV 1.0 之å‰çš„版本。如果你ä¸éœ€è¦ä½¿ç”¨ CUDA ç®—å­çš„è¯ï¼Œç²¾ç®€ç‰ˆå¯ä»¥ä½œä¸ºä¸€ä¸ªè€ƒè™‘选项。 +- **mmcv-full**: å®Œæ•´ç‰ˆï¼ŒåŒ…å«æ‰€æœ‰çš„特性以åŠä¸°å¯Œçš„开箱å³ç”¨çš„ CUDA ç®—å­ã€‚注æ„完整版本å¯èƒ½éœ€è¦æ›´é•¿æ—¶é—´æ¥ç¼–译。 +- **mmcv**: 精简版,ä¸åŒ…å« CUDA ç®—å­ä½†åŒ…å«å…¶ä½™æ‰€æœ‰ç‰¹æ€§å’ŒåŠŸèƒ½ï¼Œç±»ä¼¼ MMCV 1.0 之å‰çš„版本。如果你ä¸éœ€è¦ä½¿ç”¨ CUDA ç®—å­çš„è¯ï¼Œç²¾ç®€ç‰ˆå¯ä»¥ä½œä¸ºä¸€ä¸ªè€ƒè™‘选项。 + +**注æ„**: 请ä¸è¦åœ¨åŒä¸€ä¸ªçŽ¯å¢ƒä¸­å®‰è£…ä¸¤ä¸ªç‰ˆæœ¬ï¼Œå¦åˆ™å¯èƒ½ä¼šé‡åˆ°ç±»ä¼¼ `ModuleNotFound` 的错误。在安装一个版本之å‰ï¼Œéœ€è¦å…ˆå¸è½½å¦ä¸€ä¸ªã€‚`如果CUDAå¯ç”¨ï¼Œå¼ºçƒˆæŽ¨è安装mmcv-full`。 -**注æ„**: 请ä¸è¦åœ¨åŒä¸€ä¸ªçŽ¯å¢ƒä¸­å®‰è£…ä¸¤ä¸ªç‰ˆæœ¬ï¼Œå¦åˆ™å¯èƒ½ä¼šé‡åˆ°ç±»ä¼¼ `ModuleNotFound` 的错误。在安装一个版本之å‰ï¼Œéœ€è¦å…ˆå¸è½½å¦ä¸€ä¸ªã€‚`如果 CUDA å¯ç”¨ï¼Œå¼ºçƒˆæŽ¨è安装 mmcv`。 +a. 安装完整版 -### 安装 mmcv +在安装 mmcv-full 之å‰ï¼Œè¯·ç¡®ä¿ PyTorch å·²ç»æˆåŠŸå®‰è£…åœ¨çŽ¯å¢ƒä¸­ï¼Œå¯ä»¥å‚考 PyTorch 官方[文档](https://pytorch.org/)。 -在安装 mmcv 之å‰ï¼Œè¯·ç¡®ä¿ PyTorch å·²ç»æˆåŠŸå®‰è£…åœ¨çŽ¯å¢ƒä¸­ï¼Œå¯ä»¥å‚考 [PyTorch 官方安装文档](https://github.com/pytorch/pytorch#installation)。如果你使用的是æ­è½½ apple silicon çš„ mac 设备,请安装 PyTorch 1.13+ 的版本。 +我们æä¾›äº†ä¸åŒ PyTorch å’Œ CUDA 版本的 mmcv-full 预编译包,å¯ä»¥å¤§å¤§ç®€åŒ–用户安装编译过程。强烈推è通过预编译包æ¥å®‰è£…。å¦å¤–,安装完æˆåŽå¯ä»¥è¿è¡Œ [check_installation.py](.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是å¦å®‰è£…æˆåŠŸã€‚ -安装 mmcv 的命令如下: +i. 
安装最新版本 -```bash -pip install -U openmim -mim install "mmcv>=2.0.0rc1" +如下是安装最新版 ``mmcv-full`` 的命令 + +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -å¦‚æžœéœ€è¦æŒ‡å®š mmcv 的版本,å¯ä»¥ä½¿ç”¨ä»¥ä¸‹å‘½ä»¤ +请将链接中的 ``{cu_version}`` å’Œ ``{torch_version}`` æ ¹æ®è‡ªèº«éœ€æ±‚æ›¿æ¢æˆå®žé™…的版本å·ï¼Œä¾‹å¦‚想安装和 ``CUDA 11.1``ã€``PyTorch 1.9.0`` 兼容的最新版 ``mmcv-full``,使用如下替æ¢è¿‡çš„命令 -```bash -mim install mmcv==2.0.0rc3 +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html ``` -如果å‘现上述的安装命令没有使用预编译包(以 `.whl` 结尾)而是使用æºç åŒ…(以 `.tar.gz` 结尾)安装,则有å¯èƒ½æ˜¯æˆ‘们没有æä¾›å’Œå½“å‰çŽ¯å¢ƒçš„ PyTorch 版本ã€CUDA 版本相匹é…çš„ mmcv 预编译包,此时,你å¯ä»¥[æºç å®‰è£… mmcv](https://mmcv.readthedocs.io/zh_CN/2.x/get_started/build.html)。 +**注æ„**: PyTorch 在 1.x.0 å’Œ 1.x.1 之间通常是兼容的,故 mmcv-full åªæä¾› 1.x.0 的编译包。如果你的 PyTorch 版本是 1.x.1,你å¯ä»¥æ”¾å¿ƒåœ°å®‰è£…在 1.x.0 版本编译的 mmcv-full。例如,如果你的 PyTorch 版本是 1.8.1ã€CUDA 版本是 11.1,你å¯ä»¥ä½¿ç”¨ä»¥ä¸‹å‘½ä»¤å®‰è£… mmcv-full。 + +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html +``` -
-使用预编译包的安装日志 +å¦‚æžœæƒ³çŸ¥é“æ›´å¤š CUDA å’Œ PyTorch 版本的命令,å¯ä»¥å‚考下é¢çš„表格,将链接中的 ``=={mmcv_version}`` 删去å³å¯ã€‚ -Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
-Collecting mmcv
-Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0rc3-cp38-cp38-manylinux1_x86_64.whl +ii. 安装特定的版本 -
+如下是安装特定版本 ``mmcv-full`` 的命令 -
-使用æºç åŒ…的安装日志 +```shell +pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html +``` -Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
-Collecting mmcv==2.0.0rc3
-Downloading mmcv-2.0.0rc3.tar.gz +首先请å‚考版本å‘å¸ƒä¿¡æ¯æ‰¾åˆ°æƒ³è¦å®‰è£…的版本å·ï¼Œå°† ``{mmcv_version}`` æ›¿æ¢æˆè¯¥ç‰ˆæœ¬å·ï¼Œä¾‹å¦‚ ``1.3.9``。 +ç„¶åŽå°†é“¾æŽ¥ä¸­çš„ ``{cu_version}`` å’Œ ``{torch_version}`` æ ¹æ®è‡ªèº«éœ€æ±‚æ›¿æ¢æˆå®žé™…的版本å·ï¼Œä¾‹å¦‚想安装和 ``CUDA 11.1``ã€``PyTorch 1.9.0`` 兼容的 ``mmcv-full`` 1.3.9 版本,使用如下替æ¢è¿‡çš„命令 -
+```shell +pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html +``` -更多安装方å¼è¯·å‚考[安装文档](https://mmcv.readthedocs.io/zh_CN/2.x/get_started/installation.html)。 +对于更多的 PyTorch å’Œ CUDA 版本组åˆï¼Œè¯·å‚考下表: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUDA torch1.10torch1.9torch1.8torch1.7torch1.6torch1.5
11.3
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
11.1
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
11.0
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.7.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.6.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.5.0/index.html
10.1
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.5.0/index.html
9.2
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.6.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.5.0/index.html
+ +**注æ„**:以上æä¾›çš„预编译包并ä¸å›Šæ‹¬æ‰€æœ‰çš„ mmcv-full 版本,你å¯ä»¥ç‚¹å‡»å¯¹åº”链接查看支æŒçš„版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),å¯ä»¥çœ‹åˆ° `cu102-torch1.8.0` åªæä¾›äº† 1.3.0 åŠä»¥ä¸Šçš„ mmcv-full 版本。å¦å¤–,从 `mmcv v1.3.17` 开始,我们ä¸å†æä¾›`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你å¯ä»¥åœ¨ [è¿™](./docs_zh_CN/get_started/previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编包。虽然我们ä¸å†æä¾› `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们ä¾ç„¶åœ¨ CI 中ä¿è¯å¯¹å®ƒä»¬çš„兼容æŒç»­åˆ°ä¸‹ä¸€å¹´ã€‚ + +除了使用预编译包之外,å¦ä¸€ç§æ–¹å¼æ˜¯åœ¨æœ¬åœ°è¿›è¡Œç¼–译,直接è¿è¡Œä¸‹è¿°å‘½ä»¤ + +```python +pip install mmcv-full +``` -### 安装 mmcv-lite +ä½†æ³¨æ„æœ¬åœ°ç¼–译å¯èƒ½ä¼šè€—æ—¶ 10 分钟以上。 -如果你需è¦ä½¿ç”¨å’Œ PyTorch 相关的模å—ï¼Œè¯·ç¡®ä¿ PyTorch å·²ç»æˆåŠŸå®‰è£…åœ¨çŽ¯å¢ƒä¸­ï¼Œå¯ä»¥å‚考 [PyTorch 官方安装文档](https://github.com/pytorch/pytorch#installation)。 +b. 安装精简版 -```bash -pip install -U openmim -mim install "mmcv-lite>=2.0.0rc1" +```python +pip install mmcv ``` +c. 安装完整版并且编译 onnxruntime çš„è‡ªå®šä¹‰ç®—å­ + +- 详细的指å—请查看 [这里](docs/deployment/onnxruntime_op.md)。 + +如果想从æºç ç¼–译 MMCV,请å‚考[该文档](https://mmcv.readthedocs.io/en/latest/get_started/build.html)。 + ## FAQ -如果你é‡åˆ°äº†å®‰è£…问题或者è¿è¡Œæ—¶é—®é¢˜ï¼Œè¯·æŸ¥çœ‹[问题解决页é¢](https://mmcv.readthedocs.io/zh_CN/2.x/faq.html)是å¦å·²æœ‰è§£å†³æ–¹æ¡ˆã€‚如果问题ä»ç„¶æ²¡æœ‰è§£å†³ï¼Œæ¬¢è¿Žæ [issue](https://github.com/open-mmlab/mmcv/issues)。 +如果你é‡åˆ°äº†å®‰è£…问题,CUDA 相关的问题或者 RuntimeErrors,å¯ä»¥é¦–å…ˆå‚考[问题解决页é¢](https://mmcv.readthedocs.io/en/latest/faq.html) 看是å¦å·²ç»æœ‰è§£å†³æ–¹æ¡ˆã€‚ ## è´¡çŒ®æŒ‡å— @@ -119,37 +203,12 @@ mim install "mmcv-lite>=2.0.0rc1" ## 许å¯è¯ `MMCV` ç›®å‰ä»¥ Apache 2.0 的许å¯è¯å‘å¸ƒï¼Œä½†æ˜¯å…¶ä¸­æœ‰ä¸€éƒ¨åˆ†åŠŸèƒ½å¹¶ä¸æ˜¯ä½¿ç”¨çš„ Apache2.0 许å¯è¯ï¼Œæˆ‘们在 [许å¯è¯](LICENSES.md) 中详细地列出了这些功能以åŠä»–们对应的许å¯è¯ï¼Œå¦‚果您正在从事盈利性活动,请谨慎å‚考此文档。 - -## OpenMMLab 的其他项目 - -- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库 -- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 -- 
[MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目ã€ç®—æ³•ã€æ¨¡åž‹çš„ç»Ÿä¸€å…¥å£ -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图åƒåˆ†ç±»å·¥å…·ç®± -- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 -- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D ç›®æ ‡æ£€æµ‹å¹³å° -- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 -- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准 -- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 -- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab å…¨æµç¨‹æ–‡å­—检测识别ç†è§£å·¥å…·ç®± -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab å§¿æ€ä¼°è®¡å·¥å…·ç®± -- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab äººä½“å‚æ•°åŒ–模型工具箱与测试基准 -- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监ç£å­¦ä¹ å·¥å…·ç®±ä¸Žæµ‹è¯•基准 -- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 -- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 -- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频ç†è§£å·¥å…·ç®± -- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab ä¸€ä½“åŒ–è§†é¢‘ç›®æ ‡æ„ŸçŸ¥å¹³å° -- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab å…‰æµä¼°è®¡å·¥å…·ç®±ä¸Žæµ‹è¯•基准 -- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图åƒè§†é¢‘编辑工具箱 -- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab å›¾ç‰‡è§†é¢‘ç”Ÿæˆæ¨¡åž‹å·¥å…·ç®± -- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 - ## 欢迎加入 OpenMMLab 社区 -扫æä¸‹æ–¹çš„二维ç å¯å…³æ³¨ OpenMMLab 团队的 [知乎官方账å·](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [å®˜æ–¹äº¤æµ QQ 群](https://jq.qq.com/?_wv=1027&k=K0QI8ByU),或添加微信å°åŠ©æ‰‹â€OpenMMLabwx“加入官方交æµå¾®ä¿¡ç¾¤ã€‚ +扫æä¸‹æ–¹çš„二维ç å¯å…³æ³¨ OpenMMLab 团队的 
[知乎官方账å·](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [å®˜æ–¹äº¤æµ QQ 群](https://jq.qq.com/?_wv=1027&k=GJP18SjI)
- +
我们会在 OpenMMLab 社区为大家 diff --git a/TERMINOLOGY.md b/TERMINOLOGY.md index 07411b7..61941e3 100644 --- a/TERMINOLOGY.md +++ b/TERMINOLOGY.md @@ -4,27 +4,27 @@ This document is used as a reference for English-Chinese terminology translation 该文档用作中英文翻译对照å‚考。 -| English | 中文 | -| :---------------: | :----------: | -| annotation | 标注 | -| backbone | 主干网络 | -| benchmark | 基准测试 | -| checkpoint | 模型æƒé‡æ–‡ä»¶ | -| classifier | 分类器 | -| cls_head | 分类头 | -| decoder | è§£ç å™¨ | -| detector | 检测器 | -| encoder | ç¼–ç å™¨ | -| finetune | 微调 | -| ground truth | 真实标签 | -| hook | é’©å­ | -| localizer | 定ä½å™¨ | -| neck | 模型颈部 | -| pipeline | æµæ°´çº¿ | -| recognizer | 识别器 | -| register | 注册器 | -| schedule | 调整 | -| scheduler | 调度器 | -| segmentor | 分割器 | -| tensor | å¼ é‡ | -| training schedule | 训练策略 | +| English | 中文 | +| :-----: | :---:| +| annotation | 标注 | +| backbone | 主干网络 | +| benchmark | 基准测试 | +| checkpoint | 模型æƒé‡æ–‡ä»¶ | +| classifier | 分类器 | +| cls_head | 分类头 | +| decoder | è§£ç å™¨ | +| detector | 检测器 | +| encoder | ç¼–ç å™¨ | +| finetune | 微调 | +| ground truth | 真实标签 | +| hook | é’©å­ | +| localizer | 定ä½å™¨ | +| neck | 模型颈部 | +| pipeline | æµæ°´çº¿ | +| recognizer | 识别器 | +| register | 注册器 | +| schedule | 调整 | +| scheduler | 调度器 | +| segmentor | 分割器 | +| tensor | å¼ é‡ | +| training schedule | 训练策略 | diff --git a/docker/README.md b/docker/README.md deleted file mode 100644 index 60d5c9d..0000000 --- a/docker/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# Docker images - -There are two `Dockerfile` files to build docker images, one to build an image with the mmcv pre-built package and the other with the mmcv development environment. - -```text -. 
-|-- README.md -|-- dev # build with mmcv development environment -| `-- Dockerfile -`-- release # build with mmcv pre-built package - `-- Dockerfile -``` - -## Build docker images - -### Build with mmcv pre-built package - -Build with local repository - -```bash -git clone https://github.com/open-mmlab/mmcv.git && cd mmcv -docker build -t mmcv -f docker/release/Dockerfile . -``` - -Or build with remote repository - -```bash -docker build -t mmcv https://github.com/open-mmlab/mmcv.git#master:docker/release -``` - -The [Dockerfile](release/Dockerfile) installs latest released version of mmcv by default, but you can specify mmcv versions to install expected versions. - -```bash -docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0rc1 . -``` - -If you also want to use other versions of PyTorch and CUDA, you can also pass them when building docker images. - -An example to build an image with PyTorch 1.11 and CUDA 11.3. - -```bash -docker build -t mmcv -f docker/release/Dockerfile \ - --build-arg PYTORCH=1.9.0 \ - --build-arg CUDA=11.1 \ - --build-arg CUDNN=8 \ - --build-arg MMCV=2.0.0rc1 . -``` - -More available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags). - -### Build with mmcv development environment - -If you want to build an docker image with the mmcv development environment, you can use the following command - -```bash -git clone https://github.com/open-mmlab/mmcv.git && cd mmcv -docker build -t mmcv -f docker/dev/Dockerfile --build-arg CUDA_ARCH=7.5 . -``` - -Note that `CUDA_ARCH` is the cumpute capability of your GPU and you can find it at [Compute Capability](https://developer.nvidia.com/cuda-gpus#compute). - -The building process may take 10 minutes or more. - -## Run images - -```bash -docker run --gpus all --shm-size=8g -it mmcv -``` - -See [docker run](https://docs.docker.com/engine/reference/commandline/run/) for more usages. 
diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile deleted file mode 100644 index a4d9e23..0000000 --- a/docker/dev/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -ARG PYTORCH="1.8.1" -ARG CUDA="10.2" -ARG CUDNN="7" - -FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel - -# To fix GPG key error when running apt-get update -RUN rm /etc/apt/sources.list.d/cuda.list \ - && rm /etc/apt/sources.list.d/nvidia-ml.list \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - -# Install git and system dependencies for opencv-python -RUN apt-get update && apt-get install -y git \ - && apt-get update && apt-get install -y libgl1 libglib2.0-0 - -# Install system dependencies for unit tests -RUN apt-get install -y ffmpeg libturbojpeg \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# build mmcv from source with develop mode -ARG HTTPS_PROXY="" -ENV https_proxy=${HTTPS_PROXY} -ENV FORCE_CUDA="1" -ARG CUDA_ARCH="" -ENV TORCH_CUDA_ARCH_LIST=${CUDA_ARCH} -RUN git clone https://github.com/open-mmlab/mmcv.git /mmcv -WORKDIR /mmcv -RUN git checkout 2.x && git rev-parse --short HEAD -RUN pip install --no-cache-dir -e .[all] -v && pip install pre-commit && pre-commit install diff --git a/docker/release/Dockerfile b/docker/release/Dockerfile deleted file mode 100644 index d5e25e9..0000000 --- a/docker/release/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -ARG PYTORCH="1.8.1" -ARG CUDA="10.2" -ARG CUDNN="7" - -FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel - -# To fix GPG key error when running apt-get update -RUN rm /etc/apt/sources.list.d/cuda.list \ - && rm /etc/apt/sources.list.d/nvidia-ml.list \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ - && apt-key adv --fetch-keys 
https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - -# Install system dependencies for opencv-python -RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# Install mmcv -ARG MMCV="" -RUN if [ "${MMCV}" = "" ]; then pip install -U openmim && mim install 'mmcv>=2.0.0rc1'; else pip install -U openmim && mim install mmcv==${MMCV}; fi - -# Verify the installation -RUN python -c 'import mmcv;print(mmcv.__version__)' diff --git a/docs/en/Makefile b/docs/Makefile similarity index 100% rename from docs/en/Makefile rename to docs/Makefile diff --git a/docs/en/_static/community/1.png b/docs/_static/community/1.png similarity index 100% rename from docs/en/_static/community/1.png rename to docs/_static/community/1.png diff --git a/docs/en/_static/community/2.png b/docs/_static/community/2.png similarity index 100% rename from docs/en/_static/community/2.png rename to docs/_static/community/2.png diff --git a/docs/en/_static/community/3.png b/docs/_static/community/3.png similarity index 100% rename from docs/en/_static/community/3.png rename to docs/_static/community/3.png diff --git a/docs/en/_static/css/readthedocs.css b/docs/_static/css/readthedocs.css similarity index 75% rename from docs/en/_static/css/readthedocs.css rename to docs/_static/css/readthedocs.css index 9e3a567..3f425fc 100644 --- a/docs/en/_static/css/readthedocs.css +++ b/docs/_static/css/readthedocs.css @@ -4,7 +4,3 @@ height: 40px; width: 85px; } - -table.colwidths-auto td { - width: 50% -} diff --git a/docs/en/_static/flow_img2toimg1.png b/docs/_static/flow_img2toimg1.png similarity index 100% rename from docs/en/_static/flow_img2toimg1.png rename to docs/_static/flow_img2toimg1.png diff --git a/docs/en/_static/flow_raw_images.png b/docs/_static/flow_raw_images.png similarity index 100% rename from docs/en/_static/flow_raw_images.png rename to docs/_static/flow_raw_images.png diff --git 
a/docs/en/_static/flow_visualization.png b/docs/_static/flow_visualization.png similarity index 100% rename from docs/en/_static/flow_visualization.png rename to docs/_static/flow_visualization.png diff --git a/docs/en/_static/flow_warp.png b/docs/_static/flow_warp.png similarity index 100% rename from docs/en/_static/flow_warp.png rename to docs/_static/flow_warp.png diff --git a/docs/en/_static/flow_warp_diff.png b/docs/_static/flow_warp_diff.png similarity index 100% rename from docs/en/_static/flow_warp_diff.png rename to docs/_static/flow_warp_diff.png diff --git a/docs/en/_static/image/mmcv-logo.png b/docs/_static/image/mmcv-logo.png similarity index 100% rename from docs/en/_static/image/mmcv-logo.png rename to docs/_static/image/mmcv-logo.png diff --git a/docs/en/_static/parallel_progress.gif b/docs/_static/parallel_progress.gif similarity index 100% rename from docs/en/_static/parallel_progress.gif rename to docs/_static/parallel_progress.gif diff --git a/docs/en/_static/parallel_progress.png b/docs/_static/parallel_progress.png similarity index 100% rename from docs/en/_static/parallel_progress.png rename to docs/_static/parallel_progress.png diff --git a/docs/en/_static/progress.gif b/docs/_static/progress.gif similarity index 100% rename from docs/en/_static/progress.gif rename to docs/_static/progress.gif diff --git a/docs/en/_static/progress.png b/docs/_static/progress.png similarity index 100% rename from docs/en/_static/progress.png rename to docs/_static/progress.png diff --git a/docs/_static/qq_group_qrcode.jpg b/docs/_static/qq_group_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7c6b04f561da283ae622f4219ea9b8cabf8f301a GIT binary patch literal 71955 zcmeEucU+Upw&(|_*bosBq-tmq1eD%UA@m}U5IQ0t5PFwx0|f-61f(l9BoL%`P^1V* zCzQ~nhTeO9aqqL;d*6G`efOQ;`SVQ}zBT1rYs#8gWhOZpIQaowR+d+i2S`W&fQ0w~ zP9{iiDap#3Jk-*VS5k%iRzOMz5CiFD0I;)nb=FdNa6?y5|HjX}KZ2>5i{l^gKY0@I zZukAf4gfto|D^o?mUYhD!o`eOX_XjVor%PWjWs92S1o^spZ+X2|GizWQlb? 
z5aC;ve}NzW1@>@tbS2jL_Org3qw`OAmD6#o(`!1Ai4zrWbgjWdmIMC*ZcmJG=f6><)#CI5p{d6FHECE}<0=NMv0rr3yz)OVq zfjhumK=@<`kOj`3IYWNt^jUIp@^k0Tp1*jN;^Ku17q4HTro75T&%(?^&&YU#jhFKV z=r$`O<1G=c+js5?3JS7tib;v`OY#Z`^8Z9aa_-!@ix)1^QBctF-(q*NqiR3s<$#14=Eq+}#NIs1=5ewOUinbV{{OD!$~BxGbHWM|HvBPTz5 z<}BGcfP|Fn)M=_S8pq+WO_GRhnOc}q~p>_z0bHoT~AVy#Q|Wprv9kqkYNJ_$hfmp1%P<_xin zgX(9|&t{w=`$r3jO`tk;<>uYf)b}+_ot}eOWl!+HMdItERAf|u43PBf%)382^Z%cJ z9RczQG^h1Oy%o!&zvlku;6pgbH9OQ|`5u1?cst8Fz!1}&HU55%Q))hLkJF-|^+9k)Vux_BUiCfeIFmuY1Fr_~}9b;9sst z+h-U5+noOm&##29Q1D6wWu`!}-^-igGE}J1mzzhY;4_p&=?bvCm$}FCy7hg?I@h+} zql=dirRj&lG*sH>O}#}tdp=Yvd;>oGfyk$N2<1!1yhQMM=Cg?cy8p=k@6Z43-0e@h#NS1g`$~*8Y+6zZ+4K&HC0_$LsnDxnDN@Sjzc7?(#pQY>*LnC8Kx(=+7hK#gp$V z9yN@#4kyW!O?Rd74DOrdPVI?4-9Pda=WfS*C=$}&quQ8^7tRx779vnMrsW*bD?DkEYwRjDi` zX4FDs!d865NbLkD&?(p6exf!h6(kd3Ei?FW7R(}5cstW=OW$qMZPK!XncoOKI~^K( z-rS72GMk$zx}$a2Td|xVIn>7Ai_Z#`QMs-Wb;bs!uQS9oubmKF@6r8uBgw^V(6eto zB~JV57L|5=f%0~Kb~o0LS&;r^BfoOYz|*B5#~lazqwCZc3z7~w3>+Kx?RLBeWi&t% zQ)(WUGOvH|Z)t$;JH6})v?4r~7D^I*#Iv;3m`f=_1RDK9K$ZwN$yxW0XGBapL zW1e4J>y;k_Ig5p{S#ebjaGdcM1F zKaI1&37fQb;0R`=Zv8Ts@>?r#S{HlWFnWHNSTXzK%W-^e*0N8~C|F$+ecs_&iuK#L zOUAKMV|Wi~WO#LZ)d_&L91NpEGW!aRMF(=Fxk&$nT3d_f)ckm8EW@BA3@+9^=Q0)eOS1Hu81M;kb?X_d2Y_5f_GTXJu-K-O! 
zp0MUNncN|-F2gAiR%?7wgvrRU<~ueSW^NQw?D(E-iA3r1r}UP@4Kc%eeS6oG;^;>H3e$IaiMw$mcKY4|<21?9yn zgg5RYFUe;)n~H_Q#Z;5kBK&2XP5>|e?VD2?p2mGzsu|&rPCd+I{I12pbkOmVtvZk> zQEmY~ii;vU&qJ46Yhb25{d+OlWZ>u)qwr4lHt=s&)LtS|1X!|K zqu%dJ8+C$A0=#=sg!zz{`oWm`6Cjewuf?7Yg-(SXv5jerp8#u;=RnFysXF(vla@A* zB?4>S<`_IF2o&L^GR~Jc0c1Y!l(<-i&mN_SpnYpO`w!Bq({(on4Z2aLW?7=x8;MSx~k1&ZfU8MNTR1~d*{N{9*y*fe|Db{S zR*z7@>bltLuJ0;6b|&8w2BkY9$JYuDMsLqo&7v*J=7YiJs>e=@wdGoqo z3}i-L-~`A&x9HVZfT)OnEfS9mPHAv{6_>Zr9abNaI&SGDaO}S>=7ZT0QPu5{?n^9m z)oH$FU?hQa#)i@!pGgW&wux=lIRO@BpL{Pb)FSroTWFYMN7ON z{cbb(Wnbb!qP<6Je3&eO$S~5MS%^G?B660-C-NpDbNPs$Dfh%RnytE;PJLf-Bu1zA zv_*fym%3g@2~>Y2#>Phy^31C3JC}&@`m-fEPXUQxF=Bi|_J659x+Oq@7cx?vI-+R~ z($sX!4iM>xZ5*V&uo(lLlw;9&oIjvF^*1f*Z}ORBKM46HiTd2DyYeZ>C@%0b(tp0f zNOqTBZXP<1lTY*2QI+y;_cP;W^%7G~fR7Uh#iS>FQR5ySPbN5*Va*Hpq@?_6ncdbC z;MoCwiI%hX>Xu?kl%TH+*p$-7Sj+bP*!QR2j?0zNhwuQ*3QyJiIq>`PCset|r}7JJ zJXpKCT4u(#&KLN%9?MZXJm11~YBlXUSJS1`7xv+Gx`w#s@b48T5&?ccOM#ne}uJFV3qnTrj$DW1>F?sW6l8}Zz0nq-GeD+d9{>a!K|1I~|N0BW0 z&z($jO^&H+KkfFYB#GT8a_)$HSr4vtX9SFP-Vd{dM;s!XT+YGJ(JkD=V)$YOLzLnzas`c7|tb(4s~Teclw(;7zKy=s z7=Ic5b-Aq{JiSrd1kd!-Ziq^5lCM3yb^@4U9-RPq-jtQRUYZD&2VP1jtyo9@@;^R4w zT=11ebM;W@EOhp^h^+r}LbsmXCXE9sGb4=4_`S>fUe$z!eU7T66Tn+W%c^HPf1(i4 ze*z#yD7Vp`7yTGhs^)~SNmbhO=UcM6ls_qf%JUGGx16o8W?fhag7ER1cYI;vLL@w- zR7cwq(QO^(D|G+3;Y_BXHRc6zT$zqu&%~Sn7~vCO`Hn+5M%{0}v>TRSM?ee)_mPMXGC&$y7JtffDXrtybUCvDz@fx#o19x)!^ z86_lf_u&pP)mCjmTYlk`V1BP1j2<)!M%#nZ;yMxq=XMiTabw!~Q**5jqVu*Fjx`>h z0G}MT=_@?Aq=s%~p8)Fq+k#uDIXwjh3PFEN(3?I&Ic3?TIbzyIQzVxA)tolQdC&cc z0SG;b-_SbSa3^ z!f?{DCuOUhmrM8anV+W6@8R!;?8ON8xQkB2i2I}bpFR2gQl3$E`dxg!e@6Zu*~Sf` zN%D_sGXJbP(Yg5_CI5FCL|a7T^WU@nNc|^g?hl;AXCwZN3AlbR%{8dI-Wi;=%KuxL z42Apft*N}o=SWBK0%5lB=LMkZTw`;XgYwN!03Bj}ZkCH3gGm+c%aN7Dui_ZisSz+^h3g>{+35oi}aagvh15Cj02oN69Fa3+_g{l!!i&uKd;tp~@9 zm{X{urg2&~r_M6yj10GZ^fg=nH^zGI>B@LRBl3F9qUE_{$V*#knTunXZYhZCQy!dZS0EY&vq9K+Qy&va48LNFw>vf zy9R|pk6IjT`7Ag4MbrD>W!+eX?^9{!W6{A=CM`&p?#e;Ajt8lEPLBX=?}2r{zE+L6 
zui2xqfYt!`=c0A6I|jaKwCR79lj1JgPQ0d#sl7|HA)vKlTkOtm6271F@d+?ZfoY2w zLGic{hc=?O`}=zr=9D$aB6JZ-;KsgpGj4Y8D>=vMz2w$ia2oEPLbOGoTYLJ`b$LfW zZhw%ha6sgFjJ+Mo6esRJEA^y{mvpX#Y^wJ*@D$#_(Y6ohQv6tPZHv&EGt+V^Fb zMX$SvVO7ONI8_Xjf>ZWp$Og#cP(r6mRrRteMi5ud>qaGc!CD9UvJiPr7urgzW!#O+bMK3Wby@S1D zLuwA^xO>2B+abSVHI=9Fd-BohsV)o}7x|sovSDxwDjI6K5p}m2f*88vY{&)@fpV>< z8tNKQXfKfAIsrWIy!oK(EIM?lq;l4qrlz2)dAG1PM`N<7CZQ5tH9R+LYdeI1GTaM( zdM(Zh>$)@+nPgHG_Qa_QH`8KyUDd@xTwS|PI|e~wmN&bb`@&!ZsmN^X`z1eel-D%Rq0HL(Jj*$xkdK_`nks*ttl-ItIJy5 z`u0U0m1;cgFQ73onSzlbSZmI8jOl?)u88GL=Aml?%CKSRZFrsry#M|-$ zQsK6(15HBP*TgqfgR-9oul1&2Z)TT{lu;SqIGogC8?o*3NVUwMjq~^7^GtDIg7y2C zzCHoyFH7-+Eu|MV#^KrH;b3Vc1`%N`E(Y#_H?{L(+2Lc!o226`{4L)e`xI2?goeV! z+6`GX#bMAj*RZVAK6?AejR$4sZS@i8jRufk6zvXQ_j}{++e69a2~@M*pbGu@1_W%i zd^pr&wmyZs9O@s-ReXV?I0w%4`HBUp!V;h!`l)cVDC2eS) zKcqU}q3K`NeDutd;YJ_a$RT5aSHzE;8Qgzw^nwVr8(ovDh3EP-8;EJu@dQA5@afeK z%B$WZSre?HV~cfK8AepP1;t97>rCM~rL!xiqk|Kt5X(qUVY(oJ>o7T%=M*^sw4O_m zlBTaduQsqr4bh&1Wxaq9ToredgGNLB=X`Ie3u7{vmT_t>rR{^+VBJUskbBl)Y`%8> zy%(-5M;UIMNAtCZR=S|PE<~$e$PgA)Wf%#U8w9`qHeqkLgXuY&6x3NY?ldcsEppO zG%22M<5ERZ_47_QUi;QpOdZFJg&Bgx%1jR0I;`&swRu@!6*#?kl_!hTBIlScWIQR) z?VDpa>km*On+m_8SiSqiF$ifgvYy0&Ebm{;GAn7phe+<64+(hut*EP9cQ$?-Tnt}_ z=LfZgZ`9mlW6E%|M%50RAAe05P=LA0EX|w%jajEL!Xuhq-5wHVn*m0wh##)pl2O=2 z@8Lm{wyf=B`}UGb>8S3@gA?6bBG=b>2U%Ol5JfH3F)tL@{LM*isZ+#+#uHyikgF6&2V9cyqc)&WAAcgOVYVl%cP2f zOKC<19C=0-9xtjRg@RNgWW1fUy}f1LjV6}Xe%KKT! 
zWh(*|T&qN4$1RUIxb_B&rSYr9@-=e$IK5mv)zA4(}>oTw#T`yT2Oe z9uh8x^tq_GXi+&-ISk!*bTgo-fen|qu=~zW^T-bAu^LGkwk!%!l|fu$(^8H(*<}ku z?MEn1t=a6OxO9dig@fo1?_!vrdXCP@&CDL3Np9G*Gyd33vz%9@F>yXiMfJcIk-g`i zU<~4Wz$YvY>!_9GtK zZ_r|6osh98lU9#J;bScex!c2>{WnXeWrSn%kN7fNX&N`|lzTB*x}~B~Dv)92aPXZ!4 zR2~&{>syuX3Ej44^ii-F*n^J!n_8S4GgUU{m!nHF9+FSm4*d(GuCs5%;yGg7^E!7J zCvboj-c82?_|# zbwS(ZsqT(FE^3@Vn3J&SxsJl%(Mb(~HW6(KLYkV|x3cl%a|R__0r=1WuG;My6-ovF z>j$+>=_x{I(u%FU4PzOq&0SaT(lWFX{T71=*K$8O?wPO`l{O6Kewo=>uI??@4a8G7 z(x_6<;ZkVy8_Wst{nVEdW;D?`wHUWV=VCzG(}!}EgqL*5m4buPZRQz=%jx?~snFHu z+6KE_D^2H9Pk`g*h5_84o$t>4L&1M?JQEa~5m9B3=yHl{;-J>|lP=5QZ9;>mXkl{?Kskk=xU|YBH!OhAJuffpd z)a?1aRI}{iX+v}xV=f}^1OVU6_UWwT?{SaAdf2d22(g)@?>L~*c@IFKX_)uB1;I9+ zu*ff}ZK8O0d~yhzWYW-gLo7_w-|t?Y_m%E#Ur0`#zzAv({>^dMEGj9XWlI&~MVnUo zzF^^$3OUK+$91_QGdvY`IMmd7I3E}oyvxb>)Klp1-h(&Y-hg0y?O z-K>q#`R0lg5bxcCu&BSPmEl)j9{0KU7e$=_-CRq%O8z(rnS=Ec;D;lVZ>ehM{LxmPo<;i$;0xI6w`?5fn72S8Ag&q2F>ImAG%W_0_n{m@%cAcHJN>navu1O@Bd`{UPiu ztF4oKuNhr$oe=81oZK>Lm7T(IXS~hv5oM{?BfO=KSIAML#^TK*6wyIPt>!=WaeH2s z;g=HV#C(Zm>(cas$ic1VH?MsdW#*Y$`y944qb`%ZuCh70@L_bprx;q)Bjh}&n2UZ9u^c+r-Owmf8|ZR)*2C0`RRh7T=C;3~!PH(} zY#$P!y=V8>zIdnwcf{^o?%6NJQbnm^u%vNsfrr+$xI(Wx`;qsY=q6U9Lx}%7*z!F3 z{o;&v@2Y!FOqwMQx-JHb?k6vhi9k0>uqYG#rqpBjOVQn{vlo_>2bLbTo<7wcsM90g zba%F4WW*T+=5>1ce%0Xw$fniM%`1Gwm>E+zv?5KlGhh$R?`@!fs4j*o zmPPrqZ+?cL?{5O!yNY{@q_31N!z3NkdvLP!)sA6a)iewT{pWQ*aEA@u*0gFfv%tj& z3&-dO=_oPK+ZS_$>Q@eE;2{L{VLs2t)(8fe5``=^MT~H@jB}>)R%zu|w`-V!ITeFQ zFBEe&t8V$>Cc3XUrJ9Q9a9O!_B{GTY)`v{Q^v%%x)Z?ko*TXqA=T&r&MJg_pIN|f| zE+E$m1$UurJld;?DR$|#=9G^|U z%yzlbDe_y@&_Z~Wn0kOAcza6N9f`{%?&(fZ2bcU!nP8p7qZXoLl0NB7cSTXJ$uhsc zX~!Zfx`JhU!+-K2TUu*x?Uv5k@wCYCX(`#iBuM#Ji3f2gI1ADCkW-?^hBX;v+L-oQ zQwEnI=92iw>4G|^j2qOyyG#YS-}FxJN#Lk4T8M;YW|T>-D4M}5x13x-{)i1ud&=uI zFuRHEgrV(fPBl(TabB6V<`dw|319_rfl38wDyYGU&ODXZ&Z%PjmFV0fW>CIB0PmR~ z_Y@;m2emu1eZ3JAJnf0wEAA_nS(1B~t&*fP()(5Uhpylg+xq3ETFU+H;91_12P0xP znkRZ0PUET&ZjrUBI<=dIRUPWni?(l)E_7xeYLYg;)E_ZoY9CTFy1*VL{$_|(PY}_R 
zN@R5$0YbC2SyJh!}nmDGzry@LG*K`>7=34xSKs*?1<978b)Un{a6c=vVZ2C)(Swmx@Ru(?(tS>AG&kO@?oJ%3=DeR)Bs8XEi!b?d zy8To|TZ50#-PANtM4KFhpo*mtcRIY}XBTZl_c2l#|i{; zH08kDESznwu4dJnP?Y_>KC4`Xnz+_H&b?>?xJN1chDk#m zGY*GJUp3|yjNcbQY>Tu?lH;X)IK|6Hu+E$4JkNPR)c4btrvC|G-eFaDt#jy3&8T1i zM`m}$^X-GEF*awpIVO5B`b7@rcE9!jh(>ghaFShamF>jFeTOVxh+$VgYfus|Gy>Ch zNTo&*u7?$7O6q=S#Ko3@G4C$#QcRo;o`#YNCzvNF;{+Pcpn7wrZo9+1GP)h80r8S9U8Nd$erLxy05Uh*LU}{Fq;2wdsrD;(;sTbTPG8b7p+_ z7G2=GC%_e^5DN*_?r>k}7~DlSQZz=d$* zzJJ>-7a{{lUN{aZ52$VXgd!?!JhJmw(@^ z6jSc3N@jJzLrr}`TA;EQ61f1~5uy=+iIwUv`#C=VJoTRp2_-1wdE73A_H=d0YOtok zbTIGh5oil4r|4x_96)kap^&>rc(ekCKuH=jxa=A#tbO{F4Q_->|>I_aTcMo^RLBnz8VHsEkR6nua~%GE(nGNl(5(W5c-ZM@LWi-ggh!8{lS8UIy**jCSne>It7Abq$t?b-8bSiOf(r_BiEsdJt=Y0 zFKvFjMAXE~ZfC%M+`dH?-yQE)^74VUZZYFxS?_9kFd7okU7}OoZs;IJ1z$G|YKPD; z(D#XJZAm?OSz7|O_CHRH%yE!gw23<3pX!mm%9LF1T?d~wWA{z|R z954bmrOFOydH~M*<}ShBxg$lMWy7f4w`d;B;nKK{uI zvz^~7Vvx~t*4oB|tw=?cAg(y&(xqvPLvY-UhC-I+1V;*&@*h4}aJ!acTckT;vuD$# zSzFyU>aeQi3qT}928Q*OCnqa$2E4RkZsATeW&!YwZ#;^K;#zpGNfSq$^U*1f55fW7xPEjncQO)HBugXy zeK2F}YhoC}{#aW(ALjo&1+g!@gB4s7!gd&_sfVJJER}ikrUl6`n-%U15u z)P$H;LQ3jpt%P_^-&S{DFF3y+C|I!2a64a&f>=m-8C~j)|SpTV=-Pv!p!PI0iGf+uxQb?GW6I!OL2> zCJ}YfOiFUf{V3P1-~(Z5P^uUW4V9)R@@n75lFlKJEA%R~H==x~-JD8T5Viu9n{gFHw1vE4;QjbO8 zk&Q08ppwqAwPX0qBZak542yH_bA02OFJ$!jEk$dOVbGRTzm>fdX9RmrcM&yDi~Mlc z!9iDk3(m!;y+4gQ?R?_EGFde_l`Q&3CE|VhWUqn}wH6&~XxK=+!UnjKDX8u3VX!wI zTaY@IX#>tS*Cm8ruh^6`v{-`7?OmlbXMB5EN9B~Wr+H9Kue+#|ivBVJ|=SP2~f4UqW^sJ?Z?1DY6OOxcvSD|JYHD_88wMYqQaLU1~H^(jzWUqX<~D-qg5jx3-B}8B${vaUv-msoi}eV{FqFyn8)k$(f4XT ztcYN!_Z5IObL)*IANrjD!`|AXqX$<-xGflw7~$n9HA6lbbuXu|PANy1_{Td=6}aKK zWx>@i?5yv5&3$&;w>9D1$**Q*IQ{acuQ;+6(3M42Br)xYHBy7CC1-v6jIp@>mJk^6hLb)NlAM|)fx-;DN!gntL&;on?(ivl zI6$V(oFvX`^O_L;vv3O!3z0`HD47`vUFHjXkCu+35D=^=%O6+6q|~xzyw8 zey9;RXU7T@R*@70lBO|gDojX8*~#h|K(LQJ97UjgR8HT%`c5%_4Y5&$}t(00%K<-P=0>HolX$zd=>8W6uuCl6Kwh7>%>|8ns@&Iu=0`x4rO^A|=U_h;7ClSrh^9YE_Y5Z{YSY z4mAX4t&jz#rp8o+fl-1Mq{PhkSd<)0{n%B2Cpp{c7&5dMD~VH+?L_s6JC6=@XJVC~ 
zRK^9dBl2mP2DR&II|pxOW(9UR`rr!Nzu^bw`&2=B;^S8@%?xTfp@Npny7(hz7lm?= zY%Lzs7me9ow@@-V%}lKBi0Y|*H@n_Qsk(k>4VxwpsWK4Z7++a$_x(PZmd3J1)+^>~ zprX@)F`9@WhdQ65F%Jle<%VB}?#oQLok{8*pJ-vvp6clx*A#UW@oY_bcS%-j{yo;q z8BGtS&k|}=zg?(M_AQEQ2wz_JXijj7V%@!W&kWJh;%;BwMnHhw`7H1L=#_oiM-4fx zb~R>5mN`6v-B5{F;_l!HkQ~*C+lmEqtGbqWfrHrMXcAWCvExEi5*0Kqua0jZhBJ(` z{et_|IvhSC*)ehAjGMEM{hN-ZnwBdM%NExpwluI$_TBvnNop#Pc@?1wadA5y_Ynt) z#7|h`Wt_3uV6A~guNOj^(LM|F6$~;@Y-vjM3*O-qC(gxiQfOqdj?5-wgMwuxxjcI^*cJXa$55g2~ z+Z~p2+8~`VnudLzahvh7mE4 zAa1%Y<}x@yypXXfjBpT1fV6)sb|1u88JE9M7G2L#CvYB~0PD{)GTFS{*d4+c*gTrk zAoAr@*Xz_9!C=Bi>F`nS(&juT1==8MYl3rHS)!$xM;aYLBYK@Dy8l8u%5EDQx(VG} z&3Opno2wKfKo{psc3ISFM{~8+G+c|IxMTes?Jk9?evy0{37I0wwrjfy>~-KUeq$z( zlI~#qoASkxCgByCLw~0&Ozlnyv8DbF+eQ05MKl}6n7sb;e#-kX=}z}Bu|h#%Zu`{7 zoTI1=6smlH<>Hl=0gV0}&09DtHCKDt>^<>+c7u*g-M?RSSdeh!5)!O)hotD&o(cTz zy9s1$MwzmV#3=O7_E$R7j2B53x{uAS!7)=hnIehR;99N&HMP@B5|1*2A|8BS&JTK} ziuEsQ0s2!vW|nHROR*_kRYXrAz|%h0l7*=fWZFLKfkm3*Py-nudinRI~Q&Kiu+%}WPe<=`X@F1Jq2ogSwH6M6O^4) z_m17ir0)tP;^<7dX?qy|n`ua+rnU?y`J=8RYen6MEvR4LL(UU`ck^|IyG)Vkp=@jQ z%{im+B|3j9#e1~gzfzaiR}{VAW(V>t^;(xtkN%mGtn6k+4jgyAUc35=#G~q0>iUg7 zFMV_>*vnVtNxY}=+c43E376z)LlB6y0F7yT5tel3)xrfeufAA((S@p~E(=mTDw;e} z@)?D)k{UWnjL0>i@6Nn5G&F+{$jq1JtFMi=0D7#w;?PTp&6+oC>x;m#d}{J(AP|;4?#Bqcw+P1M;WyN=$5b zYWRY&MM8&+PGUT=?8EI&NjO8x3!7ZCm&sp>Y^2Skj8X5>5t8ZkJrQ8`FgnoV8YMPo zp&SGU3HX}2~<{$*azgB-Ko%T zGu7xg2t?fiqVALI+p0BSi=)V@FkUJWD1Wov1cQD{%x%fkWJHyj;*?fy{xR7{$z#1_yDM4n zcaB*<+&bs~Y;B?VNkWN^l7b0}2a433^t-;m8oJa1%GAe1w}AQ9gn43zq*g*EFu4@| zDuFnIF{%1qp(;-T zpC;!-ig8cayq_`zgSiWWL)w1MVCBLEt^Ga>zU)GVU8RA8;4-Rsn2Y;^4K6G)=1EJh zJC!3E2W_XJWdSq1y8mMmx8$E|zO8v>m*p8ibf5DU{l@2y4;AudO%zVaP8o+@56y(r==;f+Mml(vN9&))8IUVhHLeZm8A>SQW!cr zLd*cEnkuI06bw7hkL4ve zBY&nOD}G;#A!}FPka+m~N>TpmHUtD1jr~f={z6Hm_>8u3$h~BgYedtgObdRmAH&3c zyA6q21dYSc*@<9YX{gBAmtR^jc;aP*=EYR@-}bD3&)azV(Q4Sj`Ik7&13eXK9m=!I z{}?q2spmxNHp$IM!?m;F*%JCbN zfIWNzotkvssB2)q8FFhfjCQ@kte$6 z?UpocQG0h2`J!uu;oe$Ds5Q!TO7ap5yV+3EF^5YP!LQE4{dr#T5ohL70#8-nL+b+= 
zJJg4w`9oTmNU1<$$ZL=Ve{lAp^4dr7x*p@=jP2ap$3tEpiCfH2qYlNnzH9MYxqQV| zv`<$FUGc=-Q$ILV=;!X47pZg$^mF$_qcKE z8d!{qGd7n~uR{?p|35Z*I8Z}eU8FKYDSI>%EbKDxJWXy*OM{7a&|0aecjcj^gPG?* zhy40Z1aaf}gGj$kZld1y^KX(i+D4Q&Cmi><2ZmRc2@-7{i<};Fvde{rSw)}-<%X!ZQ22G@E%76o z)ZB_IZ9>?7DZ@^chx4z+vzJ^VTKY=QRrF=vsb5?n>nxeWP{e=4U8`g-3G}$DJI7_M zM~y6p)L}Gfuge+wys=*RL*#^0+KBf}e-td{>Fz(x*3+ur;vt+J=Awyis^qGe!8+G3 zt$>s%iVmnIdXfD?&AC0P}gFEgq0UuRC5*E)pszUEEVlflY+PshW6|t{dp$V zN4dFy$qMqQF!}0P9F!LVao6~!iA+d}5#oO?w;=Q1R~sYNWf+5m!3e~^ z<|l{4uCE*69eJ-uC`FiVt|2!T#&_<)RqQir^|D7aY^sM*k6dueVKLcjLPoDt(L$O6 zujD$u1rq+`TmKEGZkFpN(lsoulK(&!GC~4)y8Ye&z6oX{;s&I=l48}m8+6dts_0|k zv%g#2x#9f#YkaQc{!_f~Hnp0zT}@tX-?F;36z$a2sdZmTh1uig3&GG(;x#7v#AYPS zWI`uJbi`!ME_|MSCw-}~T0sxv%2Tpu!bpXKIe83{!RQhSs)Js^V6zDK=e>;b{R;~` zzin2q4kZ{1nX?s}?&xp_JuhS8Tr^B$;_1O7yws%SF~=HA=;Cl`c>T1hcYgWTMiIlY zA>-pZI-CTe*Q-<=Hn1?8tJ#gScc?j=u8_Ha->l0CPU2DjEV%ik5GSO+prPH1F!|&3xfm)OiuEJBx1(M5VvjGh+I(%AZM~hkhIA79D=8poEl9EpH8bTkidh z^Ox4BF?z|!xPQM-!{YK4qf#|p+pSBe;z1swP8Hmwq@w&&r{bm2e^b^1nw*l=_$75+ z$z`zn)x9-n^tL)_1g*z6qnooC;fu4i=M+Qs$z}eW6VD63%O%=zP-vVYC8r+CXZqc6>FqJ_!HI-wTNU>{)Jr&RVJo;UE^48`YP^X+KY^%@dq63p58hSB z-^Y$YGuKQD4g!H(h<}f&*g^firU(;#oHv|A=Bb#Qx5e z{KciMkJbR{6W|}ZHOURTQZxf`U#YTq)R|;zou}V@pcCV7SN(9z&tYkj`>*B%Kzau+ zJs?fFMInW$H>?d+#3%D{kDKv%dpMP?479u8osP9VKl?`}hSy6*?WrK0Javur=_nd3k1I0U(uba%CZzwWin_|w9zqr)RBdvQgZY4GK_iAM|g3go24A;{fYGi z5O$dJ*mBrS8yETEark0QA@mS-oYdkeLmRTUf?JmHU{o9HHk_0k{33%Sv#WM6@pD*Y zCB3&1cl?HUr^f%4%l?qPpYGiG12s-D^> ziK!Pp)@$|<9ZL?A8d+REPKXj4@lbGDIXq`a*|k#BbMRU6m-b=p5uBgCeIL6N-_m5F z_jtU!d-z7Y<$4*dTWAD&x=x1s6=ar2^m4#X9YcM5AOW#%68a4>ojI}l)x+Oueq_yF8<$qCHV@UjQ%~Fnk4}BT2-kk(^%CB@ui!fYgBcJ4uQ@CAcrJ&2>uo7K zhlI|N#r_&~jcF2|6X2_Tht$mjq(UWGO5f1M^(J_~NmbgE59ak=m~8#XNNCOxFUV4& ze0GUwb^K$R)1R*n6?Q24O=~VU{d%2LX5-0mtiR3)5bSl0k38iHg2AG{B~0WdCO9i# z;%MZkDXr9{-q4b+{pzALgS+P(aE;?*Ild}((wV;8 zx~%5T$k?z~sW+ogqf?@V`q%MN%j9=QEomc2Gab10Fi~}>2~%h1fz^3_^qbGzU}8mL z0*IHavlOXi5<4~(_FE5c5(JK@jtfqJv%!Kh?OQszrAj7d#p@Ni)4GPonpdBCfxwx} 
zQ^v13yM>sE#gU~7CO*aM%6jaI`o!WmuT(S#-L!FFUgxRYUzj|G)I0E>u2vlTn~mtOYv<>eVEv!mN$vtFx})Ie<|X;b5L6Y-adVsheLe*-ZC5| z2H=StObU|tm8`qf%sckQ^V47TF|Q!7CVnoFLt)#0EMo3I4bp{erxMDSM^c)IHCnz& zSVdMy%dT!K-N>-L#I``%UGUo5@HhQGw7myZliAkC9owKHAR?kP6%dpT36qG8R(2Gb1>0J=N^CXb)zIX1Ocir#%X3fl6 zVWmFjIp^&C-@pAor#`|Y$TPfgxAszan-AT7B4IfVfwM;6kWc?v-d2jkuB7jMJZCex zQr4d3eBQ&wU5dG|Bx5->vX;$|um}8iV%V-^FMEWX1zvFwTi*UTpeu7{yWN%FWq-SM z)bQJZJwU1Y!*WcqUZ#|5j$%)4KgJRSmGeQ*Nhyi;uiRx>pey8gr!wy3EKU4#u*07Q zS=#9-uyHF{{(A)0cDALsPgSWPfQ!gAFk7vi)|XYuhsO@;Lkw)LUCoehgARnp&VE_b=+@fK|8*huuCbBmC~F}LYdw-(n(Vk z@6sO?w++d#w4z6-BEuR2@@ou@Qip?X%Y7)=X;`Pi|@ZNgXwq_wX@T8l(CG z$tg_C=urG+!BSV3%i{;#NKf#4)fe+qc9q-h(M?th3E3f@v5tzF*QC5B`s{BQ=^0?1 zeht>?HXI_6K%cxOZz(iDFb(haS)MfxzGI@OO$xX?veyQGZbLMLwbNp`b9xirMZ>S6iM5A$o$(Z}9v2|wsq z?#tFTJb!++SV9NnrsWMvduKAU#2Mdf8C$tWoJ-mzp&cqSIidWheQ8;KLr-I^eN0Vj z@_f>O9n%M#x0C3jb11|@ywiw=)Gw)O5x@I+W`SxN13A(%p6T}V=CQ%9t7;{b?^aXL z#t2~tQQeQtDb@z(KPZZ?Vv z1JyqD$h+Q10KA&SldrSbMn6zzIEor$eh9S*i_|@s{X=I(dQsB7EvToqN;qZxwm2P? 
z>V4J7H_}v~L1`TQtAh2OMts%rvTYOl7Q4BqikFCMO&b{dxtTklL-R zt=@EJeEl46@XF0mYePg^gKGFM=QR$gE2zDG8U&S`yi2QE<$dBv0`ezs>UW%)X*I z>m5n+H;lBtXzJ607am&_CJGU%liNbXC`pKr*vK6aRvE3=M`5C~MGkj6UKqBsss&j* zPp%*?Oiv|y@ua-UbQ`nOU)T=PR~t&j|Db;`ZV~Z=I-N}C0?7w0?(8NYzf!h z5B264oS$E@tJO$&NZ_2?C1&TMUX|GNWCu9VqcmmAmJJh7<6k>1N(+|DdM^*@a4(|+M*f9#XpNkzCT17o0_ zp5?%O7igzH`n7%p?DPt8ffv9|Z}6;jdMA;&nbVt?ZSc^7AX{oGZp)U#JF+ET)W_gP zY1WNt#bWNt)R?4P6wk;3W*6eTt{>5F7g1!_;AzO0Qw>mD^aY&#iO#K9ODFVkCi+&V z%03m)$&z;`CF7#VpgN0GX0Aw$AcwC4TP4;<-e!;Lzjr;h~#HSN+RM-33#V}@&O6V(yXIRks0rIws$ajbKx z5<44m%goa&Uapy!F=lRJD+~1d{g-mPajXHYZ-XU(*z`81Xu~h^0{v9+s5%*H#d9kHLI-)YTGd;XT$2Va8s;+$iHpGpn5xyJcS7(~W6vx^P365}z^0Ce~zb)W>s0(!NnEow*Iv2>8XDugeZ*Z*L#L zwbSOW<{zZv+9VV{X{()+^7TQ(%Jbu%Zj|=$if&a&bX2S2OVJH0ya-& zeSgD9zdhhLEG(EH|IItcjE)fZPJ{7-&q>eldsV-^cPejL=n(eKFJSN7c~LYQ4e1G& znMAg?PJ#6TQ^MBC0JhGZ7d>$&KjAZ?LsM@?Sa>ItDws+o*+cJ zjJ8_^m4it#Q{j5IGoMgJ6aEPOZ{K~G@fFgeQ*6_MC%;b%p29+ZVrIp4*T24Ia2MBi zfnP(7PZG&Ed8czU4&GqF;5e-};?>oQereY;8d_#xkEfDPudzvgnp@ooL|a21^Gl=<088{a2TYwW=j_= zsYR%r6?TgaY0OUC@?arD`?BsNXR8Aq#8K zJh6BEm5a8%KEtznP4E)FeoCgbMI$b@HOk5brPYj{%l*!G-pOmKhH#md>pWUd$Pf)> z06@#V3-*Z$#M-lfMLq~!yEYaO9AfMGMi{K|E17?KVj}oe zTkmzbX7{g^au+sr%vpaP#vKW<6|Y3Yt-}SHlFsgAdQV5X|KVm(x*1m}=wNE!9*MyO zi~Z#}c5|J>zq&m$nFoxpqdyp7JpmHza7B}al=?PCj%G(PrrBeV7e;&^o-1f86=$~* z&p_6W>{Q}UvT%AfBlyscp&h8mfK%U->}4YWWshyM)qT;23K4gDgpP3S`O7J>{y}=7 zCZa3i0KHVuF%j_u1kWA3P3!}qn}d=~y?+}A9XQMT8%CPwEgxXJA2yG%#9!tsb(Ook zlu8j}mT4g1Vo0x(lCmcD-9`spR}elViOFwZ7{A5o3WR5s$-YC7@T_ly#X`b!oC4>M zR3JRd13BjIVIA{q14|*1LbO8H6p}s3r-eI?uzpy1=GU%XDA12zK9}WtqxP4KeSCr+ z)1sOphnWSOR&eAdCtO{|BTzw&iodWGPk*fH2(8{k6u1K!+l9{|x3PgMZ#H}F-72+Y zGQ#>;zWmXh)9;(7Ve0&nzFlWZNF}dLN$t3WPs=qArXbHOwiLKdK3q=k(OLF&bi6cA z=C3%K?-yqN^AJ3>%+bQ-P8A7tDC^M!bz3&Mw>wcd9l@^iBXKJHT}5kN@gON%)~M@b z1tt3&K6@es(;?2AEK;SWDQ`twH69jfiYQ)H)TNU1@uAgAGit8&2$E}{E1{$x zVJfMrUg_`^(qLYt2qB$nx5s5`J1x;QQFa(6}87rp@~?KMCaD;&rySt zXyt0LE~i^rOr@tZs94z z5{x3swzTU zEmuliz$xIH8m8VtoI)WNkAphk6hH+DP&v1u!wUX8Zzkw8S4Hm)rF< 
zidTl;pG1(W=x6xnVGcVQ(a~K+(2a9M_;D*b>HNJEyScop9!t-@ANxo1cBQXZktozz_pjc8jb>Z^T4 zA#E@|fsCSqcgoeq!Rf^0Zf?5os^IRO-TP*pRL^DA2L(2$oLay3r0C0)9!uSh0U%#B z?yD;?oX}sKK2=6L)PZQ%o8m~#NQq3x-n6Wj@?!6MA$JpW8Xt6z`gmyIQ`ma*qS~wn zD7+9WOx!QCHXO37UEucV(L+WaHOSK^wk~&yt@2VM9ouL=}?fcVv~X+W6zI1)8YZy zYh36G_O3c;VF7hxhsW0@2wzPee6=e@rKCqUvT;8TOFn4J5VAY|=JxHXkG-!Bwup%S zsgUw2NzfRMSbO?;ujQO$ruOV(^|f}qtl_8O-pg=Rz^8p_5cRhX=W2Rw2_T*%VqN^- zKs;Ac&cI;#I+qkok!h4kSZ*PFsOd;<>t!X5XpaWxU`{L+o5zhUUY9OAYeyiCVmHs+ zK657x>?BlC*P1nTegt=)>a+0y+*%MI$;yMQux$X&WH`-s)YtnNh(wZ-C)uD#L_-j4 zMo=UIrVjv9RL%AbhAfS3l4`{fu)VOk_gkpV*^R(=-CL<`kLr^R9*zUPB_R0nAH#ow zrgNru?C=V8tM_pD7KgQMX}^FASZ+}%YvN$JeGMWAu-xW1(2ZY_ntQZyCOWQQ`bNE? zq0#|;-f-776CVzv%>AD4k&FEFv~ExDEk}B{v=uVkYSWlV5xVrnJlVxY@hY8O6^E*( zrltiLnuP<>{4*~mS5=&_Azec8iUogU<_>rVt&9H^S`2o_hIY$t zI?a~uX}jRA=wOc^YI{vffX}P~d}f7!&u|Lvs#V*908exp{oqxHU3@Os1E?^lWFu$2 zbcKR^{jq|JuQN=n>n2jnL?fV*JjnNTb*Z-V{pRg_UyXN(7a>C^Gve2xl`jw1dz3u1p6#Ywmg4q>bAvwnbEs1natPGeIW&9q`}%%WvZFJDpd#lL zZ}N9%W^I3@aN7d zWxP2kwasOQlz>tj`iF`5wAgRhh2n2AHL8l=S(P>uYV(Uz+-w(}!`?jrD}?Qq#l^<2 zM`4^Bv(Fnngn%F5h$7RQ7G;g{`NuO`L-aib-}x_jn;gCd)BX)3@j|loRYa)AwOQZ^ z8<`{8_4g)oQPVPt%RF1x=U)qa{|$?u%M%rqX?tW_wqQpK@&jUR_Kc`7Kd1u`XvgK% z5H)#uVL9`Fhe6oUi+b+)J`AT+mv~}iHveHTD8fXSihky8-a7Z|5Pl|f87`1zIyj{t zc*nkPlqsPledNfzMs7q}g1-+E$2?|?emBx!P*PH*eJ|G`P)(Ol&qDNUyl0qKQBhg` zXNwiV{Dw>3$shmoMcsR;yP*cCfXcT)l#0HP6pVwSRPSx>tdhW;vDfcFkfs9bDr0-L zZrI2iT~Hh0hkqH--G~b7?X9>N$!{CAXzlyhF}*A(DBCK^27*dpGh^m9S;Or6_izp*DIuif~+dEppGuDcyh%$6Zo@H zNvqq>&z|{)FARHGBUVsxa{9+?-pai9BdWL`Zr^}6x<;RibKM%=JaN!Q^@Lzmk+bB? 
zFK;83XuD99WoY6&*8BvB)-U~)3bA;KM=-;vejgC6Ly}_f`FNdO`tTi>A$!775z-&w z8fbFDHBIMC2g79OBr2lO*@u2#XSHsH&F8*0XT9Ogcikx=0umXDSM`)hqVk^chvonj zb*tNiYv_ye4bF>ebpEigG-67#oX&8&J2st-WdbGgVk6VAn`;vDh6{>d&=`Jkfk7+S ziX#r1-xl5FlV#Nc#~tD-%O-l0-dg1wM*cDw63wpY`^8ad+vvRq_Uti<-jM1lOBleH zeSa!2u9{)jflYP|RDH7eL#y$a>6CGbPvx+tzlJp$jb1(J(_e{z7#V3`*MLJpYpf5Ulj&TgS4$fl(M*k^UviUGvbs`(JykR=qK?nMZcH}k*+Ia-&Le-3+Ehq zJp*(iPplAEp>l{ligtoMks5W2m1#E|q+6q{Oa|gNzR)*)-B8!w7n=f97-hzbqIcRk zX^kKtFj9u3%EqMSjtxJ`NdK@dUb`+pv$&b}x}=23@TZh;rD$err|p)lO!Mg}fw?D3 z0?uODKZ2Vt=YMGH(CnQKD?XXhUsXQC9>8VE&e$O^Ts?AAT)bJriTd7ia~;1sy{0)O z<(_zYs!^?;Nsc5>h3d{xO`qB?+Vr&5zIxoTiO$ED|MAkZzV!Mu2Kb{c`-M24YV7yq zL5DIhPsXQv=LuWvzFp`fS^qt@e$de6Y_Sr?;!(!AwO-G39SZ?(QLr@OQyQWerI~kJ6s&+v;LmFuYVArVj2WgtZ{l9>NSB;#N5x)vuEk2 zgkz)LBGgJMyj)ZJ`;Y~Wy%>+Lo+TW9|3r^Txq8@_+*_~u@FwpYHES8_=dmv9yGxe>REsi@SaCt`@TmH9B&0yX6^9HHNo&`%Uy*1)D6^EjDcIzydL17C zXe8h=p%!oU515w~47b{c8Shg`N`J&8KlZkF#x9^){Zq{YlVheQQ;pE!pOMB5atcMcd^Ky@IEjz0&mz}CP4Vp!vBc3igaC5so=8Z@Xo@#Y;tN+hkq+`*Svm_KU#NnT=<-vXCI*Ikfn=3o0 zrONU?zuTet!3hq}okd*HCnV0mk$`Cx z@O~xUQlI5UOU`@Xye4@@r1(abWu-f`ZzV^%aIl}>S0D61`uDW{4l36E2XL_?xdO$PoEkj)r z2R7i_dT4_h@B4dIjBF!5niI%h6d5_reMKlIr^t$!Nhy?VQi}Qg4;B(sojO&S`X$^D zR&eo8ugJ%~#ylPZBs%Mg;|gC0icc709x_!D6{yP>WUU2ntc`J_KH2Z*oTsx4Y7?Mp zhaGxQa-r z+qK>qp|<1b1|qdc;6pVP65;P|Vw03IavLy~Q4owq?vFXp$K3#sjzWL}0HnjuLm6%b zAsyd{)ZgLFM0Jh$oR0^G?iK1R#^;!W8ZCssTDjDl&<7ZMF+beqUpcG@8_IuGhXjQk zWH%?aMW@#HS(V-J+brF#OYdvG2MTC&$m^A0OQ_VM5A}|bCi0iRNaW3f!@n}7iXsXV ziOh~i(^mizSzfFLmHkT>-onTCy@tWaIG!|Bh*^=sq;z<9dY-Q6`Sd8C&mloK>!-US z+eT5%p!g<1BV$_jlV#rUjbRO|@b`trC0_wbTifNHjr{L2aixy|___a1Z( z7ore_*DS3`dOCM5yHT-pLw%OI|Jkd^E(>s1`@L3Hrn~zxthOi>$tEmV49q7Fvm6^I+U$c9+eGT9`WI`>LnQJsDQ5t@ z<#EJMLAt2^9U4-BN=x=!QME5uBDPN=$SU&w6 z4qPFRFFO^ci?7@EVTUf*^oejy>DbKdwJHJ0Ah1s%;un`1a@%wygB)HBRh1>wUsyMINJvL9T+xEwWPVZWpNk&WW(1-!ideKz9a-Jdnb2)v@BgAj$mr-Ag#oeiOroxW_1$INLv|)7md)h_AOo4 zuAPkYIEz}l>32jPL`bxB33W>WGS5S{8xIad!e9rku9=&1} 
z+qB>)@&PMLBhprw7P^gLUYne5p}%30Tfv>yze+#@(W-W}{Q~IPG<5J7!kqVywx`?9IxIOOcr3+T^AK)-V|F>hq9o&{K;J z(;6m+*SXPm?tt$^&4v3x4{XJyZW~X!dlopuofxC8FldtJ>Sy&#{a9q0ZfeSTfe=+F z8d?+%k32uO6d)vD$LQ5G@>9u>1X>+ngXNJ?oIgJQjLF;_>ZwMLMcvQR%^8V>VmUCZ zP%PKj*AMJAd$ABQ#c?-J+8kqtc6X7r8V*l+Yh8AUIjpBxdS_hsMfaF!iUKgyp|*lV zQj3BBX`60z_&d5$_1?3>*e>hD4B){6SLS)}asvYbzF3@9aotO~iM>CcMQ@6j(|{fc zjg72UQ;A8hNT1A(cBRiiPyHkWY$^MlH%xY(bBuEyF}k|e)j#mva*UaC=;VY!WHl^sMJC7jGNB5O zoFG);?^F@0@ZwMvKKeRC3CN3;9!BZiL87o0kLvr6z-NsH(X=&mdHz1AkcIm;iOk{S z5z-W=$=$I(1e_JKr!AT`C44&W&;VfPGF-_u+rb#GLG>bnt3P`@px zzOSsZR^pnq`UtvhWco=qjJd4-jTKz&Fe>b2KPvE~(+ZmQL3o>;^T9l&gk?F~b_D!J z-XBV4NF_#v;p@%x_qF&}?x964zEpiYGb?ZYpdx+tlFqbU$axdb2s}`X%>dCj3KZj2 z;I9N4E~|5X&?swX2#MZs+Nev4#rVdG)CfTZNMAWU1yF!=I`$4GM}~gMsBjFqp6^&? zN~nd>lQ^BBE7msLUN}a)TI~okQM6Evy*xab?F(|mO)*5vz-Pr74N$Gt>C zIu!kQipkI)kNjND9cx!0dF9>mKTC@DmiY&SaAV5)({@V?pi5O=Sjr;cB*RHA(oy znjuss_zu~W-qPy3j|Clrf{sR7{7b3ZY_KV{>3^&@D9x;bN#8zAH||U&Y%km}R`e;j z#QQ$qV8OXZ3z-czk8fJYRUjfX;^OC)+CMIRr< zMLXZ=lHSmt422rYdydem3qp4XHP-FRUC2K^<(ESarvWF@faIgaRJRpu!mNa1S zI>8Fe_^yH>FaMCtZGgy5ph{XIG7Fs~E@0Ws+_8Aulj(AL-28$O;|N1?vn#n3=EBhv z&LrZpx!M5ex?Ozzt~7TOW0G@qMI8h?GH2iSD#WMvYl){B*7{e!gDCQ)2)A$x!kE)K zw;bx`jEln;P68mx*I@#guj?IwA@WdcSUa5amFZ2ALK*oo5HvNX>3?flokTLCjDbSQ zw~w_ZtO8lvhX-Deo0x8|;0(bGMWxf@=HUh+Pa#!OmWB;dCHs44XMrk-{vm+Adq!Kt z8v348){m13-!s4YmNfJ|Q&svDanSd~7TDObEiLVtp45Ljfw~-A1eVKL{zzX>Bv>w+ zdEj5~6@cZEa9wVHLF$*9qAG)nh`3;uC(wrpxZpk94la0We^r)iAjF?xw+#YN6Way> z-Xwzn*%#nRsdwx+gR$E%BWiLRX7nb(j1-C(us)TWH&6Nqa4g&7Zzq(@_x1)T_!pdz z*Sy~BRq`&ZG5jtftjHT+8=C#RcnI47LU0lR+WA||=}uMSDE?CE`P9NH5LS>-=yHJCd4dZdM(*jc6#!xcUR9l>O=`+Hwgv-29guMUJw~X}v_OaW zdBJ5v!V;XvaAcstFVxLN?VEbn(u3rLq~O>%6@$CaIC+KxZXD8S7pCI|yCLW+AD*i# zA$9IuIQW0~k^9f))!}j%(=kXs^x+%hOlA-X0UUhLC*b7(G#As0NmjuC^nP< z*p478cX!`fY=+2T334Y~6aPH8wlG%K>}P-n!I(~b4*)nwTb9@mvY^rC^RS$psy*P} zj&D@!8;&O|{+Mt(guZgzi~MM)IE>a&zdyQcTaZo0e%^WSymu9pf`R0?xJ1 z<|Gfh|IFuk>>#EPOH)Sg`>50>ArO(o@ma+B>{dHjV;6^pQY~PCWTvb%2Ot29>|6UWxJ;d!16+~V3zaVAqe}R;_lt|hA35Pw`i(e9F 
zamZYV-R?(zXYC0De8T3rKG@#*&&&7ytMNp?DFF#? z0TIdta&N2^9XU}~9=`f*gybv*rJ(;6n6j^~0I)!#F*q=Fa|<73p6udI*5>gqfS2=t zcp9$ff^~XA!{3vbDuR4X#D#jl7y&I+a42RQzFDDL>3;rJ9B@rfUpC0=RuOg-z zsTpz?JSVy)HDJk5c779OpPun;!QwPG#--@}_c0B^j2J}+K>3EQUqm&? zH{LCS-OH}PnaYPU$GH+}q)Ns-AT&}L^${AW>Y+xey+epw>b>K`U?x0E555Uzf+;)8 zOK2v99Ck1JI8XL++GWbLE15j>8&<7Cbm?$Aq-MX{4x<5BiTqDHj5p+n-`}`dMG1%)H7IM8}=to#FH})4VNjN>~!|^a?sc?F4yxUFJ;r0C{?GTDiYp z2Yx>k$(9`i|EiMSlxXuT$agOZTG2Ra*@HC1;5Rq1jvJhiBhv&jg$P+_m2N{8C#y)1 zg%Tl_;v>dVLH{cyw7WUPvy?neW*|g*Tp)v?KmG=HD=~@lv7rLaTW;3d`uMUm5^5 zH+JA-_cBvh|4Wb{rcJJamQIF^<&T7((d2AoSV?;1v#;alQyu`SU6z~3K^%h`qF&rh zvrnrdm|&fWf8~(equSn*Sx>}T!UDTu3ip8bXT{C!4sPip8)X}t0brgTG z0V%B4VJxagFvc1)d;~t5(2sfi#Dh<|P~VsDDRp8d>aQVYFtV2yw#RhkYWz}Pqj4@0 zaP`O(Y@lD6(2{la>ODsYtWT!}xK|I^)r?2sA5z`Rr@67D(*})de&21YSb7Ys@nGg^KcC82({&hvLjwE{% z0qUG!#U;4K9tG1$d)k+d@S)UIx66ljsP??|HVe{_K(ITg^V8~K^J}p~Beyl!Xh?0~ z6C##sb&uEx*O$*q*LX`iOZpzA zeqNVG@N0r@`EV?o*RfyjT8o1mX zGEc*gb4x%s<`{3vWkAbK{ka&6f4AJ6trrGemk?F-`g$i1&t0&YVcGpNoP^;Z*8r`- zOuGiqkJ@n*iwTa6)#i)@ma-s<4qxRS0n9ZXVAt9ie|K|`@7$0EOZkT>7B8|@P`E~f z(6KfNAZ7vf$Oe~IZh=eoz9=c#@CJ$l_CS4t7Er0~%NL%Lux{vY#D7Gt{6R2CB0Sm? 
zz~6e-7yK6*4Lzn1z^r&A*acNeP>!?`9dLljd>uWU)}|7O4UeCQ`H?zq(@*f2_%2gr z;VZu_Oy(!&7Zvq{%___qT?jW3tEB_Rgpa1vCMKP!Ut{Le8Y0thS)cO9Mem|mzdTyP zv}cT5|6I=R_*4At;8J9@FfalBlS-lw~@}BPf#jdb%Zz|JQe0SKZH!oMZzN^UE;;i#~Ujev>e+HmX_@d5Q0J;e#%}CT%t*ZPA!HwQRA?Epk9z)G3$l;;K6V7Mj~eJ{u$t zcBt%#;V^@zl=_h2(u_u}n4Uo6;|%MtESQn{u87^?FMK9`Rs^)q6x0BQ2Y~; zNiqSF@X^NoI%M0NS9@4;_81mBAVtIt>@0RF>RPaK?A2top_rxjC=X*g+sztPnOp0m(5!rUir~ViJ z7C3(7_!G*_{d-7{cJ6qNjebD7gcmhJ#ZPHFCaCxc3T{YH{QM$6s%a3|)CvZDiWN@4 zu!_Phzt*tMtf1XglEY_;;_?sf(>1$PJ$+XH@q%Bljlt0t_Z{{ydT&&4@fKwiX1xolAswxPfy4NU~J} z^VaCPPsa_oCapd}?=XcPZgP{Byml|CHvAV<;h#6F=sb#w*x&!W16<9dSrYD&Uh)#u zYXgRoL~M~beN(8$YQdp94NM;z0dWL}Xsj!XEAMj}pJ&kjhH*Glgv`6OIOv9jMNLYI z53EZZ{0-Yi?vHLaPz3DRYgxJ`>oBXPH*KIi=V;E6(`$Ir(!w%Rx5Ssx)fsr< zs=*me6dBn*RC7;(@L_Oqn3+*g6Wq0WgFnDs8@A!t;Cx7?KJ2IdFWs#R-+(YVjhH_ek#39a;dEF0j^9qDJ__( zWGGN?`mMh@9#DI3)DOYI73k0}$iaHeTJG4P_nL>KSTBd;Ma*?A0!pj20YMpb*LF?K zk48>jv!aG*!g`$=s_PgC63 zHkUeeX1fz~?ier||KCo)rsyYo|1X_#K#$mQx+dH1fa^@E|KC0eD|3=o`9y?ZPJT_{Q0T-%l^(r{O~o0yM7alp29rB((p3yQQTKXn)4KHnV0iYff>oMJshONUCKyI?$A4{LZQD5PqdCc)?ZB@C zdtrtx{-~W+NMDztLb{Yb;YqCF{U)(p;V0pqh5dJ)Yc@Z@wuR8@HG;M+eNAf)ux$lP zU8IM$Enu!PA@PJlAV~xbg`YPzSTdmSvwUp91B9RI@IP0%?QoD05b(3$ zAmvKn<^8#i!1fO*J!rT^vd^-t-AfVkA2}(Cm_+;B*yi4p&!W_dwRvI#?|oIKdjYnL zIN6U@bZiWGT(x3tLTKuM9HXJ84&WFKnmR^{pr#HQng7b*%bI=}&mife;tE-~BKf z`aqDF6yb%nisF|+OFSK@%^`5x#d1ADv7Fjcsi~FdzI7RLeLP4QKZ5xdaDB|UY6?K- z?Mn~r6(|x?J3^Tfq+Lf^n+c9Yq34hVzKU%8?DCZwC5LVv1}2s{GQh-AZKmPcs&oFN zWHA)B1LO}1+qFvy+jdYX8mWjCG~}pccMjMO6WR^DN~_QL5dDYm;GGyFqU9M)bdh{4 zwZ(uha!IlES${X&-`6lZfwT4*vqMKhHk;dA8z;H(<^P~Vg=Cs5Y9q~M%|7(aN+ovL zrg(a0Q7co5y{TR@>M3q#59G5+1m~Gc)xH=ck%EUm?UJDwmyQdq)Y%IY?3V6?9pUYV z_Zg|vCI^k_v2iKskQTh#jX}@LC3kdgvDVPU#rSkVMCX+FIZc0Fz*h$tdt$KfoVMpt z_f`>MA_)}V(7{9yZP1{Z6ekFbp`O2%Rsjj^;!Z+~`j>>3ReNAlWO|#8`ioqr*Mdp@ z5Bs?t51n4KSzw5+HLh$j$WGUTR^aqjlbYJ21{jnJ(+9|Oji;oQrxo^(Yn$vd9m<6bSc$LaBau@(lV zmVmL=6%y!!7;6v+0mhn3DIGQou;y(;AJd!&d-+C%&Z#L5$YS+dsL4#MrF1h+FJJY3 zEq3Yn!?)9tE_e}VVJf1MH!-xWYUGNlDXO!`nfucNCC9(o 
z>m}hPS0lai_z8z$0cZZ^Lf={G_9>GQ`qr8Gb0Zp;ob53?VCz9y(z7Qd&1Tixk$sfr zrHHEi1l9El9{6u$+`CH;_`QhetCF_O63&VauYn)0Qf+}E(_)KKNfBr#66L>GJmb3< z)fuJRE)(v2i1Kv$BWT$=GoPknC*|c)F`NN-o}z7_B*`x!mnTUmI4*vh7a0I^=2pLl4$#%S%9&NvkY3Yv6@UaU#zEGRYx0i_NM6x2o9=C+{m8m+MdL9#*cC< z9I_DvlXOBoo?wy=?9~V+>G9g+^7P?WLw$AI{5B$t7t~5Ht}1|Uk00YmB-wIFHtY^S zu?}HP%^wv^0s@CqQyi}b5jZ9JD($$7ueeMcm`B3_qj}C6iqmSi<2^u}b~BI{4#jDJ zXj)>Csd%tG^)LR2lGnBGPX_z}SIVn{n5bJFw@cChNK!e74ke84=PL5I{3qWj&+>oR zQuarEsHaOJ_{3R`xIjK}(i}veIGb&sxQHLXSCTNw?R~&z<@;>bAtT~2pAd%RE*^k+ z{M+WUzgXQqMsB;hNx5ogQBj(KEcmD7X=-fnfK*diP(x9MnT1w5{TeibI;QE=+fDf> z0U4yYpP;y_;)!4oE%5IuiD$o>o2Hm*A=GU!D3A1fwa8<>d`#mD`+YN8ZiSJ1sh;k` ziu?c(llI*Ir+y!as{Kgjdd^9zhM0p?1yfCd`gM}*J+ z%XX%3%jxZA62}%nf18pGkkEU;$W>MeGp5^GffRU**Pvmjm?v>S2>+8%2Y#d+WJi^& z0Qs@6>-c`J&4(kwY1dwUL9oS@Y+M+1QA=BHk^Z?_K>q>s2+jl!1-&?3Zc$PfULq-3 zy-14lq(~+H$Wd=l9uCnrog~(W)9o6!3&cU7KR>M;9CmXhUmmQa_}{QoceY9`?X(x8 z$j1wT!yY7TvXzh$MJz6Fl;FUdYKrp7|`yqmz zO3R9S{wXaw$z`slh>~%-Ba=SmwQw~+wdGaIuK?cx1Kvo;cR>GYCBb)KEpHz39k}G3 z)bixU&m{Hc2|jTIk<>$rcI{I)C~8E}9mIOH3{Q(Gs{#mbbWs+S6kVL_)a^boWcTK? 
zkD_#S8~iiD!)JQ>@w2S4te$$70|3OiZ6$j4nJ}^Tn+%Ks;adl+Tb$IO^DBG@ty)(J z=U33RX|}1?K<8I5j!DmGLg!ZiUDW?5r8h!F8NxF0qRt>7fC9IAU{M$f1#Sos5P-m~ zL@9%WR_r<=3Md~#DO-Y>zg+sqXGK)9P%AI+!U*LA{aiqe0dP~9y?5BA#J6_#BG68q zojnYoqi0P2!_hOa)*`)_Z0(fvAS(YRb&VQzb|X2Wtr9rQ=*8?cUa~vcb*F|y*c|?t zq$E$Jq~NaG)k8kmZ3h~0p!I&x7&yCQl6J;+7!=BIrl6$7mkSP&fJ$2WtDlZTC9T<; z+|^!9!#~bEIS@sDbDIsh9<4BZ;W1Fzj4iB!1D`CHc?EBkt~<-M@t!4A2IVJn?_@B5 zgQC*lc)~$Zk|WSTQEh~SqB8%pzmhxCZr?8Se>PiEen~z|idt|QIKw?C?$DV$FTfw^ z84C>im@fhx%mxR)#0GG%YHKGyRW0-WAy!Pw!c|3C7Fn z^#*mvqBSaQU;)*tZODHDctEAbVDr_arB$Sv0~CcU{2Nf6!}5y4>L74>udd{F54Yqb zsnD*wEU$M$VuGJ0;%$fcu~0>Y%*r+=k(=B&c>2vHoMuZ-@x$&CUC?7p*W9eefSo8K1H9 zZ@&fqgiz*0b3T*v_btbQ6Eo_-S>@kQy|aMMpsdW(VWbvuaKXm-TMnJ7Tu8Kj`}5Y} zq0tfI#3&u#MW%7hlFXSDt_w#7V=#Dl!)wb&h1cb=jmhN z(>O<@IZjb_IuA*oBXV*Y&qkGNy)l6@vO49^Mv!=8Yxfp0q>)-Yj3_# z9WZXll8`v8G&vdHF#M-NmUO2)yh04NM}2oN8TAsQV- zd_ab1{Uk%QIpOd*JSt6AU5yvGT-GMc*D(3lJ8L1hK_wXcJKO=z4O;#l0i7FETlhx_ zbZ!u^pzMUs4I(mF`$KlA8tFy*S`ky13)3)#+WeY-O7yTZorCjyM;3V<8>FxH2xm*b zEYEvuFYxt>H@F5Ym>{rDdF@IzsflqM6%p69;sy?IQUQJW&;d>-e^^+INj6wcC%Kt6 z{wUlqY1I|S=?HY_Fu@z5d@AEIF!$*cOD^^HjY_N61Q%f6tJ-NAv_6@8RoXTCSk(bL z1?+o9q=?7Mm2W!+Mw^+_nd{hUjq3u>X;0wgi=YI(6rj=F*rU`CuF=TCg3v zN=p3p&OG^tqX<8;eiNF`LzxpU*~@cwR8E;Kx4(Aj_?Cm?W3cbbV~0j$2{F@uR|*8h zz}Kl`p&{2M89g2Ezj8f&C3Tb`o5_e%?sv`hhTn zaQ#3S0$smL-We^?Dr4khZ@9^ae6chClhfG$#pH@YoFpP1Th|f68*0blf}jPz(I^Jk z-9ih#X`G!G2ejaq^8_)rHL5a3RE~aJ<^y43N-|(ep)m0}bQB^KCVqY|XM;64oE#N& zCG)SjjCI#PbGgj6r5_QRU1;pVMhyrp1@O5VFZljIqAK#4|m)muJ0oW30%7r z$iF*72Z#>4G-uZANTfa`WLydQVsbSh<4PRyL!hbR@^t3-{&A@yVu3wu{=~Z7wsV)^ z|5M$0$5Y+^|Nk@i7A6e}DYkZm#QehI7t)zTU6r<9WY7J?gY$IRtaQha*QzLCG+m$)18x zNRP^Di=7o-^@lf|4v@}mqq9O*gvcUR%H~tiuBA^R&ax)x76wc9 zdE2esWN*<4MxuHLBQdv-67PCK2nB{5EHB^7vtU$YMn`^3eyhOCE{Z(4P<>=x_$-xQ zz8YYtGAAb0dw4dy;H;L!^sA`7YQt!?e!#~R46-8vnqSFL>f~wgY@pK$(EaPI3 zEl$etj(YRsS|4b->ySJUfiobTbSo^tbw2z3sTc4>1J2EKRb~LhN_VegES&Vu>v*pp zK72xh59~OvEO)kj0|lHiZf8EWNcQKZmbGl4&>}^5rbLr(p_#~A3 zykl5+2;@F#^$)p6`VnI%w*Nbe38~y>_JOC@(3vuq 
zLjZ_?i)l5TgK3(CPYTvy^LXC}0CxpUghBxsD2|71AZl+{$2T6__>abe<{LY;Xd)tPRoLr-*6}_eP`X3;uhVFv$oR=CD7Qi* zuE7iea-UbqGuOWe}yi22D zG2O)latJhigik>bGB;C5$6uh0FIv(giB{M)i!FcAi`LS1#hwj#JPzB1Y~i}HXSZ=( zC|(?{D<4`LP6=pGFa>4)T&Q4rLCyJ&C+vkWIf7{(h^7lB z^ZtICCj(dBf5^BFs%1vJgGNL5}#3wbN$BMl1WK(GdIia@XiaEd^%25^e7!J4l;^1(nq z^|`}NAF$pO8omMrSVlfjfW54@HE-@B(En9Nk+uw!yn(+AEEKH2j2^S)FT13<<1agR zpzm|I)00QC z?HV%P?ixz{YuC{0W#61*fKR0W+)IE@l>@n#@;}h@_L%Z7FNQ5s9q{NJD!yS1azS8A zWi{oPT^{|>2y#JCMc>(OcvrvB_)aPP1q|AUcPxBO3V)KmYW#=v)z8%(BJN#x1;jmi zuoi&28W8tw(>s7BZqiyX%t8?AGqe zkK0}My<7AKQ+Sa7Lfc+~A_1C(mw@Kb4o#wS!5kWpUZy~EXvsk1!DzOl-P_JtFjKki zE)vrIXOVDj?l%Ef$3!;kBb$!VtT(&7TdSLy<*)Yc>sh?=ON>~JX3i?w>=fu{#g<1E zA+HNm9)%8I*z&04+t9g^V^`?GnoJL*G8KccaOFP=e|Vux$K1A#Tzl?hw*gYvkc=AW z>;x8BRWYcu6SBx=K%JeCMV9`T7FjObU{v1uD5(*}?^t4mf5Q^1n2RghNdMa!Wj-Z@ z%jPz!Nd4GJ=ine~`}ZDwqHYjLhjFxi9t4zdAAl0>*cO)zuA$>VE&Tin0V1myE|F~A z3eD|+oeG-U@uLaN5C(HQfaGICYW=9(YSbrJY0}ft1R|5m=P6bqEu^SP2wnl^bnx4IhMM6B zFLoUeduO#FZh(18G4Ys)j~3B-Nb|tCTPJbfDq`*d?F7`Y9)~)0MZ|ok5c2zhfWl zO0WEbrt`yO4Fw=Cl-xvALga-8PyvO=3l;#J0tLv}Dsk>k)vv4s?}E0YWPp0I9rMW@ zIP?TiT!ga1&toYrL;zbPkWPD;e>)Kik6F^b!>jQH$OlWu3rP}%tQF&eD z#5r%KOk!K(Hf6!JTJoboL4*6ZLYKzgC=QZU4O-4?2JiMgPw(1%nxkQOkBzfv3lIf` z1G~_m9D6ZON$W}8uDmFb2+rNupe=7gDm~fl^%4kL*w8)V%5nlX0*eV7dUvYi2Zow5Df?UfhkHc}q2cxZYODhA3!lEo z(UEzXF244b&94GIZL*8#<4(A>wcGmX{D$}6a9L?q6gwQYp~!t5%^_-wSf0~3lqtDT zuTSPT=$7b_Zx46^R{~0#9L}}d)ad~P4HS7bJub(+szFpxQ3V(@X9LS{3;$ot%usSj1n3DYCN}RMYGs<}vPD;q0aMB^XAO!2g zRFVrJpG5=Akw`na4lNrqy-Q>=m@D6i*M-fSbTyDIECXMwm$%}u{<4gs7{^ez2Qoy>Z$2cBMZngOdBkyDu2#ZhlrYa1bFW}o0HMM-j>7W=)o*+z6 zy2I{v*@0EL3{BRhkE>&M^L~Lqr1I+EvoL?Ft^FKJ{gTdvk=x>+4uBSfLpBJ)XpbH0 z+g7}#@|@%affy(*0Rl1VTL+6k5SIYcO$kzZMRfLC3d37_aQ%cXz&?v#=?Bcad}=X4 zK<(5vWtg?#apwDFxSbN<0zLEA(iaB(7>ZFtI5Q|%h-A-ylW_L`E4T|rwTV|U65Fp) zfNc~C=rXV?$%B9{>lTU&E7>AEp*`gM{Y2*&NYN&)Jbwg;pF51A#pM_bwO#_Bw>3sB z{c?os(PKwBWB_3Bjn1p^PtYXE1Rb$lD&|ziaLJWCi+Y*T_v@bkDow{>i2{J+>>ZrP z-gg)mRGYRqh-1{!ve%Qq)#7c!I<2l 
za6vr`f1z;YeY`eE**Zgzs7-yfvU~luL;4qskTf1YvsuaU?p2cfRM|`3K=$ah15opf^Jaa;3hlO$l0p^v}`m7_~Zae6E?!=?*&1`^a zF%maFNkNGy?y(xJH_cT?N-(88dGg_vd3tYOFgPaDMlZ4^T!X3P#H3(2-sGs5uJ|Y# z=eL$ipa>YAg-Pn#Uz+JNA^`;(MKGZop?8&T?<|gstgsKaQmKm%5DRDa%3YH<3x;Wo zABgw^ZB<%C0ob!0EQW*cpb_qW*;NID@XDzs1t;ntFQvv-X?!y|njzE}s6DN2y4rht-d?dq*yo{i( zlfa9^W15mq1Z^N%br8YmjnC8S~qG6v)%CYo^-K*oSfKJiMf6v!Bm=p$;OM}p+;in;-t;7#*@ zfa83Hu}cte9LVud`2eXF1^`cL`L*O(dG}JuEZS{@Q*H7m&PTA?Lg_I}VD{w&=`kz< z<^fd)7-aKPYtIMzPK~Yrh(2mvPiJkl`17fUkNMnlQl(-sy}(L|<)}T&Tm9mvRoHlG zDo~04_}r)%b?R8zR@#gSHJOec7}J^Hi7w@X<<&s z)Nq8LS{eYp0B*YtQ}IQ#M>xCu2c7HFU*@xqYJWk8oVGi5%)KUJdCbY-bT8;?bSON! zSq*$>)f``qJio~i%Vl;JS9rM0J*@x2d&hKAIMr}d$s1W407hm}$1JhUrP({prJ!Z{ zFU_T=v`S6X%v8G3R>dVAQ)JeoOzObJ6=_K4v2 zImtyMAyo*rBx8hWQU|alD3XUL2C$_*AUy2C3J)v63=MxwY6}!5vkoe#BgKZvC}XT% zL_=XR0A2#IGz2-do8{W?#Yx^L@G{)(o#ircL{Oc4$&$<|O=0Sjtt2^8(aOoM%PnZX@PI7C=OsRcOov~jh(vC6 z-z_pQDozV;%(1UwfwI<#EtVZn)*7-S9ykRy8GLhbUG3H;dPznovl#Ua$A4{veTCjWuECUd|56!?p@?wMebYO27}x`kBBr`SfS z?Nh+bhwK8H%M2X-jegWY-n6z0{~o>zUG(0PI&#|YFFEuC2NfE}-@gy{0e+vO6(oRb zd^k-C0{D2V0(Nd~w^d;Y*I#rTOt4M^0A7Q)L2~>^5j4R{?Oo4kSv0!YWxE`{F$c07 zscKB0mnjS(3xMqtc8AlD1weXcL?;Yb0D9bA(*+YGB*{LW2!||u4LcS->c3j}JYgAX zMqp}nJ}#A9+T%y9eoC}!w0I?tB-mVhlw>O)-h-cO8xJyf3t(rTzh#Y4?Ak6brDDrV zYV5c~C5?%W?#hfU+Q2lZpwRfDm}reHA6f#y2OWA}_UZumpv{U)8Lpj66pt!99}Z7o z<0CZ>P%?Bsm;46pZZhrDG2$=kSL08Mmh_VA5uJTF#v0b!7Xdg2LD_y8 zmd$EJ)2BoVoa=hFfP}^AvaHbwGKz1#*p@*;*}5P>Nqx#Rvt2dXx-3G_UB}AQ|D-B~ zuc$Z3O0Fhc_CT;O~I13L$hE*rA7+s%N;G(u1#JodJBPAWXbq*YL> zU(DL0V39O+X+cShQpY%ewxZQO7VP0DMkEP}+Jk@uKMiMx-<<)6YdeDEmN{brQs0oH z7fO9Yj$R~4eFp+(4@iCI!*aTr+H%=#DIHUq-A2 zuU4urtGAe?qVO3D+ZT9(6=^Asd*pfLz?vsfa+NzOCGRHBu&xzOT}858mUx7qw{Of zZf;?d=s!Rc7eBU%3*S+(XP9vd9n}9cyGudYpis*jD z7;TZ`i1xxUw&C^Aiqa)5zA;hlH26Cgl_ltU z-G_j-%L0n+5^xT3ueY!Euh7E?@?)4>v5{tX9oIwHL^GAQ5rsbni9`isb*}gT^R43q z<^T&4TL8@A01GmUdC7}9jKzZZOOARs!67V2;wTy$24mrzRxa_CxYt$tC^7kQr6$As z#mtZ~T1Cn6Hi0f*@#<&FXiRm47skQ;EN(*RDh)~R?VI*!5hLbP9KA`#{2>7HG-uN_ 
z(NzvmEoj{|@exv1Z=?oD=`Oq}z5KPQxfX%AaQ{6i*@6_EU#bp*3ym(Satm8!>D&pO;H+DexKc-K98 zsngTfD0bqd=CD2e)7yoK=P+Etj1i9wR3iGiOnu^a99M4}S=E*Gy@|vWjEcLB5=?6N3k22TrH9Q3m4KiTbw){cJrQ*itTs}oCw0pWx(mM8NGw0& zmgCch!@MXdOz+Ce^RNT)(Dwi8-?sk)cK@*I<7nF*ZyRc>wK|}7_k<2G&EeL~tN%s3 zVY4!2-3>#IAQ=;@nLC#izr>&n?ZcM3*-5QHWf;4$pJW??Q$gg7YNaZh| zYkqr{jU7JoloG zdZgWt0=WQ_yBlfW` zY%Jcq`iQqT6pI&C&L<;*@|lELczRkc?m+hC1z-c=YeL|SuxYJ}b^NB)N5^KzWsoVi7^Wyvn9t}7t zurw%qF*bPu*tC(~L(7#~xGiQyNCPj;R*W4x-D~M&;_%~;w$e=UEyG?oN+&%U(td*m z7g|s(O*d*L%mgiqLa-4Y-QyrWEUbVy3F5=`!Tu6-2oN7ujs{^$5UKnlPYaLnQa8w# zyIb@bfJiB~c-U1Qsi;d;07M!+#M1XP0l#Gs@$Wz0)LgEha1xvfNVb@4h>KwG08RxR zY)nsWx2MT2j2LKeX6L#^|4uO3rhi+4CIU+={d-UE_Z?couHp5$vwzd5{q7d+ zt{fi=iPP0xI!#}ku9F^-Er&4_W=Km@79?a-!Va4oP69W*yRQaX(!FV9TF~2fhGstd z7}fW-5LX(pcY_Pc!ukDQ^o8nerEt0(aW`~84E=?J`=RNEI!EN|7mJ!he6md2 z&+wEa`VdM}s00mEvwN!yt5PIU(584S^?pnnt*~%!=sT`US|@`b6T6&wYVcJ6isPk16#j_aTa!ZMvBN){%@$Kw;dGv!WTG%DEq$Vs40<(V~1&E$oFFx;}! zBqXy`p$k13J2Y1(Ol2Y55djpvifq`x;wlbK4JYbroTFqYYCtP`(WgYf*Xx8XFD4okqmQe z8GN;$I^%VFK@SKme__uJyVgI0gBTE)WfBq!MYN=}STDUmHdR;lyFO?V7dkGP1d!K* z2=ecz(8Oork_>4R6PlIp>AR1AANG07xV2^QGba|Uvd-=VW7P)^>^_-$a&+%&YWJ+O zZi+4e7rdSegqeAssrP}=zH zURYcAkp&@1ly*_fNZ70At3EdX62>oOgM)U6vU7Y|fOrSAVW7Xf?H%KKyKBJxj!O7Z z`w~Oukm5B}M^I|08NaJfA8xDFs$`*e=2=lgHNu@kIY_b(Jct?M1hm}(rZ3XZ+*IfY~N1x%#Bd2H3vS`@G<9vAc-K29YR>UH4 zMQ1QqKjOtkO-!$Im+7Yxk|NZ4`j^dX_aiNSN+4!joL}-0 z#Ejj6tgwu7a~shn_2l|DyZBVW;CWE=p<5jESz+0jLKnkPERgM(Z$Uuj) zYeyZ`(WgB7U8IGVT$^TLla~(I3j>~c6@yUGk5Zk}l?LrN`Ixuk?@GG{z$sLEzxteO zU4;FO6;@vx0+H05Gb29fX{B}&)o|3YMJi7KaH3RQ(-5tsMSwpzdg{w8 z@CRG-6Zw`^XjPi&TlZ&p>14m)^VQ9pF}m5@pDirnZSr!-J0{3+`F$a`)=ILio^rku zqc{iRQbY5ngtK3nvPFQXN?{H*%~=9s+E7>%#I)6=*tcTZ)5>!?*qHX4L%We(c$^Z^ zLs5$E+YwkO-^+&qf%kvHE>%GpU`%A(M=UQZ;o8?77*r+BqPABAci7l~iTFlMuejYpM^SD(8UfT?;8FZvd&u zh@Od!(}J4WziJ|n>AbH^Vf_h%)bkgtmq9dG7(xbK&hdwkfgl=8wjB+ArrME~lx4uK zr>gxh1+u606KR({47RVPh`#Lxxi$~?(KO9L`yg(J5IpA?3N8jstf{c z?jfJ?B5*n>Ib`5A#&>h?hbHP2@3!VluuWTsr3Jy2$wi)i|GFfKlocj{WT(^68TKYlN(4EX6yNNB1U!nn^qfRR 
zx2SOwCbqkLllvlfM+9F0WLR|JKXU_KO&ircq`oQkf z1yS)Ed%RDl?`@(m@+{`18<8+>+`s4D-<2Q)@RtKA=fLvDr2_WuIj{lW*lB;df6{GN z4gc!EWvYUmPtAr;Hh_D}(!VK1w4J-~6-KxpoWy?M0b|>$$-Pm|FqkTSf`+h+NEnwM z3497Z0{b#fQ8u7N?f$J;2hzo6;V!6$qJHahfxq-$rU&+@{5|4l4{jJoijRRf<&f{3(+m~A5MNuEW&y-YEsImH{!_kMp%1oioIpI<@DDm_mK2pJ!|o(6(`5VYzM0Gxle zJz&*R2=F^5inf^M&#K`E$c$U&3ncXbW#OA4>labkL7^8%93+Z+Y@`wmrUhwY%9U$8 z#M^2Q4(ay=H#m@WH^qcS22he7${QUYQ|tgfRbJ)PHYDUz1reV<$ftVpfxQ&uQw1IN z8}#sBLAfh!af^D7-qg6ktj%NyVb%AokWEoiQ6+6m082fDO>*0veT!nctqYJv%|UML z%E=9YChPDVAd7w!)}uIuUZaa9lbg&T=)=F$(f1{S!)%LMCH62|F9RKBeS~{wp~Eb9 zc-ztnHICIMu@?FhhT4=FPT1hCbXZTQC=Kfw7_zl|p6@lU88-GtQ0Rf9B8VewRmwbC zx8(00z}*Fv%Bqr@TBi5CXtLs25!Ta*O2IlpAD;<^zMf=@g&w?_Ir+1inTznYl2NH! zuEvOW)OA9>qgR+r)S4tm6uCv`A_MK~-t?=R%^g<4 z5+136C%sdGh$|Y3xg(a@a^oc6B8+^DB<6LE^|9aL(-gIMFwqk?JGzDSPV%1M<54n% zPfM^ef(c!%Z*Xs;OoxA0<{iq?ColAA!w`!-*HC)U)~O&lSrye|OqsFwepjodMXZ2Y_-$`)l(Gwj?GGV2NeO1o5Ok- zj5lF8i>tHL&N0UMC0~p~-b7tkFLVM`&?JCPpxTAS+~5Q%CN_$cf1Hh)anjV7Y2_h% zRlk*`w2&mV^AQ8%zF>%kW`y}4w&d*5l>bFoPX64ltIsY6Cn4}k?&+zyK**Tuf2#Cp z)z^TIB?u6(Tq~F zTUPWUI$f$`S8d{GG{f*S1GT9Md-|!~lk;)W;BClJ&&sj2NDnXU)lu6?@D~ zAnFyWz1^Q`SXx;o1>BlhROPr8Ejc!L5t}eksBWP;d*7Yz*~C5awOZ*7&uI zTX<#Xb~RpI#wpR9+q$}MPEVoQ4dhq=yW0QB^XjArmumA%DRpf}0P(_NCEu>q-u$!W zPZ%!AIm#;^0}^aLX{*{)B&|qOv&@6&OZ+Nr+itE01K>Iy_vCiayHT)((hK`Ll%5-m zs1O%bXCYzfvROqNFB?*73LgmmjA*V@pp11zZgH;YMe9lPVVJ9fdE=lu?6-3~KCuL1v2$jg1_HR7+yi^gT)H4&>V{q?uW!WrUkn(JDPdn#u zRf`4&-e_+xRg_E#DhvK&7&Zv5P|(^`4Z#)Q^g0>)A-DoLp-1GqqcMt3Q+ui(zWfP$ z^F6@n<@feQhjnAq+50PtKBQS;RtXCF$Iv!xNjyPe;PBhl1;j!?Yk2~Add4vBPgNSB z@QvTV;+-PZ%Q1lVQ1HaNI`1JmeBNxxGt3N3qbS53?KfH?+p5~KFafWL3!sdo_}Vae zqlo3L%V^WT?v3sOqCT^2QJ>*|iuzbsR`GH^c=4GSUOA3}=2kKa!^I;c8WWN9{x_xD zGLqN;&O!aDsQklMhYsC&0PgwfWgb)P)*+1Dlk^E5{NfYmMZIgvRY{QOR1MZ> z4x!^6_ypUHkOpx>Ff9)N8B49uv^+$}h=Qi&AqWmXn3i9?tbh|mom1$;ae?G&A==uP z9okyWUukQa4{;uHJrc1RTl0LJ0&M zKm%rbeiC-RF2y(ks;;s(wWFZwYA~?qNJ7%xHXL!XELCu*W zF{rO<#z&Q2iYkegHP-E8???HudW)pG-rH_jW#ODA72|NDwap>Kye&^bfbU5j$d~41%cqKpD<7m 
zjCVCv2L(Y?aYqj*2-acwMoiAYHjb}xP_6b(37If=0Drn>uR31U`&Zuf6 zCMNbkfTsD&xJ}GuW;#->+lny>=sM3OhaA2a38{?IfKwO9qr!kwx48IYWRlHlTsno? z+!i4a8-v9X0zGyJfrS6GES&I>*mwAx3dcCvbNmC9Q{lj13h|p1Z*1|K`mp?_t?y^) z21r`v53nRHq_qep6(bn-)Lu6*goZtU0*HZzJw2l_NXS^4sxji(*<0{2pDs-AD(+urXlJ-if0^MrgT~zjZqOL#c0Uw!B_bWM&|q?DC8^BIQ|7&& z9N`o3p;2Sdyt(5#Z;QiKxwmSsf28`E$`}Re__6f)@v(pF6El-86qxx5nxIaA9;ulF;8lFPojO(6B+o%mp@S%<bmtWKImLe&%b}9cQyt*(uwYLWT-5=W0@z~ z)UPzHiGPcpx}aPlh2VG->plmySIz04b_f(i95%YCTI8ggm}9Y4bChpxgVsZyD&>P; zzyIu~A)*CVmx;Z+-_I$?TRU}$Y>wk^oj{DX^{2M|C@*P6aK)K?a0>X?#unJ^{)%-F z9zdP^>7k0r;RNHG-EAc~#pn8wi75f@Yzrf{f z$ydt1**>?QB`z^niGE&;d%({YiS9g|2V`m_h?}lftz|RW)NzKwH*`vx)z4fzsl+oI z;>SwzHD4oNHlN>OytRM5|9n=|%vw%CDY6w|*u~0nVv$|=hH_VXjscyhSYT2~Xkq7= z{3D+ijKK^(7tHB(xEqV|sy(FF1aA8q7!K$nFj^}GqT(2kB|tWCG1*nmT#0zsOQvpF zdh@-?%ZtS=b%96`!Qoy*Mskd;0@ICqjtu(ECw&~L3(%7_2SyqsV$n0Bo93WJfz7V9 zm>Oo?r)HonmD2wc7D@g^*igDgZA}@8X!)S1z=-rOTXLXo@X?dk8@tZdY@>eQgDwrC zm#6L-8{m4~S}8YmcXR9A=wXDHG8~`E(Nn0-(NEB+;1;bGSu17R{KMKAMi8o7@_CYw zg>_~D{=CV{c=MW^S6G;8Z&0_>vA+jlX>KuvKWIrNlxm506#G-9< ztejvMMz19G^Qb&mE~|Up=h@kh{Co~_!pZm4+NyG80<9T{?&T3_-kJ#O{oXTXnO;-u z-r+1Io#=*6ekvcxb77@cZ`0j~9hre_j_IW+;T^X`5qkFignd=~d_Ebo5OCe3f)5d0 zX%FpEqPMHmO+xsa@iL z53TI!{=HOIr&uY=QX<11=jeKSi;AtM*%Y!x24wHIZ5{7~!{t;@`DN#vNv34uxDe>hg>KoD zHk}sIG!j2OGd(?VsQ zS;YqWD!&__Vi}KdV4)hPK##$ZWce1!foh&z=>3hV$=M?%K7s{>GX<=6s-;w(w?0Ys zEiZ1~dNjml&EVZA#^l4S)~REh(w}IA{A8h*{7eI+N+oO3gd9T_9SgV`&BB+%xn+1S zC|+&VHS}cNly(XYv-x8Y!&}gAlhh?#mh(8oFvLOlWcJ)c$&qLEu42eL>ElaG5*B*| zqGSBX$jHdU$&NK2IJP+#{}ZO$KYc@B_rd=6RVA~$>nUes59MhDH(CTU+=N}l{h2-U zoTQI8PtB29+wvE1clsYeznx;MZ!=NqdU8|DEZM*Pf;&SXnJ1M)jmJmJ`~`Gk{@C~? 
z{j~VTwkqVTJv z5R=o+YpA}%FVH)op@+gCl?E|EJ^^41jgZGgy4@{OC56BF^tdA%&k;pnzFVui!hfej;;9QH>Y2DEDDm*al`1y(P; z^M13hoO<7XC_2V~6JW%0$mgXHF(by@9u8|u1q$91gNo%wNUkIw-Acj5@SA%N|JE{k zV$nIAOIR<&8-b2Bl%O&o<9#$F!r<03WE*E5sMU&8{FHSgajyI3lbagZ(?RwWLdGvs zhm^P^q*sK=gDE?G1KxHee^(s|SP-FGkhEFU?q(qB`*z(`6|=e3TO`zYj;HN!e%*HmBJ)G^7NIH&ZDQyoSAh{q_u zXJ*r+MRv(@iF!FTqkm2=R_0W^qxT)ud|-BJsArf?oZ{)SjdNCict3pBb?hfBc+oNN z>=&BrGXy_jpH@q1eCPKtkdP7APgt~L+KnPSN@S~Y6WJS6h;=VsPj;tAS_W`9+0$++ zaIuD01ZEgg@*q=-4Q;jh`R4XC#2H_v(ibMouU8zutlO-9J=IdVP@}m?V$(|LR0W6A zYG3|UfwScICWYfCCxz<664MeAq?yNuVq^SuU6KxxiO|Iem?_8;QxhC~4jR;rmM~@U z;(hVb{9KRqV~>v%dRZilqb*0s!-eGgBKlLub6%ww87u!VqffD>jcw`WBXY#w0q> zHz=6&TT_9`km(~)qv1_~&9j_Q%>~*9sMl&;GVEz`0w04UyE zhW=h<-iwLh@Hy=lN4eO1E{e4?`_7pOFqFVwr#()c0K zPA~R3H zXH(y}*GsmY6w{)At??*m3PA>MeIzDypLYr!^Cq{YO*+xkZCX5N(&uzKkwhjkkSN(Lo~t!yZz04G?B;Hss3)u^zQD9M+^%ohW`y+Y$5GyA zzRLY3Q*TlWY(iei^L`fX4oyB?8tknyd+#Sqij3d9v2(#?{Ca4ilY4O&l5~zyj;WpI zGuUUliywz{J(rwhOKa09P@CoMV(!+HwnddwS(phkqVDHv6xVo$*-%m(lhwW;*st8G zIe9nDCxbiHhnRsfaGH`(ml?VY+|TeKr_@a1UBU>scf;1QW)%Skp;N zt=kmc#`9(UeAMP?F$IU49qV@Ss8@&&!cBy#nnXHC~zMB zr)mm=BHEI#oxJ~P&3j!@cyd^C*o@qoB3;YWv~RJad*70^Ui1dHHGNufu6dYC;H1qA~?8aCkJ(Q7R~$rS;*Dlcu977eytnaE0b;8C!A)9ql&g zvJr}Cm_yuDyOWqFTajXL@=zwcdqx9@*_xsZroUM^UY>C+Pr94eRxRE8oiW|&N8;<6 z7}S#84<8ZKdq@9444j$m-k01Q3RA@+pOgERhU@Bw{1k0NOGQ59+`)7uCK|8=5fa+l zsS~LsQ?VN+a9LWOr<$S;xY?lB@=5@0#!rdSq2|8hD4ncHr?_a+W5~F`A520nAi6KS zojf>Ew(!`x91$V*xL0&!NsxUqy4fg_=ndjDwvoX!3^WUKZE$iooZtBoeKVO2RxaPll;v<$t@RuWrZhq9b{ox7o#f2bS0SoY36mt=G+rsPJhZ@zM zLF_O0zGE!9oF$xKJd!fULyuZWV|E~Kg?#Lgo1G^snmz$L5_JQ!>uYOhomQKFYd-djsm@{nG(?!%mYo7=*k`mNX z7DTcv-%UPQ%&fyz``MND%9G8_xSUTSUlwA%Hxi{eNj67 z^B#haNvn*cLaQC|d*8vxe)W~M9{;_s|L$9D%j%WLHR*?exV5&A-lS$k^2+PpmIj06 zJttu>Eh}E{fPKwJZ@pjGy^Q|tbNtB@{7+BZlQ!3+_;o`oP=wH#;A69d0?}yTcvVNM zWf+8zKdTQeFq=jb=57@l&0sXxCP#B8I_-E#$<#3H8}KViv)3<#tXL)bY4cVj&1tnu zk7&y?n+k<}yPuVv4&n@*k;$4-%5tceT^xHQlDNM7{x)q^ko>5Kg7 z{y%rj{^_Az{~)%W0AJm9g5|r?p#;;A5cMmsTsvhqI!%u%7bEPNc*eBj?n{>&cW>y# 
zw<*2sTz4}-Ag&;hKS92-?4va7;+qy@AjhqXw|+)!q^taH;sm@qLjRK8E0ON?qZ%8A zv-xB~B*!CUTIXN9cgN+7y+G1)O)fYu>#VVMKrA?I*H0q z6JwA}d}D&8P2F3DoUqGE%#d^La`9V3&&MsF?{$;5vdyx(vu4;edVY>%yq~{%d_4PZ zt+AR~KZTqOnaO)au9lkKS*LFZ`{PwZ=f6HnkWzd(v6gcq!j{*rQN*3jK+fThwoB=r ztd07HdS9hJErTws?vQ5FME5%(t`U;07gMe7CEf@X@9dg*#rU>KsKxRYoIc`E`+}`Q zqXN%$bZZbA6I+jH+|43B-TbYpLqx2`|`iHzj6WAZO)9TnJX z4ZFo31xJT}?qpBXLeP;1cT8vpT7G&)GtzICBdFR-oaP!oSdjOH_adlc1gG8@+-K~s zr}+~Wn=tr| z_V!ryO;%SBW74#FuKvC(Gx!AYuvVaaKH4d)h2 z(Kp=@O#JT03-W^^UpLDt$d0FlCYVf>QEZ?Pt6Ci14^)4`comOqmftcwugQP2oAW8} znt#l~Lmi5*S(_GVXE)qpTh38@Tgqtbo@CK86ol(oQh9cHJRh+x^Y4_N3;~Lu0ka7y zy0`A@a@IvR<@7?v6p$xdyu2H}R17N*l17F_Q+M=QwG|5|Gb3w~uC7S-gULmajew@F zPHz;LOl?&~SfkNNYGa(z(X^b)=m^tF=DP(38|oUVO42KWd}A0}q|-)B^H---1tt!g zBoW5wkR*-KbZOdXrsZfqMHoS(c}Z95G4EFtYxXFawZm+Uh5dPn!#N*}JqEKJLi<`! z&AeIS#Wg;no8^jpwUhnoS)La%S(E*VKc%mslYYYJPPM1KKp5ALy~1cJwUwVDUkcRl ztXn6M4dsk=JXL=oL(;=Xh5u{m6l?nWN9qT#Cj8p`=UcybnDtOvPkH^RDPBEd=(|l zJG+Goz98n?MjLJE^v+{>F}5uT?ejz8AWPWC$4YlBFDbrxUZ2?Zww~)kuW_w8rq*^~*)Z@%>hBJDFo% zJwY0Z^T8WMmo@Wkb04YyQas+ys5l*!P9JE9%+xN1$&_9L3|d4}n; z`!I#65hcwCDT~B7HTdgPKl;E#k4P!yWV65m`{=ot6x6**<8N7Xl(gwwv}h|%eG7jp ztz#JlvPeV0{PPXt(Uxc@(nQnp>?arGbK>Y$!V*W4P>(EG7JkC2jy}0zkY|c`Hhcy-xKFghAyKzC;-oaS0 z?($83Yu{jtX&Xj4k2OM>aH^MqZ@C`O%ffwSqg_T+tOK7T8oA>8&0VU7$Gqpni%S8* zDuSljq`+b$&XBf1gN0?}Su(@rdO!ZxKVg1?s1>S__CJ&jCaHzvhfroHjt3?-YLSb) z;}6fKuHP+~uun<8!#UY_-*f${SN1pS?t3Ve>Fvli9$fJ zox2{Lp$&lqxW9$ zj;~4Hk#~C?xt=J`>RHmMO)VSm~3Ue7{pbEsafhls2+Z4BLInz8y1>G8QwK7klU>x*uqfPj+6RVF zioQ?#zHGSv~>CRP^dYMW(uMUgN%dQjT;-L6YV+_SMa{wJHv&R-&^4e_}##r(Xi%6-AbT`g71 z^6L-F)}D?lgwDJ6(Vb^Q|NozDI&U2n3e^f#BZ3-9rd&!5tFZ-CY6%4<0;Na5^NoySux)H`+i$Pv_k8 zoqPT}Gtb;}|GD>>sqWfUy?6KSy-L=*)?2G!^RRUQj+~UN6aWDM0FZ&d0I&^!wxp+( zIRK!j2w(;P0M7x42zUS_cntym0)$%ukpET(0O;W*0D$xz;oq-*NBrw5GXHm^zv`%+ zf4&Gi0q}pYbZ~WWv2<{xDSQ- zVeJ6i=ct{?{74AY07P5_BwPeoH-G~EL?{S znB)~5Jp&^XGdB+}AHRU0#Cu68X&G5LHFXV5Eo~iL6H_yD3rj0&7gslT4^J=ekk6rE z;a?&m6B3h>Q&PXBrRU`r6#ghGE-C%>ySk>fuD+qMv#YzOx37O-aB^yTW_E6VVR2(~ 
zYkOyRZ~x#Bd~tboeRB)ByZ<8>ygmO-Eco%?B>N9?;lkxYL`FtJM*Slf0;0zs!EupM zs5qYCiL0P~a>A$P40=xRE?_X9Rlk7hySnz*IvVREn z?{X~xBmjtiOGt?DUP3~G_Y^Wbp`fDtNvO|J|CXNrJ)!?Cz4((b|B+zun;^jN0srwc z_!|=q7409_{?irM3Vc|;hb;mykPzU52?-Y<26&9liv(UFU7qJfG6MefY{QwCVoZQl zkco~^ojiZI>vo2(yAS4MJF^f#FLmZRJJ3J-0yVoao=yLR^@tJj6*#nNbxLQo=X|Im7m|1bULY*O(MGw%t9jgsL5m5Ti|6LW2u!z zQkt}qF7lu>V_U-r9nv9u)Dn&>u6T24B&2JwJU`6JqTjn!e`c7U?>RVoUeU6DVJ6nC z+^>8E%IMi~DLRu-_YX-1V+vNOl%LSFDka^AF{pB4iEpdV?9}HdGr6NP=0#DV5U5Cg z{r~tOxt%dD@=f4>?x6V=E|C!BAL0ISNPHUr8~@)JFQ%d0wk9MWvBrI}rx7q`rg|4>L|!WaVwI_qAiM<3KjI>9IxhS;g#;+5#6^fe24-@xSXq1?@YT z@fy~^HMPs~TahwV2W(90YJB9!Q!C4Z*~sh3W8X&jBY*3RWc-f!CnpkAf0((#=Bga` zRrC0GfHLKk@+feazu02lr&8A%H(cj2`R8$*!MERE-$XDE>|%L14qgW;GVTw65SB3e^6+zO1E@& zVtuuWq(#Fdy!qJSj}rLboD=`_(_ep-{HJq2!cPF;83<63B>2|^_=!I! z%Mm0A)c!u#nP~s%!`~DD5^UmsqmLPpw%8)MG$6IxdA3pQm{*}?On3l%wxCa># zHl-vQl%5|L^oDMc*e!gVj)mN#o{H9Vs(!TK8TryBRc<>vOfFjcvZy-mcr3sC%np=? zuO~yRwye>-+l(qVd&V6`zw+zzPSO?6VLC;`n`=q5X4<||{87KMU9dLVDtAr0-}Aq>@t=17|3gNAP5y87_U2h63~(W8O>x=-1AL#$*2;xy!2t1> zAO5X_n=jgwhuT)G?%DdGi4;#@wV5Z=iA&8-jSbal=5@{m!hQgl`y~XN7Gb>$8wg9v_TZYL56H37{-#D{BYmFqgazrKeFJ^UkZ{_H!T72_LQ#(st}$^VD4NRQsHFXIdkAnrt~8$IT0$`32K|pMQVm z;Qw6y@tc{+gGxi3HXDH0N;~N<+glMj7S5dOfMNUta z&3xCnyN@2kQ!odDoVpvw+(u|R9S{*&1Oo&y+;JUP1Gjw6miRh;Zd={I%yrv0l-YQS zF&OIzh#LdD7*1b>PlSC-`9KR695y#ciTnx!FbDc^71KhKVZ`NnfvP@qe@p!3R{>)c^2w`y@%AWz|iv z-kBju{?_{7K5I6IAq~mYL#?0ZK+wLytT;4=)UO9*>p@O1lq@q zUo7v2alR6gwNER83rcky`cFvk&uZYmt4_`Xe(2fiXnVnvF~4Rua}}^K+Y*=Iik={X z-upVd9yA;fwuV;&3cVk`KMiuQ;hDbKUE-6`r{)6UeP{0wa>`Mdn`)~bE!@QZ0t4W1 zuZ59gSEoY{BwUBk6MmdJzOZ*_d`a6*o0F6>;umlZYOnGYJQq#f_%QJ1X5X|iSsG7Y z&O=7J82Lo)afa}rRwex-zftH0h8>~q52i~&-`QK1)4Ker_mj2FW530SMmU!paKW@Y zA*BauU91{HMr*Al)-^Lf&We~%<3idtezr2e0AoLrVE}&$7$8;!2EYsji#_>qgC5Rd z0H|}-4gw5d?To4nk&)g<@h`;T^gJ^Re`x}7pu4Yw(sugKq6ZLG_~*Q6tIQ6kz*zST z;k)mHGL4cC+EZYj*|*`<{U{2C2yOaPR)Ze>2x3R;2}v z7KN({5UJ54@yyP77?7nXao20Al}l8T zZc@IRd&&tzQ=Q9Lz=oy&l-T8*0SwTN3Il9u;P~g%1<(wGVWPbu3 
zOBEG?Qi9uJ*zBMZ>ms`HKA*pcyL`OG#!zKW(0or52;B%;k9Ou58)CR|gj{-1J-mM% z5UD6BBbJzGRQ%g3kAZhX*P!U5ODIb$0wo#7H7MM*ZPBM!lmZgN7uw2rkfQlgWtr3} z=9dh}R`!ne&1t7hKV@R*`nE}--yV{a7~U9)FujIS=3}&j6`TJ`a{6xF9cw?Nm2+d1 zB9@c5e9(>R!~jHeYBYMV2m|asWUCMzr-J>@oxyb!02 zG+#B4fmOm{7p)4X?htSwO`EaZ@~a6$w#{`clSw*v`b%uGND>_-2__O`ig&<0w#B+9 znH?D5(QhGOf2VSl46G;8XAZF?w3kF1<*<^@clT3Lc<2&NMNQG_QQdX^OgpnP!zQu0 z7E1B^sR67fnWH~-EwA^SO4n=;XmwA4z$2!;uvrpT``5;ibgZX)=%x9z3Elv~SzoJ7Wt{cjo(?XrwX&r-~a7v5^ zJ)Ata2)X+CPs9NdZ2JFQh|BZ+Uo6Vg|C2@ecZmDr_B2o@L;}8E&Ni|zbQ1}>*d}tl z5BQudJ=xGQy$%C}nrNn5Vq5;4uDE`VJV9nrn%G}~vD+*+d+#HQ{I!7?BsJ+PZR1l* z65zqKl^xc6_|3iMeYGnb#$8w>KjuZgbCT?4Aqd<}`8Ti5tJqL71|!5==Qw_swUcda`v_MA7x>8358G!;oh{ljB&#b4_>=&5Mu3$G@s5*4h^n zGz{jcnx1~0%K1>~H%UaMqC9FCJ;O_tGzPREx4oHJHWzD1xyUor5|`^Q{ijV+zjj#-a$OJ2EnUBC1T;+H0YhMMHg zRc{u0-%`CL++x^hJCXTb6|N1yNPH~Bc6jgvO*yY&|D11T5mvMvji#f`0VdcP<=>{X z%qhl=aX?+ZbZ%ON=cj698ZYjuxVs;=tTcHno5zr}JeI~2?J{(8`8>Ao-D@3+YmKdl zCHpgay2yZQ+cv=_8-?{b-J-UY7%t3;yY%m4c7HRD>1Qp^)xZFqHIF3$dtp^@#myLS zN8P5db&A<#PWW@8BCb$&s)%^ej{YRT%bJvD$SsiS`o9ood0ohdL8H7Y2$_1yQA&1CHyDZthmR}To_=&S!%JmzsEI>;;`S>dg8}+O2%m`V@t*qDE)-bXj+w-s0+X-Je2X3^0tVO{ zVSt|nFu*;c){KHS6Eo-Ab^PcY5O=fqpF5B|@rpi&JAcpcq0ilXLmt$j^bYuzQClZi zIBu`0IqRzOc8aQxnwEP8U;x(~xq;~bTOT|6r#<;;UlKQznF0{TM9qDeEDfDD#j-nf zvFwA@{zdPE{QBL>Yq?(;S5!&SIMVUsk@vB$)>qjEou+@kp?`Vv^e809<_5N%+aJr= z#A4adA45@N>oK+qI2}Ex$mwbf*oI^1FOOJXiA^>%#f=r?Sud7qyAq4ypO4G&WKY{w zhZ8rw{)Sl3_X-}%0=8p~V1T)m)Tdb83l^c9p7YswiKcm#+sAldHl^tH)f;h8P~P@pC}(H67?$`vm|1#%HnM0r7t-&3ZHmVGJvw(hOtYhwhWRb! 
z{hYbb=gUTY11*6tq$Sn8UD;9edmxXOeRk>ob_ZBWSu6BlhqL#AIkVRPIICvH>UH|W zLU!^4uP+e{@cD%lISUwBd#*-Cs(;zV!XA3OR}jl?H16A4b4};V9$*)xw2GYy13Z(5 zkFV*82b=fDcx+B37hi5o9v4nng22*1o}KGUwM)`#uxjbah@luQnu z5Y8$zLuP)q}93Dr^rHD|$2N_n*u})U$z+m89h#$eW_mt%%Dj__S-= zKI6O-a~qSJQGE_#Ro^ajm(V2io+{4gzQG^I)J)$cer9G%-bF%Rc1UzS-)Wj0d~e>| zrZG#`QosB;sn7d{bGXto!bQ<1JT0bg>c^d6dSh=J%eaqc*Bh@snrHPLS$&I@z|^Db z(ow-2H6y2xu6MyZL6`iIr}h`Xk0tBz=|@mpN~4!nq>ZHY$*s}tuXb!o&UM4UQt8Sz zpFdrrQa4@bhu{1k z$%Mn6a|R(@qByf|55b7q3(6b=y>hwjgN#2sLz|=OTOL0!Mfe zD(|qh-?mduUXF^9bf?a1g*D#Xpx}w>ogKpRoE69dIV%1h&fvx1{6LGU`(@$;Tidy) z>ti-lV`B`(^S26kE~>J*zT7s8CPwP6DElC=+h+PJLE=2ph<|~l?rHL-0ri~01S{U* z-~|V*YHmN}{XOn@Y;T*`6OsSu?2{-&$faB#*Q&y3x%yuHSX708@<%TWV653cX3V7z z>IjIupX*R2)Q52Oa>%92FMqRN_dWfU!_P%|S_BE)2p<+WdJrQsC03#XMRuEMkilt= z!HvR9)d!L5wlSqSXhz8IW-aUwT_1)IiZ{wC%|u)i18jC2Qd&;#Nx(EuvbPap$0Tr# zn>XUC_COfW-?iy*4~E0)M2wLY^nHRr>ub+2?Qepj$sgr`{X@A56cM1<02?uU zh>bSH!^En z&jpE2i}$ClMpd`a#226kbY(c-N$00y#PlDK(qt1Gn&deK{_Xz<3$fr>czK2cWrR3w zwusj_;6VIGFTm~CLQJw0NJvia3crwQOsbC;)38?lE&qbC8wHc8Lw-_Xe< zLP72HNT6vEZEBvvb?J7Rn`~n;`?<4E9dYfGyKu-80a3@;Jz+Tva8?ok+3_?wAv(|B znETvHY-nRhI$m+imfuh#na)M^5yV`Eo%Us$%oS99vLo`_jgxYF!;XXx!7(=~Lt2wS|z4)WBw|^^8a99zpI`F~eFX zc(P26@9a=1fQ&~*aDTNk7$N`zysOE1yk>&|$UqE_u>spzhekK}4vzq>+3Tskg&Pa$ zFHWw-n-;_+mPy+Q6Sj#0o(yzDrQ6!p$&TkqnQrzRc8B-XUZ_w;wve1FAe=f}Q-uM* zgH=}Nd>eEz(|on9#J^>~yCv|7n%KDspki#3QK%S#Bz=AB<9Myd2vyg%Qtxuqf00!Q zmGcO}0KsD0aIN9lSN_ksBLFu0UqA*EnUdRBk$rqCVAZvu*DS%Z425pb^mY9S*;40= zfk1hri|9GZ+uOPeF7D5Pj(kpWMKuoSSi0GoK=xq$pWZ3gOG^>+xHPm*a68CCtm1=m zngsrcB=hK!rZFih!|T36@tbO?=bU-Nr6AF&wd z_CASBurF7v=q=XLA%6-iT?(RXBfhTK@T|ZnB%~`@_oqKMIE|vpo+mlM7HWUU?yb-J zMl4cx{-pQ4Rpaw9P1KPo-5phght`s^5bs!n$nlQi0uzDMGkPvB8xN0EjP$mVCVB$ zMC8xg%B^?cSM?7f4}mLLjjz^aiHn#v8w#a!oJ8WBbn;yBP*q#MgLP89t6%2Etx`Io zx)@s61h~DbDdjLrBqn*NDVW1prxL0s+ihZR&^M!FI`*8;G>NKGR?GGo5Pk^bH|b3L z3KW!yca&ZMV0X{VQZzsB6FAV=O*Mb}wrly^=CDMUNeU>@UYuxeFo;syfOH@O=_iQo zS#w`$lasi_HJJF#Ux@4OgL_?>*ijj=Y4eOBNN^VLHvY5%gjQnb-X33_5xPYQ&TiJq 
z$moUWEL9!sph*)xd&u!BK{iz@vGzN(Zx<>JGDi8=O|6FV99ho}zsW08K;ONuT*qR1y9s65b(M$xTA`6lBb z(>LYj@U2N~kcMM%+gson8-f1_IXSp3q6o|uA5|e=5joxi=N{J;h^~>^9el&tv`S6+Lq| zN=>Y7t;%Z?>4$`nB%9+7|D3#?nuP(ZM5a!2ps&XoM>&OA&4N;lzQonfrabU2fRrJm z%A5tZk zG(E@K$HEqKI@g{JY@8S#zGzBwplx8WCmllHcuVpTP1vO!WS$oZ-QpUy*#c{D)Terl ztX$2FKS0|Mq*21c@UbeM16dnUG@F}ircUVFTH-&|)k!7jkcPfjN$8u$Qq=j;NLxH1 zmpjp+sR<}!BBl)KKz1YSBQ+D?}x|83q9GgA$y2O&~E3lvxt08JZBU0F-i< zs`%=Mkq#P$1nPJR--iA2XQ<#;Tg^NVq@KSv?|H$76=r6pv}VxwN6*xK)uf(o@VmwoUFmZ&BSdL2cNdwEThhxU%W6 zN&hBi{%O!BAWTb3wM4yfx{zb9Uh;$T6n$%Xxe*q`NA~@`n~Q8Jj-j%ktse4dQ>$ zAlR_Ee+{j^eto7)7*S^9-e`T|j`C$xXiCMwgn4E|*3fpmsU>WJPsrIDecMe@)us@) z_g$q|pDJ_oT$#HE2W@O{$;*!3DT6CZx-!uK+;oB3{GTq>Ap<-*>)ARcxQWYn$^eZo z-^da;GzwZ|XuB+sEvQ`LDqLw3g&TON6jCX;{l0Fwb1HI}B01LyU-vS1uDl1IsU`C$ za4ySDTCT)ZT*Tih;@#6<+YSL3~+t4?`0mkJy=%T6&;&w{66?_=6eIu zpnq`VIK)&YY zz%8y8mK)!8r-4(U-~0c9L#C}l&Pc0>vUG3j@c1NWrx0CCij=~iM6a7>3|_yh2F1!E zHlmqeAoxGk9DOpsv-N%AV4#m5%X`eTWnV;0OfRF&$YoY%``EQud-{PIm%_d#y)t;d z8-m#;AewxjmQp7?7&+L(KeKI+B`L|$^j={78p?cdk<9ND+efHHxhK$9QrTbE$iMSa z^Ab(5!T>u|=G9D?|7~{jqv9ude9Rxq+f{wfYPqFs{H#YzQ9V<*?jv{mdU@C{qmDm%1$F zp9s#33&y4NcKXM1#b&=N)61B?f3wu8R&P=?S&(dK(V3I+bCgS^hpAS`^h>bIs52v~ zXA`Ftm81;xC&5QVkpn|~ywkV}aSx6J8FEp87&{?bIzi7Reoo8S3vt@`Cc%?;z1vpS#!?M%eSV9izHf!M6lmtj0KBWu-HC$;G{*MyZyxJij~!q^`~n5bO8G?8Hz zVP;P^<)f?bipOG|hF_GQ(aSFPeorQg&)L%$3~ItV^@vX)_1o_YQr%W@0M0k z)`KqKG^}-#bW(Mcu#m1ueTQ99h&oB7#LMWD0ftH&U7n(6g{W1TB~o%D6g<}622XKf z3pLMm>7QPTx%Cy7AKvR}Or5wDzg+qdWAtg`%O1njYO_zbC|1)Xy+)toh1e*kvX@QyXEug*o1(>SgGI%50N$@AMM*jL&r8M# zp(lRIhtJvE^yL`yg5fO7zeUgeOC>N391v+MOp!ZMzUI5cFLR5++V`DeF0#=vX*(Ci zefV%DhF@T0fyU8rF(coYPQsDME1KdPTcwCVnZ}*$=2`SA?Rj0d4I7!R(!icf@9vua zb^^7d`0JZufa`(L4^a%TdFoaG<~$_leUhSuA6|b$yCBM8^< zkc+o=9b=yxB)>E-DcY?7^ya$rUlQYw=;^Cy_C`aY^0;$Ig{~=$0A$jXQ6H?Co`^oR zyc!F8QZ|sm;`5yuolzV-bP{9 zL^zGjOiB@)Cw=}ncNgaKs3jLnpjTTmsmQ@s^# zCep;Q@+(n!jz3-MYP`vg26O1pQztm(%s>!Wg7qR+Wvy8KjnWW z?l0JBw!~ny&xU+!Km@gdyYl^^6!UStj#dgk;gGfNMe&6l!m^${3?Mh!GGot5i|qSF 
z5J7v=#71DR#Z9$q&84H)952x+ETsT7{=wVubk}I)@iIXUav$o zQXjfV$ftMiBCtF`QXI0{U<2t(<}th7VLE(@H!)3~?P8&pw!u6Xb4ZqlG9$PJ3c}UI zA3RJ}tIXpIn=F?GNq>a|>XBdR)i*S#{GhO0G$p>`A3IR_mYaARru@x)L_Mq3?$cX>xq`N@1VzOYuYB@CC(Z)3m!x(vUB zUXsun_e9X%kk260)kcIb6tFNI#b;_5ZUa+xZHg!*$UBxZjlZYpeB=*2aqr!V#bW01 z%1^vacs4d+Ep7EaupU66i>{aYd|0mQOom(lhqQ^ ztYPvYtktd8R+1|VF^8IP3%g0iY@sSlf=PHb{VR5g6vrBMsU3Doy_D9dGMUMdiXP$^ z(^tgYpJ^o2Srt`$bMG#t5^VX90Yx0eM>(oi6MCK5rlovdfx+5)Vp%fOD9`PBs(OiD zd|*l-4WP;D2pX%(yN^{l^@2x*V>&p$+EbziCUAa8tZB_o-l}_fu{b=gPZcx09!3l* zD||e>3oK^Xxr-Il!~$)saOu=?7%{C)%aV`=-OrVT4(}-zkEuIrb^NIG9rm8o60d-G z!vK=?W?5*Tt8fPv!fo$gbu1kiNy}AR1G~X{Z zD(ncZOWY}*m!W8OLLz7;n!9yYdIPB0f%lI#b=b5u!pjN@gqPpHHs~kby2mHVQt}lH zixaQ!74sB{ROVvMC{~IjQ5@;)s+naCwpR_b_oE&eCRrP4XR9A9IhW%-TaJoja@LM* z_EsnTk-}-_5}y?+y^E=Q6Cm*U_-%+k8tQBRWX=VhKIUO_=j`yw8V%erW-W7F>9DTt z9A!|2QEk{N(K&Aqq7A9BT2YDS4a-m8{~ zTJmhu$ufb|PNS%MN7qDr3!XI0%NC+_--=KxKO$>HLXrBPQo}n){{*i8msI`pAL&5P zY_D&$PGv$|WB>xw?Q{s+=3KXcuvJ-xj}F3=?Lx|mR=x#7W&(mx)oM$?R$R9jwSX^4 zf$YQF2R0*1eu-cli(LLun{5f4Xh(if{f2D`lb?OwT<%>Rk%elqbV$MDOk3J|4)JiC z_0M^%34A>1x!IP>8z5vf`}K?&#A*`_+9aa%_U6qa-gaUV>O`;LImco%!{%`%T@A|o zJ1XO$Y_u_b@41MmJJ*s5H$}mBznJi(siQdww{d)M4}p`T?BEJdnTubUMl;LSmEl+R z^P^npa(BOP%pgNHV25`16A#A(n?>vAHN^4gSo}H!91K>(0|UFQiJ(#IN!RTSnrnS> zX5vV>@;_mozXjp`8XPx(TQC68)1<%X`C258B9b!1BA{f+>Q+^+$BXQOdGw1T8G>#5 zv!m%po27n;&+Clo96BQPK~#Gv;2 zF*!d{nKPd~l{`)l^rFI}`hctc zoPt^4qsxGaQ9#&v9Iuy~ttJ(n3V*4S29vJt!^lwjB1`g_yYGl0Ux%M{)+@z&od!{z z7585?*Z!Jf8vCGdv5m4;S~%BFH{EtlZS*9W|MaS9Q~43k0KV*pNC%Jm)&&P_5<3w= z@57%k;VYa2+2QwV$uNL$+c_FMikM;9EffUF>B=Vf{VF`dO~`CFl&=nG>x%6_9)(nD`<$iDmGh0jyspOR1TPtY&h3i=2tUUIA(+! z<=>ToX(0gY5n(HQbbn~f-i%Gi8zM!OMo3QD=FApHhT9~Vg00ez$-Y{f`zLOoP*H(k zmw@%V6k>Z?P3D{JuQaPtSj&i6gY3sBriXCU7qb>>(Y!cPgrr&6S-rUs#!1QG=pB|T z>8T>a5=_6fadZ zA?n(8k3Lc}xqIU#B82fWsTu|HPjclYooY^TA}53%HXUr;eUBz3=wCy9->etXk+$j? zoK&aX&X5p<=Yocef}dW%mnZTaHsJ+@A! 
z(dU;b@jLjF)#-z43Nl)54(igb{Rr>W#DX09dZQe|c-siwWV!367B9B6ZHhD^-~M9% z!nP<^>;8T8+}}P^YnlS1^*JaUYGSfUY806s7a^6cE6ct2wb{9hNjc|{UIC!wrPg0I z>PgBv^z*R#@pH;N4>zY+lZ#02WCYMN6BnC`H)B`cJ4RDKxj|N~(tTd26*WPzCpk0Aj<6TiApxAE zHrnOmw7^(VljZbGXRi`8%n?4ax&G0DrRJ=$x+3CE;VMmmr~p#WcSqfInV%bnumGrhZi4T*<_@>WHiC(5VtS}kWiXtU%w73D>~xkuwG=@Cc4pj(MTF) zrD$6DKEy)Qjrf{=@CBUNk@`%;QLH+D`OAYW^P1;=)M!K0E^cF3)Gc)~OD@@h zJD;FD_q9Mq%ju45qmDj=%!p#0J*`K-j!2dLGYygZ%5rwCp&V~p+C9P4o;5k~RnhZ; zzF~2R4w5~>i@mOZ#r$@$m%i$)#OF_`Ws8O{tWGuM=S^Z?|73DT(;?mt>Lc7-R6Yp@ z+jQ487Kma&G@L-PyJhAy$88)ktc)|6uc>Y(mv6NsBYUzi|6m^p zU7TrUewqbtod|8jTpoC{HkEZJE)_Vv-KTLrByQp!6o{!822Z75G5MqWPi0d_+k&LeS9NYU0HqblQWSBPqW9h-+5sx><%XoE$O?HH z;jBX4(;w_E0hC-KP`tA_7=#QTp?vP#%pxPS&}8ye!g_VB4>Axd@LBMuZn0+mL7hX} zX$=EtBy9K=hxLdkdGR=f zSg_I%zOr6XBwbMz1L8nO^qUg~guD{675TWo#{x@n<#X4KnFk%5`QA#tn}{}P75Mc63X07-vm1UFqU-*3Tm&Q*9N>PR3k4Uo1vgfyBT7# zy#N>j-=AJ{I8~R6q*3z^4BYS+Q`SXVWlL*VsP&@gQ@@u&4hF0nSB0lnYUh6RJ`wV) zz<;l-IAW?EeW#D~IFWOtUeuf*0Ieu4i%YT)KrCeM3@V7)b>yerdseno>_W3F6|A}a zaW_FhNwEz3a}UkkFe(jF;%dS{UJQFNY7vP*aS8)6emyyaOl_4z=#AI91$_mz3IV5H z{~EKy)~@-TdyWEm^+3Td4U~uhVT8rBRfKDdb9c^7>1`*QhkYd*Ct^ z1HfE5qo&B^cS^tlQnvN;(3b8=%%x-iu+QY^Ojcyz?!(7Gz_xiQTL#(ZQk?STy~(eY zQ5WK5%dwHq3FQWvK1p2yMr98cD-|6pckIw(@>zmbqYN&?E=g%Wa57;e;6#28>0=UYFew7rjN1P9-&VcM38_$eJqx}auSz3___0off9MV3$tI*nD zbGc`YJKx%e(~dMB^8UpQ*&s1U#8wm=ie&|NEapz5 zRbndD(NjkN#oWBJ&E;9&Dn~Oo_Sb#V8m*j6-RI2D3JCXGVlh4f^jdGs%y(P*&z_`5 zov@sLCJ8XM`DMoDmgpa*{QAvoP1CQrjXVbmAwf>w7C}i5Um=m zkswA_e7C*Fvap^*^1;4(4Bgmv9e=f%)vm(2)G3shjBKtIYWfK|g`e^)CMo)Iwi#v0 zQAk<&$dHxF(2i@ibAfhD0fG-UepsLDCN;uqS4j75*R#fXxtQ>-k3^WFA**W{{c&U5 ztp(WKC{o#!x6Cq#bgK7N|aod>$95Iqz@fc{AP7%3_!4xm8uRdl35CYt<_ zt$?^7luNSx+tltgJ4cUirAwea`66ll5hQ>shN+Q<*N37oNi75^9v*x)H>Cd%^tYvk zIQB5!6dhivNS(A;LUm_Zk-$RG*jVHmAyf}M)tvx+3E{%%>>K8=zR$&cRc*Osio8y| zuGWQrDdYcvqsxyC!?Bk}pqpR3ST(psc4*_XPn+nF`;(oQK&yj1f~=hEpz|b$-oTAf z*}fM=UrzLm*vlvS13|1D6R&blqqmJ8S>8BLSt~u~`~@K0p>i+jxO4WIo5b;-^-eCw zUDESq0t-96i;U094=l|GAl&!|YK1|Hy1dzrH^6c2HX9?HNBRj}$_YN-PJ$jZuDgSh 
zGXbDLuFgQm{6m{n7+|A<$LOB!i5>=Mg>U$WR-C_GBq zR}2Oqt_-+gX|a=Y__CPz?q>H)C)Ncc$DfNStCkOGSAQKmt<`U#&lFnpb&F%hc_utd z%wllHLKQkCdoVmX)#kF&ZsBH;_Mxkb@Wh1GrU*+~zwGtmg+skiNd4Du(! zHp<}<>jEBd58TL^pL>$XA!s}1u!HYQT-6#nUxOx+^c7-13VJVqpt0=aHDXsG^P)^o z6RoHa@lAsBy)SRh%o^(JORWUHCMLJqFUigNV&ajedfl5!vv~T|W!N?ipuRJf4(2q~V{C1MV?u zj;99B7qoEA@BDFjl!(1c3|64HKG0Fr*s_?xpZuk8r?2$g4LmwedI>cb)R*(Q;z8KU z`xeTo%)S<2Nl|s@Z!VmZ;qLBqTQ%FJ(HsCeGJJU+mJjmukV!O{wtGqQEDLM%GBp`D zaFL`{!+04hxb;mFq+&~6o4&Ob@{Br?&WbKYUI(!>R_EI{exwqfv5m|s*K_{*@u(Ut zlo9ml14Vv=gRwH@G#0`-sZ8BsdHi=8ubK(63YFBNRHu#)3Y&67p*ju=Tjp@50Nr62 z&a^KAZ^pQW`x29vZXpi~2S)Nwlda^FP#v)J246T&lGU3UJi9%0yK)ncXdKe2N*i&= z!X`GJwsbkfVck|RG=l@p#Rh4J#8h}A1&XZGJ+ny!o1 z1tnImoIl(CNHutUwp`?QZnTb>+G{7YzU1kk<`W~WY$xszmj5MQgWfJojLk``fkxm= zPYh8QI}ZZ9$p)u>7Up=-V`5eG{%3hyOQ2u0^Rh*v9)8Y=(USYX#z8*BYtq)c89yEd z$WLhu_-sN^%hrvj6<^p9{ZQ_9;nnIA`!v|C+O3tD^pm56j%iRnTZ0;8zS6Q}cc1;F zld@52a^S2MZ5QFFNZT@l`Xk#z^t!e-c9tNRs`K_#QO7|eHalpqP$YA!Jo9a@2~7xv^f_3gz>bv$^{OOpscU2e&BxD#1hEXs`o;l z;#*KGa9M0)gW`GHK4E#l8ZJDp&u!RKYIX7G1`5vfP@x@o%nY}UB(^wy+46JE8V~+PW^bFQRNy!lZ>(_nCa&1wR-I(r~28adURN@F#4Jc9W7; zhzEM5TgEZnjETGBI6g zva;n;(=4>AHbt{e{#s0Ou-If@)rP&Ybnu>sT#t3OsjcMvHqf8IbE;3QS~^0M3HoWu zf8<%@klptM*{CQh`1bYK@YCe?$eD4ikNkwazIKyWO>Ijw8(}dvR`y>DvzkUQ9P{!C z(+^d3-gex*Jp@+IJ|R9$-G|i_AHFi`eNiR^%xKuHt<$l0!r+t{Av3{xhAU&pzu78N z1+fE%^cYoVt%ZC7Z_gPrpReQ(`o243^2Fm-ca>=J?8q59TAsT~#gxX3?VTNl?_W%a z#dG}d*}8Z#j)B_$%3U>iB_q*6J|}9=zLgYP`Pokmfl^}{1}1(!zL60-3Ny1jSr+Aa zY=~5~kT}muR!lYi9c!!UhXT7`ft|OY@PJQVpCvX0DH?s>i75*FwTgNw_u|<2 z^FuORHWgp3k|yUU#qTjOB@n6ey8|x-F#Asoy&kYv^%j?)GWaEQP^s@YTrhx0*(Jnm zn%$QpbFyo5>x<&29B%al<;9_u+er`2Haq3=lBkL=qrVQ5iv?M)Q-M2e?KQW0MwOwU zrN;L@%Hh#1*9)@HTu>ujL&7tbSEpmv5Yn5Sw*$>$n>_KegWv|&WJx+fRsGZx{((!QcL6A5Zjl43-4@HI2I_@{U40IWmMa3yX_0biWexZ zMN5$4E~OMN4nd1k+}+(>i%W3{9^4_gyB7%V?(poq?>=X(z0MeWo%JCZ`J9p5_w}FG zoWJ=N-BxYzTH;GR2uu5J*}MSHV^sBkv(^c|Fn_TmoxQ-8F|CiLM&H}#*Ss4RC)ni{ z*2&aYs4rn!s3mSgmzn(hJ%)6mze+@xlFJ1K&B$SXEfK+d*t?dS6H?v$sqzGs$kNr4 
zo`^KE7k8zhNYchL8W+N_*C){=ldKD#hZ7vb|k241FfB7ZIK3!c7vBoJk-5HqZMiiEl3t4E!iYAs=wtXScx|7R*k_-}&zdDCKtUnyv+A67O- zA-^+NRtyc;Z5(>~!$FhVR>hI-C1SMJ#F{1_ob-jfflk;;x91L90zX2HMF!ES#4iIHwkDqHv>XT*e^ay=N*xRA7^7io*Zw8 zQ;xeV=Hz{;S1M?xd10)13@?;k-pN2=6kPtuYUVGqJ5Efe18~TwwFw=??k&L}4Qwt! z3Aa}iNwKNWjBR3E_sD9n)VyHN^5s@>S2x5RL;t`Ya(b?sVJToTjDfVKRm zps{${cw8)DiJz{h#0ip`7|>$N^1=uj>U|5^xoTRDUc;D9->?-B*aqX|Op-z_>l2cC zZL>?69?=XH+71?6aiN}it@N-4)n=}CB*C&(=T42OYWL2sLVnrV+Bmc}b!?bEVNu&> zzi*$Qg(9|5sFTYs#uSrh<_ds**XFn^Md{)L8PG>gqRD}s9@Qj#C%>eS@=VX+B`CqtUix`=fhCwd0DPy2nJEH9rEh;IybiU(bWy#-yN~!40 zW<S$k&52zg31n^tky6ShGtMEoFAWs00u99wP{pKjsrCWT1!BFhO{$Sr;161ORnp`z9tO=OQ3_*Jz`AcJ1>$#wIc;7sr74p$i%?Ph(W zoMr3^XHU9B*kPsk@Pu!*%x7CIOOJPSj^Jrta6ywb-_a4<4y13mujAn{=?jQvn{)vT zEx%7kWFu2uwp6=a@|=Wkmax#|bR?_=;fqly$eG+%K8FN;*`lomI80dKX}m8Sq29vruaj%4oP@ z&*AuhlV!^we9W^;qHeq!n-_|LXnbU)vUhd1U`$e)$p@DZ4fJgPts$a&VJ4KS2R8`v zbS^w~^>A5VPo#a_{vK&aO8B4=aJ!e8w3`mp|HAMP>X)KrabNbcS(yK>RHWxi3T-s} z-+LmmQ<3JPiA+u_Is7pV+)r#H(H+tQoQk#TvEQ-pqY#=7EDU4K=VGd1Q#e6Sx5(No zP$G7Z?MQtT4*qzd(Jq7ab69~krP-lPn?jT5&A4BRJ`6JUCYJj57WUbEaVrU!q-XsW zlK;Z@MD-^UApgJjjQ>M>PJBE(LtfFi`#q$KTr}jSw#1^}r}a{H=L(mzM^rV3$(o0v z$g@0!yMn(qEzR&y08C!#K{@axyg+SkGSnF~X#@Xyk{3iOEi8DM@%Z#cY|tMIKeQQ{ zY86wTPWu?a&v2|>{RL?7(af&Hw4~9?;CIbu}Ch!G<^={|J-(>>t`G2)LL815inlp41OpE zO4eih50xNQ?0Y>Id$^q$Ecf)0kzCd)%jTO&{0&@x+&qKW;~YkTECOlWh1m5L*wJiL^EYaZv7-AN-=5!u5n?s z{Kz$}X^x5*ng8qNW}BqlU)L7+Hi7=Efh*>OD~}f&9m`SZBjGJ#8kB<8bZ{psKI{+)#z(%DD2J)P*DGj{ znRfoX12^9L43S(rGm$dUm1>hvwiuzMfqR;=!6TU?89_B*LIoeBm!DHtFHjN+HOtBC^U7v8M+s{dj zYRZ0@jP5qLbaBvZQR?UWVt%t5^~}Dg3Z;Co^do{Tb(t3ZF3;b%HFIlCp+}tFl5=z+ zpp9d#NY{|J8~IP^krY{Fe4?`VxkPW%e(3Ee)q3t`{VQ2_tucuOGKBFbMA89^R#;f# z*n^Q5=S!WgbBo)A;f9eC^>eq7jP*5Rdt=UooZwQ2ENA)-wB*AoyfLQP(Hs&^H5&#- zFxOs2lSl@O^O=*RF%C1fI^m?X~_%+6FhyPOUt&iaE6Zx*s zn5RY`Wo`9Km<);wR8PXiU@xZGYbaK z7kJrV%QL@g)-Px^>uKwIbLruB@MG0)(B%r@hXVbO{ zo}AKfi6LGKQHKR{;g~k#0&+i`wcBi-M;=#L{U(VSC9S9slxx`l1`aP+6QvBVq#L}~ zLq-AnAUsPyoaw`R)6bt<&q#I5{eEU$E>Q7qVeMm#kH=cCNG!bdDd=#|$$lnzNcSZ) 
zCm9Sp_p23h!i4+S3Z}M<${9UR_QuAXoC@5i;;tQ(sNX;^f_9H}HZgno0akyf6 z_I~HDI>DOJ%F5HB)64`=q|*>h6V29l0ylZ)sN}4$mAAk`_{1ZwTQQ1~(3|06VN&WM zu21eS?&0*LU~xXe2+n#qa~>_Lh(5?i{?e@94^>Fe?vJ}&n6dbSFFU+~v-aZ$TdbuYWcrtgZ-+ug{rMksw{lMOX zt@{sLMCX^cF_F)gOAkR%?DtmtDjy{9bQC_W3ikC5Q=7`M>9TU5kOk@6W$b_*9&dGh zFfdZkC-i19x%q)A%c2|H_;g+ksi!(k0YQ*K%v33C4LnK<^D5a|2PLbfmI{T1w!@a$ z7b@6j4)Jm98>FP<%I3h^?!pVqzyu$YcbTrCCnI7meV5*bg%)QM;j3+5(|xJDMJ^|! zz}a%z)7W9Y09q!i%0aHW>XjL0Mt8%6k*b*jPb7?XN3n4y0k_l13lRv7^v$a_J6ic4 zx^^+8&)sk+Fgz`sGyb^6KX5Q<@Yf{Si}l5Q40F41MJ~P^v!_q{NZCh4mcI8hTI%|H z=5q4kM3GshC`Vi2%N}!{jqLW5ip#VGIn5(F*R=tP?XJy z4w$Y83prkK`$mObR76-!hj^;xL1{ByPw5IYwE5RPIicm3dvcP6WfO%}nTq!=6sxaE zf;|YLO)y3DAGi~et|u`I{rs3NI=omu?uXZWZ@Guug10G>#lbHRK$4RJjYfih;FkaX zYN61hip1P7OPGABeBF4c&i^-7>3l;3rWvM)TE61Ny$DV^0Uj7Pw@?A@mLGSY zS7c{{ZRjc&Cd;R9%!V#Xh)vxE+)KCV=q9xb7ti+2iDeHDDY`uFoAIdefA85Tt0~EF z)&;V0U<>Va!fY9#oew!xW=WfO?L<4y&8tBtHm!^Euw&T8?-Hda^^Jc1hBOZIudWe-a~E@} zY8$o5<<>_m8Oq=SpTFZ2rWbxtI%RdZ7jMmu?zQ9P9OMQ7+%Ob3$1`mrT7GL5C|MC8h#v2h-6~bZZ@o zB`Jv}O33ODl#CGXkc)~2SyF{EkwAx`6h%vi&xCfqbyuc9GXmhCXUgL6{*rtXCh zb7AD#lZe@<=eqH*;Wt-{Ue?IbIM2Nq7AJmIAL;M;{;hS{xpmD>U}EB~ zjKaC}HH-CWxrMWE_1$iiO(q z^2JRjQCssRD+rn_5YK`zNP{l%EqM4^5bCDb+og_QoIwee>oE4kI)gSc?ep!0!ooSr zl7t8mF)oyL9uj=#d>zJ2GTsY~ty8xTMve51c5&4ae-2x`SV$Gk!L6-+4Ed(E%&70y z&TRMj@gvI0&o=-Mp$nhqK@X~WjF+l+{tqq}b$6>@FDrki-t#W}wP6O>a{vRy5%p7a<)I#7-VUfvthu4adt(;e}gdnsT$U@;cq5ATTr<6fAEaP)+ zeAN4YILfjA8{ARe&7Vkk*y_n7<4ly|NV@R=D27EhNsEY(<5Mw~&k-Ii&-bZ?9sbp- z?O~dc630b)Iabv5>&@%RVw$9P!}B*Dzu;)6B>fswE)V;Lzlkq#yDxhc?oWsRa?J6# zWnf2gvml!!_R5!cdGZiTlJ~spOuU$kakl&LJqo?jP4_H3ntiYY z(KGl?WDUg!Lo`Ox=$ydNm?v3 zT&#PwXY#f} zPetb1fb9fRCBx&-CoQ#5A*8)B!Nlzo3DzVtd$-Xxkq*vCXFWC9OEy~gsVC35?v;se ztU#rLN{vu;uJ0DV+!d&_|9FUBZdKvNerYHlx?M10|*rq@ywH zvx$R6e68QxZ@DvVpYVq=I2ck7Y2h`4iHP759_MC;VsI<#l8yM3H>y#PcuG-(#cs-H zwaRga@LNdLvf>4EwWo1D*Cj7pYsXxMBAC-A#mn536*b0e5Su<`BTHg`^YO8i*87z< zNX?syf_U}$Gh+JiWY>C$kBqW0Z=G&D^j-o64Y_27`^}WmXp`!K-s0(^XW6yB`ofz8KdNyV=B$W(! 
zjD9`{=}MtlQ`|FB1EM(=^2|<_XJvp5KZ9p&hN$YljI`{fb4NSL?WH_UV4WnyEoAdE z9etl|3~`IGrN*@}C|5`Y64wS|FLF__dmQAi$CJVFLH~j^=WO)of&L%& zo8Vq8Pr;e;=s~Q#qp>L32TqZ39GpA0K+Y383ZtX1QI>(o9Gb5d6_|5PMjBzEOv6WP zR+hpf%WCzb*YG_9$bJHmy(3~UtsVjJK`Bh=Rl@jmK2xHVz;b_Hn-h7KW$2WMvd9uh z7`m&~;N_;5Y315Q**`FE(!Ts$g})I@nF#X{)$+pL{)bfQQd@@D6~Y{2Fbz_A+}DCc z#hEx-`WX!`q8$25%#u*&GO%1%8_}brLrWYS+Jhj3f~w;Z>1HK+Vn4%-7WyZ)`%UUHzukBHB?vB}N({MDXvASTTWTBs*=PGVgU^opIonM(4c*;%oZd1o6#O0GzJ zgRlE}eGZsBfJ|p>*x>uC^ikUMQGm?XrKyZu@#6!yl_`^Oa%4J554BX{qra|gjO3Yp z_w612joknri+oIFa7_js6)OiQX>n0x{@Nr`^TZ&hSs`~wBP7bLbX7PKU7Vcko$hCA ziqOhH(|xppSt|8DL^t?gcvbVY@o4Bu{bfK_ps?Rim1^~rgA$g4C@zw3`5WXf$)e6w z?p9a$7hafLp3NtX^=N1$^R@pDDV6{|zbP*C|soqdzOwF73=8QY3s?hvuEIESrMcEssmm~t2W5R zwWm(LR7Hkx>gp6G;_C%Dy|Whto`z-{7HJp7rrY2bwfnh#R+5amL4&JER-c0av+xsI%Q|{CY!&V%|u-lg~=L&i5TNK=N?n)+{>_rF6PfENvEiEF& zt!Mxm?LtFP%r8TR_L|~q>GU`jn4y~X4kehOMBr>&UUyuS-AdMhN{0G z%NK{nOMPPsV~3gZ!{*mAJrLLYd0c-z$f7XJgh|_!! zL&12z86COm^*a#RIXZ0Tbi(S>sab=Ixq*jL9>(fwQ={cL=p=D?SGcW|d{Xw&63n2* zYLoQ08f-=EkJRqL-w4ZaD= zAgBM)9;r+PmQnEoHI>*z`dP&)QX>gGimXF^BZ(C`r_dW|a+$nq#|TnTFJD=kJO_M; z(x`imGK@V_)N@eW5R44ui|WceV3$t|-k$|Imi|@E=W1fH{Y7SWUC!u70K4{sZ9qX; z><3ta*K(L+wrnLLU}feReiI{crp61NfS^}T7p_M_rR2Zgc9*gvJrvY-@PbzI=Js*K zz$-s6*srd}dv3_)c(Iw}h(i5Xq&9x|9Hog2BXp^-YY?#({n%U=|nWWaP%(qw5x{IG%b4^nl1YXe6| zraw#-Y+0+nenRnWrbJgT3WD{2+f@r6X%(iurm#R&2x%~Y@txhJDz9;7;SQ<~@JU{lMqWmK=g-jC6Zx8uRV) zL950mokQAw1;>Qk){+Gf*YA4i#mh<-+aS`Y5Bqj{?2*$rZa$K_7B-|z!!&11%~Co1 zm8;d54{rURVLDyHL8HdZw<05}G!@`i4?borPVL0Ml~E$yn0Ugg800-G%x5cY3V`!AWl{|1PbcPa7d@N3~orl_bd=c!%9ihokb z>w<44L3^iTogli)N~G0{FAr?iMyFKm+41-au3C$O%h^z-k^7}z#!bCA$R9J}2nJCr?k44V*D6D<^$n#eqztM3uO`-)2^hrPxMe9GJUK7%T^4hACj&{PD8~G|5CTB-b)Wos$Shxu*tmp zexeyh%r}U#(!$=1rkSCT{oQrbiDXH;9K=v|ACg*Rvi~d2v+imNd z*ouNQaYdTe!e$j|%lPV4*x^uL&D~GGZt*1bY&?u+@0+hF_v4*$(tNMr%3&^O=kXN7 z*@n;pG(C0t==4MBuSbh;M~F-x>srDYfY2os+AU2%{29G66+)dXX!8}GXyW`0%ouGA z=!O&KE8DwTGPWPA9shb}tc3F$rIxtFbv`ZQJ0r8DD~+8~H{!JntMt1cOwhBS3|{Qc 
z5f1*Lj};DhRhnd;Q2H#T%oDTKogw83B<*JbVouvOT^HkVyl&L&T4n6lo>n zSmuY%BgOfK9@Y@h;2oGQ;nt^DzVKM~OGzGJhC^0GNrUF)YZb#^3< zCE&5fn05BhW@56(qY6m90r~Ik#%{tXhz)pDl3HYZE^gpIj!`}>FfWX=3MGznbvncL zRK{7V+su!T#n#QpkU?1E`wS7I6sS_XD$gkDkKi)or4B(P(0c=S@N9(lI@Gt(y5~+T zQ)6~4#ioeaQWIjn8!3OWp#XPq!K{emG^nKJHY1`b#)z?wRZU-}c;3TBzu{NXpZ;=E z4_Vojv>82Ov8TqRWDzlg#uaHbCKpAd3`}W;VDM5FTW&Ugp0uGjj6k?gBYLsqN0n5g zA`f%WTLv}7M89S#oIEM^(2H+XaB^5%QwUd7RJrw<=8AzCo{L)nFHwa5?qV=IZASOe z&hAR_{nverqWM29tq9(3N8etIDGVik;V;M<8v<_4cOKB3K^Hn`1}W&H`ZB)9g5FX% z>beC($zhhPhm|Y2blZ|ShV-4(t*GIo`?P42#i<<+zlO5Yy#^ghhJNCTU<#6)bEk}} zfM}{CWPAR-dS4Ix6{5ZrK7Mj0AWDi_rB!03;=Veb%V65Klb**|_Q{#fH_Q&L$Lr$M zf$uE2)UY=WBSs_J!wp3wR%fALBZ7 zqxJ^A!qDXS=_((MyzVQe6LZ^a3Y~X7iucBnU4#Uo2;W6|5pEC@-S^rNl%Kg0P}Btn zNkhEuA;<2gIsHT8%%w{((cz~0L+ce(dirah!%a1AgU_AkjYg{@M`}Vg4YTW}f4hlS z>}@W!?l**XuaEY816EccbH9}G`6ykpQnz-gzmm&LycYsulB519O0L9NizPBjMMf2J zWSdwR7a5abNQWPoXRy&Eh_g1{Kse{yj?bDu@zH-~d$A1Lii<{0DE)yP2jUoXqI|iI zzo^6NL)?wCXS!pj;8!+{2pZVr5Xh=v!U)8+m!j^Exb-W#S^ z2T%5)Z?4;(`&Jh~o;zN+QVT-v=mb0$C(JZa>?+0gd?dv(KC3;oFT@$2Xfq-UI?3sC z`Lz9)2N0#oeM^j_DHZjo&pTHApTBM55_cXL-itluq@-9}_0-EMIVDO@f#oVteW&Z) zhm^93g68I`%hyPk$7IjmVMr>D6#4i7itef%n=IuEgUhdnRy4hyXYwqJQ=eX#eSLWf z%_u{?7RS|c1>57Q@`+~SXnD9v{()0oDW;HF8H}#toB&v`+S9iF{|7&I#yrHH9QcAZ%f}x*c@;S zvLwxUsUwv9%&hRh>02!)UmDu4g_sLUk(|cY!LLi2*A(zFl+(u^_WzRcHR6M_w`_ea zmr1?0P*7y!l?$b(gWQpZvHbNoJ5Fk@a^{>LuA z(_w9?zP0uH$MDh`!w4YTObGF4)A;0;a)eack8DA88_B&rewtW__>km3aLgCPUT@Ti zu?3EC%3Uk$vf__6fv!+;06z|Jt1wac;QfFOLaZ1>qb=!PHJA~P%L6chLSLKCZ5p^i z>MWs>1x76k%|2@7K^y8k zg!pANcAd#gzQIYYJhx?Sxh=%BF?K`gJ?1q<7QANSdhgMSt_o&sA(y4EQO8kENrl3l zovi|`{%?C|hkSt)^IiI{@QIccousC}_)10d>1x#;pZ|RFjIw$kFfH3Gd5x-pp3}k4 z^Yq! 
zq+f5UYKJN>x~re?Qk`QJSN+ECYi*YWr7&cf;OG9NOdCK5;`-rlZGN4z50cGS`F7yW z6gSX;H*CQb{5_$-nV&((Y$dDl-WugB;>9Ar5bf(InM+aZ7L-)3OXP(Fy~M1gnyk*l zGHG!4gj7<&s&Lx->l5ojQyXA&AFcUCV3C-%NdrlFxSI^?mOClDc+4MX>TFX33-B8p z#@r~#*4#3km?BK+bV7;bK0*0G8?eTwlQ8Ez?lvQkf)whYkH2)Jh;&qlxtqv=MtQ`v z?oA2Z>TvEbD69(&0l&F3&g#++F>zwB8 zh?iOl*L@p-y(3b0601O(7-SiS$jdIA7l}kWj!AL(a4ey_Bi|r04Lv+eV0*`W{-j=| zA!)W>_=|eUAwdx@sx6UUh3fT}xw~qtIhWB>tx??;ripoBxXLoxL@5hXBUlYzQldQD z2d3kiWgh{7Rn?w;qilJiEHkba`ozdn=^fZ%pWXehA0gv%@2t7;)poHL{F$f7(v7A` zl0P1O3D^~A6>S?T)=e(8C7EEccX(ITe}Yq`(e$o?b-M}2#v17*#J)9S=_t~5slXj# z3~X}rzr2d9Fe1@W#oV&+{e@V2`U;pksO=$+Nd64dIUowIL}>LGH3M6~^yC89F%O1l zHbkUo+&0eaz7lO^@{Su02|)OrKfInuitz~pI~Yd9ca*Wyw{(ca{(=!S#(>t1H@T$& z$i~4+`pyml{R#iD+v=53;bgwszAvK9gM8Mdz+}tBE&XQAmD60#JD`|4$NIWM<_7KH zQ@6;F#r~~&K#9_O@!+{^;>Wi6vNT|oFKm@{5hZZIb4b;rICm8r0_7>M8rV{BLBnOD~J~K$lIqG#uNJq0kVowF5DD* zafDb#dZR0*x_BLggvgmO3;T%?+DRoZFSW!z8z++8cSy7et*u-1V7?-Su_5O~M#Ka@ z6=Xi4nOc=M-E!S{Y`wzg`FIQ`4|@}Q;^F;B@V>$r?^di=!1r=1BQ-^|hoTbgRvj7Y z<|Jgk#>6GW{oH|e2IcyUA%^V0S2Z&}k=q+72;U4rLn8;GMUTB_%<}nxPel)H?Ob8j z0I8p-9}%1{_uk_D&xt3QqO#$l=(-<^s+;wGuBmR+>OX`1u~hjOE~{crs6*DIPbShQ zut#%?j6M>oq$|iqvkF5TkH%FyAw+J-s~Ca^ucw2|*tH-z^Zk|HZMJ;U&#+yxY{qBF z6P0#01*w%ngNXQYc%i3xw3Q=mX@B@2Mp`qt>z|YUFD{ktln??vx>{=9dc)oea_kMv z*u2P{n>{&or-l!b@Pu#?*A??uQ_Jx-yeO{R-bB_RXf>H#e+!eM}Zk?c_) zz)=`RN@f+rI9PBv@J&hHvma0auG@-4ey-x2|BtDSWj~wv&u6Ty7D`G$-OcdnsRWzH zbZJWrf=rvyz7WFVN{fSEpZRf*p!$yuLY}i)pK8RqM2Lriuq7A*DSB*#57d$De>*N2 zU8y&1ovBK;m?gaCOsX+qJRZWI0l)Apu>0iDnj5L^{gyApwtQS9@cEYIksT&cwmMGA z(n*W{x4AO>m(K9Y%9hs07f35+UtZ;XS8#t5@ScvPgq}|Q2g2HQWNlH=?%0xi-JfD~ zb_AjWhm9~9ItPI>g`rc4IiA#l38BxY8}+u_>Y+F;xMvz~WKNnIiXJwmoYFuo_hpi- z{oL7i1)+jI3LjL?4RG+?rH>SKUjg`n^=9Est+AmSIzF;E%u_FUJ9b1pcV(BBVqjc= z-4GmwgKlc99c!VNS~By8D#EO+ILn1Z(&l~1xcf}cIPD30Wmi*`YL*kb9`eRqt6gnM zPP0!^Dwb5eyYTkL>DS>f=Ez=MuFD_0X4MbE{H43CI+D7bmke~T@Fz@sGrTTU5r__2 z%vUQ?jnduNv@!Q_w`D;g;MjAOtAbx-Z2Q%+Dx}W>ij2`P{81hWs1yudue74!lJzI= 
zH%7SxC4b4!Dl$R4Cscj>iqt)Hnx%ICU5+;wHilrVTx}dqwTIWY={j!QeL~NvOYPCj&wkxgDff$Dk=Kb@kgkVlg9DHJ`ut}VEb1qf zt%^Hm=e2fF#T{f$-5HM@y5)L33?y^(Irc@p36A)25%DsQ;actIw9@>mC1jr7r|!}2 zYM=w%b3W5X;n*axkA#|$E+{vP(r_c|;>vQ0KE%$fGK!>fZ${j;XAC7o;&DPzHg-jy z$!5%92-#iNY20{K|Lc8(GubMVBVDY1HmWehV%&pwWrbVm6KzYfJ^!K3Gvnt{;?}a! z4Ct4t0!}%r5Bfrk7UC4@in`d^qm6Ap>CMf+F?%wqc?{7v<3gi$9_}s8{Vm-|w&KQ% zLqAZjgbgB(ixY>1>q?4~STY7NYR1!C-L0L93N5_N8i7W_1AX-hwqKu&&Im6X#|9rsR&0AuA z()*jn+XMbCbD|{b;0Urz_0NYMM*RqqX2ag_uTLM5+q0W$`9vAFyMB#i*EHN7bkRI& z2%Ta@EXy=vP~_c|*e-RNS%RHe7Kl{viS;@k%O^yBxN;w|xCrGaOY}-^|LVi* z(d$shR1helA_6YJXp-Hw zD4|olxEX}1K|^Qulx&vDRIg2LL5yz`g_3fVrN_Ep8y~DEmYj+$Q6gRQ5o7yBg2`BP z0;U`8a{*wPMTIuqctI$Tn5J1IwDW5yFqN|2Vhaj0TD9f7C#hVlO?$ZU!G}H!Z!20p zX1VH?^K7-;dFc751j&}SD^?N{7wh`0Io+1Q5L$ba6k+&d4_*);$}}Uz{mz;ZMv+C zQm&a~PDP4U1TiERJ??m_=Wzgdmftnh@t~+Eipl>x2n06fDFuX7sp?%Svl6AYjHp}NU) zclWF!y!!}u9ccNyI?U3O&=aBLk0nVd?0y7Dw2nN;ie|9;v8eVdqjsE6XXD$~tzyl6 zYAQts;K(Pe5&U<7PP4F4+9qUvEO87~30~#>_Mg-ZDPug8t4YLE6q<|jy%tuMXs~I3 z$n+;x+D)7(=uE}WpDihvbtYz6Rnjk3rq(o(h=EkD-`w?@&lN*8)0a&`X!dI$a-~h; z&O7tz#~XyC&f!uEtt9&|96b#UgI^Vxy#4!7vKIFr%JeF8Gewnpy4J*}Dd;XKx8G8> z^m%1xM%ItSq)o{x85++7o;Ai+zCb7m4>Q`1#y#i;GoA}0#jiN_`4F1GKGm1ln-pQt zY~n9Brcv_q)9AF}Xocg4RKpWq6wB2CLNZ(OZau9@7QUarUUC6h$j_kIqxJVs8p{{4 z3;>97%KM^Ifs(cxQFf>|k4G>BaC?4I)k5Z~*J-vcg7y%V_b>pLTKB#h--#B(*YM^R z{{TMVP(PhOJLc^-$HrmOu#J6IWkTZu{;gSj`iQhR5D zPp^n{vA1Hd$#{CJQy`Rn$=k0@xVp}_(`-gf%|SLvvTmR!Ayh3WUaX91ht)Sc!}0)j zq8b0kpz&Y!)Ri`F+Tf|1Wf$~BcA`fOK4vd?1$IYE_O-|qV+B?xPnwvAknP>66JLmB z|N0sh9N$U@hhLF*>*1cDhzzp~?LB=XIQ!SJ)*M}Dhjl~nDawOJ4W;ZbXMZIPKT7FM zb<^mDWiX*{$^LQ<|Mt-}@1G@qjzHC^?Y#J3Kky(r&MI<2T0KEyy1Zi1e%iOT8A4?h zxTk7TJ5E|4Cc~7wkU>Pej6|>00OByc`HvTw;k&AlXwRx1c?($H~7 zC>%MGu{`be&)O@oBP83Rzbo)SB>vTFhj9t}+7;W8t+ou4fs|VzQs{T(Vf>Qi3(b-- z&LJZ&jTA+caTT3(Wx0g567mp-1&jI; z9nJAMkg@YWaNk$lk?J76%~TYAfIp4O)L)Vn2+t8zV}kA-dDaIJ8WnF#WTW13jBRrc zs!ocmTcI>BWD-|oKO;}|XPIPWsoh-&t_{?|iP#QuR~mh3DE>&P2;XPusJT#p~k^;AR zzsro^X;~@g8L8vdlQ#neK1hk{m;1qW(how<(OwfT$}KZg3OS1%f3k--ochC`XS`IR 
zQjcCc3p366lDY3IJ@{?w{B#|Wx=QhHj!;0X-7%pUB)6Ivt8oBHtZoZi=ThEkzf5j< zg+A`A!q_WowkE8emm)yoqYCrCZeT!V zDyO383d{XP3!h!G@#+Rhou5^{QwB47pC<2~ro&i|H)1;UH}fp94VUC@?}--L<|$dy zS`v`k##tv*P6r4yOi9-?qk2~Jv`xIbM^H+6SwB(0Vb8kXYdP3!5V&3H0{0(mekJa{ zMsT~~-ccz*z2>OWH|B~GU(-}(Df z_}cD+mFBl=TN=aIKpp+6DDpw1{zsUcN)0VqNV`4PHakjcQvOZ{*dTt@X_4(19T#47 zOIZTJmXw#XHbM;lD*;vP<$@e0O}u>x!L0jn@e19IiPY&n{w8Xn5J&zyYnO^)LqCh+ z)YPpV4>+TTAtk>yS+y4BB7rK{k^K$1es%dxe&BYiqFd2wGavY{Xye z_@CH^&8dI~(ghxsTf0&P;Aio3n2~OS4!`Y)$neu%3D-`kn^@S;>SztodPhpx3j0j&*uOAG}_x-mHu-<8GOcC3t0MGwY$P({biPb1E>jH<{S zI91T+7f73TmC77BPu9!q{NPx8qjBB_0_* z9Vau$-tT%93WObQs)qQ$d-jgtmgLnx-xlI~XB=|bQeNEgWo6F$OQ!}&KRrK3AmS}j z^!{nA)zhN<5jzDK~g&NY&S~m#I7>?Y=nn z&{ft{+G6dvUo8)U%G05 zO}L+K95(%U&+ZTRP1s6L!g+d_fq`%$^1~y#L2c<@4t6y438%x@?N&?kAJs_BRgt#C*M6~ySdKz(w|9q|cmMmlYSZ8qQRN1(=7 z5?O3iY)U&EIb-dgfqW$ctiW8=kMxh(Fmq+ObP23&@jtso{yis8{eRH+-VW`cM)u}h z*>nZQpE2||(x#e?;GEOO?zTA3gN8{g8K#IY0cwM2*z|2tI)Yw&{lFu9seL^WPM#Ry zPi4njju}a^>ReJR##`=FC@t|xi{@n51Fa8+u^(~r;MEM!Kt4WZB|07JH^YdZIVSeu z&>r2kQBH}Ct!Zby3A)TvWMo)PRBQd#b$2^k@CO4%FI$Qfv{AwHH&T+K?A^Q0kH_4; z>-t0Y`h=F9uu$T&WRn9K2Pe{3B0Hl&IQWM}nolsMBd$iRNXi8;e;ZS$U%Qa-X?*Ut z>f`jO8SN?Q;0XUFw(Ro&d)jnYd)`~hZOWDC$xv@r$=I&f9q`0E2IKXY3xl$FDtpt! 
z>tuj02Mk;h5?5ZkAMPl+`L4ZDX6nmOGt5Aw`p{o28X?e7ScdTl4q3P83OXIjjUgQV4;H})1Xa;B6KX6Tpet&=OjXe^18l@!LU??J|>Srr#1${U14*v(v zMc&i=3Qnud=&NMUECZQJA@2A0249_0CX1MDCsVo&U#J$+lAG0!=indfEv(b zw6srr4CEOn+ue_@MfA_R!T*RS+&CJ3cWQ)3Jc@7}2^Jui5E@tudhWi3I4XK&H$W-V zo9xPveg;xIEg4f+=!PVtmc1T4kvJ@!w9Ipv6Zl{5{*w%kwc8fv+a*NzK`{3exr=w%D zoJ+dTtsLs;Kh+T7EVK*xBIht#6t?Bc{r;tH4{|)FD191ZE3&2#tERgcj55V3YAmDS zXF#WM^dsrowxiUxU=MHD{;ir76qKS5tB_LnUc9r>VEqmjSxL3|<~+AryJ96U9~xnG z7qy(inO%{a9)5$rnL~FuQ38^@tKkovk6Tl{CiBeLiz zIdSy7Upw*i1TI8ut)AMtXoe02kgj*?i7ww(TTH(Q7n|qO%5ctb$k98{BAHwJ5o!30ux5E~%2 zogm=-lFW~vl#(*tB`?=$R0AKU=Wwnbl3;|B%Jbt)$WgK5`*lP_C|~2InX8k^vNW%U zUA&qbEVEP+F%=HsEi(|?DL5lub4Qq|KNV}qknS@!oN&nPZ6$kGfUo&}J0+bEX+j3S z((=qgz=CTCovyWhXjxbR=_@hokYV>`Tz(0&2J&s@1L-44CyPHn4Epp^o}7qcHg%(j zQC^%@E{OLN(`keBXCvhW%}>-ref&t7xrF??8Ny+dh9N_WGq0%M*Wui%|Kcw8wh5A;;j0dMg2<}-6!B}z989*Q;-=DIlmX+o zwC6dmCb#2yzTkKZHhUHNHCa?H%M^%TRU2%Yp)7*kog_*3@Sknm%q+a@=s^FXA(d%J z!39>bNpW9dL%NphG=H?-2M~I)TQD@xVs>1mcXyMwjQjTVQGAm8qiGFU(I20Fddv-V za0_+pV>GG*yW2wNJ>8@tUPK=tFbdIV-2!p7^A7yC+}i~BS&KoMrNw0u>~QBU7&fK! zP?dJN88O5_mQoTS|7md#0;#;Km$7gTV%RUK<%Vk5C9H)b> zWKR0>t4k)_9b?%6OKJrStG~Qs#RrI@5f>~ftmF2{sXr6ue;-4FRRvp z5Ll>NK|YciO}=LDq(L(a+G|BSSyjA`{R;W<`-?ygYCR?BgU5lKK(4W@qg#uNj(DFh z`%>?HYExYsvUDT>;cAnjxf-q+Xg-IF@VwaZqSq3;Tbc3oyMK+67mNo{tw&lKLaH%p z$A{Gd6_wpjtBmd|DZrcwXn&^`l#qL<)Mxxly>tFkq6++{P|bu{rpJ%-$a| zS(8aJnas1+dhYwauHRKZOsM2pg%3xfjz_Ssz{_V&QlSHbId1ZkJ;B;4B#tS8_ioru z3uVsDGzYW6@59z&qN?%3d-n54ru4D`Y;0^7pV9A(9@1ZPz4(H8yeJjNkZY~@Q5l5N zbK07#+E1WL>weKpJME2bmFS5{6>~fj4fS5G zlW8hEB|mojok-~JSzcM~h`8A^9;+6b}muD0~Tm46lzled=(lCX7S47&Ma+MNKzy^_m~=I z>*wHYriucsNGpM_##a-z26a;(>e|99X;S^CI0Tjh02C}BTsytYNjcRfb;_|($ym{0 zI&W9~$%Mvj-}gCPaxt*=U??*)jJo*Y2X6dqrGZXgz#eDDhi5K~xXj=R#JX7gQv*CP zy$cZmjYojm$d-479YXcO{=?XB~OGh=uvIY?H^Dvu*T>T5C(|yw4K%dmIZSA5}GXBV}m%RF#Ey2~2z*JN&frA*Hm*N_C=UXBcMg0RN3#o^-ZMpa(hgWZQV? 
zzWJcw>+xLX*Nj5vk0U7$fFff2d3G zP)=>?d}*vzzp^=7c<6bCdiV}s!Q)G3B~I-bE>tEvT2sz{QTF7y?ura#J=O#2E9xTl zqq&f~Y+@f6{jBu2xC{)>FUS5=E| zh&hD~Tk}M|oyN+#;xmpg{H{WK2!_smxy-66v@LI14yjs`NYVNSaERNGd(0f%Acq-N zk(0n{ahYbW2&(T}g5R#ap0s1QI2N&F?C*0F2~tsN_@)Y)lJ4xJRrMdGQQ36~xI0r~sl72`b9quD$~^#TI?R@!U5 zh2w?HGA)DI0G$&jCzw0@q&?^fr>glyawi_+flDs#xu36Qg)#Pvl$a1X2B5>|%ot7i zLj++8MR}-e?pFb-3ZKqGkI#4;LAKtYEB(j8caO0Nr5*U#a@Wu-G1lJtj}tcelUWS7 zeV_BxG9R*YS3Po=pkaZ0P|JUSA7>TgIEL&R7_M@IyrB=lpyBsYvDF8=+i8=_YwX`q zrmcfjeuUID8FnaxJO9QIFaz^mNniAFxt)1{<^qpEAqp|h+2HHri!0D$RsYGekb0Ol&oV8{+HVB9?8t)SbBEli6sV6&gQT=UO|zp89dzO%bE? z#Nuq9RkSZz`*bHGEr|JDIX>eu$D}VfpB4pf*{r zZZ<@#hN59|4-G%@_53^JGn?Xg{=MCa1d9;HWw!TbAkWCeTc^a^%-@jqmUwCGhKK|} z;oP&ELM7M(#u{P5_@A8<#LzQcQvhbNUH^2`ms|c0_87 zG%onuDp^tq9rIqthNuBT@I4GL0lv-vyfcWl&FRbVfb{?ufs6iFk6J~{tKBwgQM_5T z|Lscmiw&@DzW#>TswFYHUEc)-nHEIjH=GCOxtW_uo4cptfKvnrCnr36oF5@iWX!mC zbRc-FMd6mcd*@DR?=J&Ms#VSDGvLM4+nY~Pm;D-pwAGf}{dO*7SxKx3MdE^J}*%e$8o zQMBeHeT@%G(jPU*Q!UR~=Gt+g@yby6%G}CQ+xrfVB-|f653S^PmE&5{-ojGvjAnDU zJ5+ZqS5@-FO07hQ3p9$Ag>4Hn8dve9_(9 zz)t(re0Fjy#V$#D=f=+xkv95O z@hH()*yGgxjt6aIKFN`#dE;T82yVanBqz@}wxuauBzMGJAte<4nJ#Z!1c{yFwlZVR3b!<8|Agc4 zCUfo&fAOAWqNo`k46RKnAf#heyJuDMcXtMrh=ms0^3V>OZlqC^_kLt(Pwo5H%}eA0 ziK_@It{_2eGhmCJkXBQh6yUqQz?oXD^gdxi8=fDW_85^rpHnJ}RAjDQm(BX_Qh_-o zo$h(YR|4FaIN$wxTnddcw*=XzX>*iEJuKctJ|`ho57#+KTF$IUT8SaIsdObM-ExGi6m34i6{>=*LT?mXjoY z&hs;`1V0z?U@un0gq@PLzTb$dvbR%{h`d^eTvHzrZE)oW^M1^Xcbkw+^7@cH(Qffo zN?JT-VTKe1Y?9|g!azV4{>xM!3Y&%ynaj5C%;CeqD{bCUMc9qAO>zdI$s}hL(YA<` z4o=p?Q4h|Y$P-=<>4AEC_uqC-?@A4)OkYlxZ>Zn~^14BBCB!X>)p9umZi~L)>ik{c zHZb9D<)m|8Cqn!D?`Mw(8=4bx3zK27-p-Q;WXy(~WT3M%sdP<9<4m0WL2=bNy$iuR zx-W>Ti`p~B@Ur&u_ouV)2D*Ec1_%X*&b<$j`n-iJbLxjkXNpCXA(_UxpW>Hc_^CoT zSlHt`92Dx9P=ozC{L9`dyLW#N#+Ny+atqv3MsO_*%Q>;lW&f;YIo}xwrhyV=nWf6o zRJxsBrg)N^K3CKyn_DvCtaN3xXJL3TP?S}7OmDWT9Z!&zjGDaUEym03(a+tUpn~Lq z2r~%J6_m;%(t6=iiW`}`e3~CE%AbGj`w7jrwEkQ;_dZ&Y^HMC)618GXOfyFK@y% zbab3Ik?Kknp^+I`=1@~rKPiVor{{R^L*D>~gL=uz=W(?*6-?qwNyT~kC 
z2-P`hvA;|o-kFs0NUm6NnwYwg^$tC|@R@9)SXx>AGeW!6X+%7tldW(z<)~%pd#b5*?a+1_3XAP2ywOD!B$b z_niwNT3;16|3qzTOOdsX-y1I593iel@bIe;%+Jw0hSqANR3WEU^UfbVqH+JYkWAud z!Pi^2YQ~@uWLQc6;iHLwdD^Z6%9Zv7wHFJ0$s>UusB#;H;Yr4Jty6e{Sy_Q>^McQ0x zg@^l&mOJdg!#2}!h2Ie1FBmtgslZK^F%D~Je}L1~rOXhFaEq5nxyLLi!3yop$TDT@ z)3gwIu^a*oljssD{z=5kvakq7jKEGzt3d?Zp6BTS%2i+ zxsT2(fcwGmFNldoIC7VlJ%}`=egIzzzC=hK3#FuDI?p(`h0?((l$??jfdbcaV3FJ2s8CY6v6SiFMbkNS?tD;j*>NS6SY%Vb3ja*r0PHblo z``-RwiV~LQf+tL|ZC<&e$g`jLfz0&VI52prn?m5kSo|oG5GSaWd49yvJ=T#?mLP(J zEGxdi-wB(5l;H7L0wkiHcuP{L&+3GhE$Cq=z`_l_{|BI*cp-xFi9_ARtP=++mL3&2 zZ-VZXC32M-rXH@hFe|&57PyfJ`ThZXs^#*-+$Xt0bpnMS=+a}Jy26Z4(@`@pXg|o3 zzcdR|eGvkSg>|x>h?2Jm62*VoB}38tRS8xBkcZyaL3nO7E=v`6IGL7LdM=zwyrhTB zYT^~Te%e+V*Qq*ev@=1`4rOAu7VQ}wZGMi{V~B<^`i%t$Tca?3 zWm5B1&IYi6BOv5gm_EM@JWZ3fJ5~neJ!bv^GH!^=dA0k!iO5PMZ}kV>VqO%xYFg*m zcB{L>H=BQ?daubiLi+vOct!(q zRpaPooXj^ve8?i9aEnxWFYC~uKA8IzSxqpo1y#fBn>wQq-HBdkgJJUr$&SH0L9<$Z@2B#*#0>4+kF;;$@+J!ewn1Ud(XaB79yu;P%yva7*-U z_UN^)Au{}oy!0yeHW^}Y!@+{70aZa_7I<5=>-ml|EHm9ym^E+6GuTg$*X1`cksJ zFWp#v+s!9xj+ukC$G#Hu2b}PIa6<*Sr6rGLcticXL~1^WG0xw8j`UsKwaDXD4JJR? 
zOs3M^ek%v;VoV2Gmf%WMD&zNhHZEu0Fi*r-!g&`-o$ut}yKoAW0FYt;-hVh-^k4!m zW-0pGbgB-NFV4;i+7W4KHc`!YJ5l$_v8YT*11bJ($`d(}a|D|+5;St?qy0?|FJ0dY z-4;$Y?Da!k3TY#dW3II)ij$E74gI3@^F8559V1)FHCR}pgCmjXi67$yZh!ZJDSW8T zg_|d5{i3Rfezl;I089vlBX~r*vzjKms1GFx*;hIGL^qXDkVRLg5f|~C;^VuuHGg}X zt3Y5MlNyFx2*Jk}cm@(&Vx)6a{wdk~y10b16%U#1Fj$ObTQhI4{C4;b!<=fel}W9= zLAbn+@(;S&>H&8=*0QaHWyhbdqSpzsVzKVY{so%968iY3$y0&iR;09_(4Lex<;#xlALw zVZ2uoqSS)5*ygca?VtUmoAkMfH>-q66GsgV88>hO?~)SS@f#jzo{Rsp6TB(KZpZh< zc3y>$rJ?;ZiIn7r))2w@ui%gHC`kO|?q&7r3I@f08>>r(CCLwdpJ)>W$IYLjF?OaA zF~dZP39i5YMlmuS|3{(xKLzvu>vPb5V86@PjNioP{sEfpU4eKt|T0 zpC8`eSWkWGqbjN5hDN@G<^2Oxnsu;V^*@8I#fXE3<;0%$S#HSjm&?W|hjvuL=d58v z5Wk(SAEegbE2=QFe8ZsqlUzN?QBTVK%TDRf!Y8f9eL-PwWQu-+poOArtfJyXme;fV zFaSWiSR{hWh~aW4e))Z$(LMO#<>cgoyfDZi7am5eL%VncJ-pEmDDHHGOM6R^uXd0f z1s=YNg;%Hi1Js!tf%dXW8Lw^qJR0s=c0obCxMki=GVQPV{>C5gn#zsn2-m4XOvDq) zh<6eg;Lu8hz9YDBoh<$X^z!`iR=?|qb<4NfxT~w{FL%qxabFj@(~APbJSrsy?tJ4q zo=pyMZSL?^8&iES_ijdu9N1a?ya@S;Nhq-Fo4O#}6V7)_(^wgtqhLYFf$U-Rgrst( zXSeAr+}U<=v(lCJ%8t}!eU=r8wXsx68`s|1JMguw(dq6RPtb_4yOJf=M2Z(=}3JYYN z4B8S}h><3CDT+4HJQ1?*ERJGdETKm}@cDl8I5Wt#6tqpmhSJ)*O)-5rY9 z-?BEnE>lKdiDh%SuZwm)>^;_&q}?>DJS+K}!O&`#8M@hvj8pjgbLW3FY$x~JQbh(7 zKP5y^3Hb=n(pFdQpSA3}w#7v8n|ETI8h(Ar$G8fxw>1-g&pl4`Ty&z_2*Yb;hXKd& z<65>W4D@7v(KsF*soe3#?MEDbGYyMSFwgSU0!d8}Wu3E&ngGu2Q=*tqm*ts#ccKny zWJwWpdfb{Zu(n(1K{MGkC0?%c?+pci{Q_vj@I8R@Ee5?>D=$n4OJoRHl76Qc{!WG< zKh0UDWAD~~Z{(jsEXyvVm|v>y^m~JjkQ{cDY-I;)6DYBCV`yy~$n>2*ZPzCVeV>@u z=FU#sQ1aE8id*42r`FZjOO`V#oJWsiQw9l~9% zFo~prW&fjvIHR*+H35~rxoqYw0Tyx%!EV`*ypabxZ3ZZaj_RS@(ACPG6qUkN!Qam*eGSdS(>8r5p)Nx-q-?crmV@>hYzxysgKsoL*c7Kv9tHbZmn@&TU^JaG{y&;gtQ*V=5?WHd-t zbnnxs=7Cx-OKKvgnE@p~)@@ctzlg+o%lSSp=Oh)3)-y}#&LQ9h6z6d5VTR?w&n;I7 zI~4gl2d@F;zPa{ON~2*D-WxV|-}v%M3} z?dJ`i#`aBh&=Tp%^diVta2V%|IvsIp7T6IGR?qW_e5KS>({F`pYbK$38^VBJwd})i zfuTI5HW3Y!)Nu%^OxCOXWZncywxG2x&+qYa^>YTuHtGAD4oDfST=~JHcmaafs(x*R zd6q7y+*^i$Dn8x93q=jrDoKkYBc&BYpkR6_HKEY>J?*pJQqwjK!!(F%MMoi+!3{W# 
zb<-MAgq4b%32eX#z2;LA()#?PU9<3zum|b@|07+!5{fN@#0JW&K9HIgdG*Lia$*a(;$?fC?jbSu>riPv7}m1U4CsE-gm%ELlb?c4YJ&uyIeCQepRxChi10dW z{r>^D4!u2^P_rqc{+PH3@iMb$6Z{CnD`q??em1Sw=M=LKKCC{=9c5v&Fj;W^op*!q8?@4R9*~- zB3|Cdd;*DlrM94D;e_k(KDD3d{P05B06|#piUicpv!lkC(k9PChaKF9e79e?pwdQ{ zlJkKi(9{;4KL8Rk`fn612QDIRpxbr4v=8M{>hwimkq)7}Jxh>AcU3VEL*S?b{|r_t z4J>8w`5MP|Vj7M36bwU_-GA+tSaeTe$o|!pC$HcR zF?H@}OD_707Wn0ZCpR|P-~{=Xc<#`BHrOn)UdX-ZL1KHdZtHpA@*wuNrM#;-7mh+G zGWrGrmaF~ZZ7!C#^0rb^5?4V#9bWP`m(5pS=#k<3GJR~s@QFWDvS%M#jg)Wr59j*K zEub0XYph?_bE4i91B>kQPe=nBEGh1?IVUi_<7k9P!L0KOpYgoSC80(cqKOsy<(Ep~ zsMJmd_v(vLk8&rHP)M)Hmt$Y^%n1SF7*h~u(Qgl8>P?TTfJ?Cf%=ZIOI?T79f&4a9 z4;}M2w6|5XBuNW@JDY<2Z@^9ZRYpTImaCG=1aCc{xjPR}VA|}>#i74G)c7uN;f09y zxRc`g^Gyt1ckJ9Z>0;Gu_*2;RCr8x&ZdPYj!y-|FiI_Sj1k?Wy)cMYR?)Je7B1{|t zQ8#F5Kw#ySOb6pdc=Ox^h5CE^c^HL^&lOVo!tM923MQW&8RsrnrWV^kW^H{d z+S4E^$;^2o zg#KZuuw46EJ6MyXy*~xd{ajUFzGAk@F>RY=jLH#Tbg#7UBhp}r-Qw$wN-0$zIciO@X$Ew zSsfVxlfrhMKi5dcSj#fUV2iS7X#8N!#faiQPYlv-ZIat7 zOJmsisrQ2B!l_kp^LJu1)5y)6w+4giCcLjn-*wCxkU{D!C?wAz91pYo;1bmA6z-m8 zDsU@V&EQuyOZ7cVa^GT}n}hBp1(4-5?Zbe7pQ{DPxW4rK(|t&ci67dN{^qyRJEGE3 z4c6~&;C3b0t->wwmYk%}7CYmHjUDkL(;*Yfb>97VSVaK%zx06r4>E9hw)2~KT~KpS zs=by}+hDS*v|%Jc0adJT*5cHr@=UG>77W0*-^l^HF-8SJIE>O~;wu$>iekie>2hz! 
zwKq2BH0KQC2RE1bsUpR$TyltDQoEC&17i(;3n)^?X6~|EZznGlV|zosditDDO2u$I zi35#Y#C|U*(jaiUlbGt}*(nGE0yZX}+d?YXVf2^ShUfeF*|B2a?~&(hurnEp6Q0Zt z#nbnpBDsgtr;xI2ekD%Z-l>#V`41H%iA=q~=vevZwY0Y)#2~2scJ2^*#%Umy4ZP$W z$;O%Q3Kfe#kVEZT(HO$+RmPOD+@GyQe}YS;ZAE*zPEmGfO!2hHb4X)a%j7|{i&_5x zX2T;q!PrCkc0?IZtu1hbGCV&*95MX+l{unDTuFrMYI2gReL5^I14`DPpx60xOK706 z18v4|)r^?-M=nSjv;KP?U0e~tj68EeWD!;6$XBeXR1xEcoQo@gkD1_pEm*pp2eWRa zpYCVL+Q(xX#Zo&`7^pg@grm8&xg~ZlIRTXo8*rn#=t*$jO8zzSHSwiv^SKIZYOBJE zg7QSIae*Pi#((6DAWWB4R5i&}+V{DOI9Mv^yW4}dG>%4*fNCmLaFJ7X48K#Vo?JbP z&lmKkb`qMV|{V}9}7gZ?;faueI z{c#)}jSqb~hvmX`a1EtshAP5p;?=brqRV$*>k~T?wXU+)L&EHfW^=uh9MH!i@%?MA zp3aBXA24ykZ*q*NkF`l2qTd{+JRPELES-vS$!bYbWZe^`eqs2!SCIMn(Bo0RLQ)om z;#@_F(?0zn3uca!5ssp1PGXu?A}>o!+o39On^gYOz}=djCo{uao(YA-Xn<=K9&u=& z;N$mXq@RSw@cm#WbKnU`U!x_5$KcU~JjX5T4#Bg0ucV!30_9U-+y{|9t{+z@f!eut znWr=!>(IxrtONey-&Qu1vzCZ6Q=_syHwP6q62;4>xP%-#ae~&+O4~My>3Iu13+jlT z(HjEq0bw0yF4Wfzy(D^V2HT_fAs#b}Ro(-h5tdsav-;_1Shbu3RFaA- zzuL*{#||QiM&vznB}c;Kb-q?bI`e7f^Ckf{TuBzr^(BA>t(enpWiI3o0mn<4>M`3X zh4`CS0n9DgKX#e&U>Zq-93vq$?w(p?Wusfhm@^tDjUZfp9u!8etE3-u75jcRD-GVI0Ag!S^iJlr zq|#@_FZ52poos&>@-En6xhx{i@{A6q%i@prYVC943s#L8Fp%`mNu;1K(2cu83}nFE zMlM_0?c9a<$9#U$8lE=Eb{E2-C~K3AhrQ0)U8-(KFwHAjrAS#VGB<7x3mWmySx|}W zQsv-73=i|o8Z%VsbDk(m^PI5trL($z8-3b|%;zK*1^u8_-!q;Ys926nSOyx`iC?1JC4L z0*3E+&bQt4h)3CU>R(HjDm#k?_g+L|M}e}Ia~(*xF*!BY$~S!Nnwui%1K=R*c^Y)) ztRX=RLEq~`zo^Gr7}o?k)bpw_m0u5ORd3w>nv8pk@%z&Lw7Rpw=opOtOOIqn7dHZ} zdYC89I7hQD@ci#V5%{xn;^Bcmmo)jBopMr>Yz5U+%icQXL;BVQ*-}8-nHqZSH)m&F ztZiS%DTAmjf9FdRR7=PUiEvtzYzE#kYn9l|I7|>OCv1CZftqP>=}=|uMdpn2;ZYVo zDYDpku-5-G)KBkn4|0dyk3uIzhVG(KMgt41`{fsPS3M6(2VePYv{SyN9mgjRnvuoN zH|Jz`rIiN>Rj7ZwR36ASmTrU5?fixkMsQ;QO?cV0@N@eJZnsqhtj?zmGKd3NCESWL zDG^A`pMrXg-an)l47tY?L?vU+(>WWicWv7DoSuo&_v7M!*H-<#UbImtyr?r1+_ZovP2 z$i$)GP5MW}i;a;yXf1J3bJnL~desn)<<7+irhsyTqxZ`ve3ZbZrTQd!z|!WM<|g#; zPEYe})w3kFZuonSOKCrx(aB{Bv>0wxDhQTX2B}(TPRA)V*%*Ei3N>feEjP;z>fu@$ zhA-ky2`%^(=o+|1p$j!0NZPo}Fnj_A9}W0OFiA~{k=`mtD`^jUNst3y%Epv)IjE&n 
zrQOzHdwH?{+RKamvX3fzqn8eVA49A5QX4`x?SVhh(Soi?v-s@nB8N<*iSBA`s378$DUS8dF zr;-EK|Kg6r|80HQghryc8g_%XFJG2M zy_4Kk(Wjn@vH-R7q5Yp@jK}#0wJ-#EN*Dfo2?_mnMOV(TjdnaMtBJ!jZq0MVl2NsdRp-hc)GPx&H?AA%^f5?U0+_Et z)ZE*A_+*(aABWY0bzP;4R+7f>qjvZQ741=ZY6kR}5^~r2%>v7t8RmEr-4JMq&yutl z;l^;A?;9ehzCsMy8H> zlHLTWBNcu<8~+{r2$?GLz-*u4Y270dy58Wq`P`%bgFaf-o2$P0WTnD;^~*}H5S4Pz z=u(yl)vn*xArm77W7QP)8qR+X8~zWE;r;`;TpsK9Caz6&4uX`&dcBGBQJsTRI(d%W z%nhrr1C_5t{hd2E{!3;<^@z&w>PB!BdBHai$#@tNTbBKAkxh8$8}p4AY3QMg$v}}E zdR1PgUzurY^H~0ouK-6t}PC4jlYHqm*Vj5b|Qq`>cUy203-+ zrQU72)t5w(jjvs5seEk*7OGBU8PFMEl43yoR#&_rL*JU#&{llJjeh1U=Q97zGg6ZV zm%%#B%MH*4;(QvRCWX4K&UTV&_4Zz!?82@=Tc>U4KVqSCd*P=IYdIo+{h1*S{_?*c zG6$C{liXjCq~>`tiGL}Wp#=faUC@PSk(j^i`e~Q)oqhBvJzA%qxm6VLk8bj=ug#^_ zx5a1DHeuKGF#%2~`<`-ViiJ|4zG%ATpJ+F6*>^I}!|L-{O}||F1A9jm8uf6v;l3n6 zKP}~f39dWO%Z(p>Cpn}6+S=*wCkRrm^5~V^0B9{o&(GJ5I>k8#}En^jFFb z0+#f+h_Ck2&ZT#jhK(Hl>9Ut${Et!KF9({+16BTk3ctfCL1;t&P`_UNw~}xxVK0Hf zAid4MHP0HgCV#_^n$L)Na*I~&NK>wj?I$hTn%!tAmqfbImQXQVxxK~O4X6qNz<8Mj zvP@Lb=2>={$@;wJeRxBkGifO@qk?nzFZ7`8gYV?RV}CA7t4HaWMyyse@iV}8=`{!_ z#@gaOPCvO%7#z5PSkleHOaIpVzjfV@PtC&_bfkN39`#SX;}&(Spf`Zj@PrAdOm z3Qkdzf;wfBv-d3LC7!4TpvQQ$n4c!}=6p+L8ziy$eN|woN(FoLBJFt~RQ)A_R($40IL(*B@MWkko_AL_KB18toN1rVyrop9bgWn3W1G{&~TraiZw#CPn?VC;9t zrVCog7yeSNHDN36pM2br@Y4bI@=hvI1y=y|xsuYaX}dt5Bs?p_a{lcKA5JF>a>Sbf zYD#Og#rr{997hKy^S4{hZ#nEcW2OnPs9zX}k%l8Ud`Mi~AraN_xo+{U*yu~HaQ18I zR&@oiahDTOMFUHKxFm!cF}oMrH!q+>*H$aYu`@xMa<)%INd(c``R9wDUm0>;t+}(E zGNKfn=^IQW$jEY)8gW>TSQ+_+s9J1}<9{Rb$Cew}$K+viVyGZZr( zf9fQT24++gu}$oN8^i~!bzk=@&>lw4r&_}t@qen`B}kitF5F*r&siA+`m3cTV3~3u zu<@fPg`H(fbCWJ&3;cwXL}DVW&wgO!$%j(*agCx`K|YTjq9zuR`+h1a=e$ovF`Y0U z?~&M~KX{N|Si3$yxSOWa_7W0)bAjnIIFWCi6?^({K=DweG|Mx>x@6Y;@z3qCRtr~5 zYi@%F`%LYrecw_9O%5S<&E%CfP*qRC8~Bh&fSx73Hdyz-#W`V9@wM$e}K zA$joR>K4XpP16v}O~bEGij-4|kV-s9Gjw`}m%bJJ1AJiFczR6#2l&bZb9SMK z%^y=;qP<=S-n*cR(LTyYXuq%yfsK=4b|WU+3B{^#=`)VihobE^Bxvl$e9#9YyHVc| z`Z~$oH4gv%U71Bu|RK21x?P{1qfL zSRqw4!mf+JSvf1^DllA1gg4~au&16Ni>Q$tb)zJ8TB+ONcIQAs$hm3(I!MN*VY@_e 
z&F4dWnAD!vTqy~J>eTP6VLP?jyMLIx*#yG##E1(h~M1& zMfa0`C`kK2BcD?V%O+_McrQcYZNf1(n-{hJg}fU-ArJ;jxcf~QHNX%jv-`!+EjzO& z23|Sc;ls|l#pabA-Efjrr-G|o@YGfDcC&uwAH4Hji`eFsQ$=_ZcZj_6d%9YT?N{h$l~!tjJ=M$;&c@@ z5sYyBVgo$~k6_K=_Gh~E&%dD4R!xKps=_V5TIk5{w-Z(-yvV0z*>MsZ(w%9c6?t?G z*1|jPz-eNuuT9KyMH4N7cgnvnlycUWH>C>Ihj(mN;)^HjUla7j@wrNOp+gK}dV^hQ z_S&nio)o2DkVz_2^`6PqW1Bb*G{bv`S|aytIO@c?%gcWmCsYq&&vod7uAa2k>Tn0) zFr?KVG>;rlhh?2+ys9zlYxrMcoe3v(*@}3asWNe{Q89dWDgsC@IqBlYRM7JMUnkwb zoxYZ-Y`=bid8S6+)8RhAq{ax@WXfHr_w6u(%Y;wHb&z`EjqnH9kcQt^Z1~=EHVRpe z<+|^l?*EVA<^SuwLI3_AHg&l%PS7y@&g=3r>}b>Gp4As&zrFxk<|P2!IA0L!h5eF) z=|HO|(6uuxOLtgO?BrLKEIh{gE(G*2^!AnMfA4L>-Z(GJp$#qj>U@hl#g#WdtTOfI z(R5b^?3202Ur@R!GP}f$m0y@<+TkyKx1EVz48WhZputVHc{NtoA8VT)n`1Tb3?mT^ zM9TJMJhHXBLA!4PvqrqFgz>h{`;`|T_Wl9V%2s?gO(3(Bl$R4lk(~r9{vNGi5Ag$n zp>&D7zY90N%*mKLe?-Ej#)8e|%%iRO7*#X(!fv#VUpBr4M)$H8j4@vt%tll`f1D{; zxL5?YG%qyxWlQw?ehd@En7)|0Xjd)+c-a2Va{335I@WOEjqApiJ8s9_stR8o>WhE> z0O><_t1zzDhKlBu1%H}p)~S9*rDD#Cfiq(8DR@Zkzm~3eQGz0X*pkW>tPX_?M^ME$ z^_MS?C0xoiidD~07h=3^(Mn+Rs%02%&(ucGAlu3%Bwe zxv-XekYiQ2eZy1TAqUBtu&}i@ev3KXqk0o}rv8#chl7t~S-US!>nI0xFnox>06@oDmJHM+U5~lz!Q#VH+j^8VkP%gX^wr9anruHoV#HXdG?~ur^ zl83|v5p8J1m2e{1R?3WvdE(bRh}~8&CdbMyL1B#=suU>BKuz#dPaWJLo|HD2uTL*= z@lA?k{?$Fzp@MTp_x`Pp_PS&r?N}qbr8Mn|5D{&2(52(v?^#8_$pD}qBADxIr%2`w zL(@M-@+dA7f3USqG}nP4pa=L1mt}DaPit5{lf@wdDQTevIp0Z4nSi{j% zribu-q?#Fh^bktec`12QMV%%Ml1kHJJ9U-aO*WPd9eFH2cJSdZ_l<9Zx7LRemQ}5y zR;@a!dH%IvL|(4R%(3N?O|m~cnhoO>_L)z~UcrkoOCeWLvRm<@miuqoVF5e?er97i z!hi@(k`^jny7@F*vYxMKd_cu2x*t9R^1c}sqF-c0J8hv2_FZ)C|6=VegW~GLZBG(h z0>PaoNN^8sAp|ElH16KGy9NRT8g~osH12MVLvVL@cX+4YId|rqxl(tgYCd#zb@l$% zUHkv+^{n+OYm}+QmE)X{jMb&dZ6EFp2EgD9xAEoX&)*-3gj|Lo zK6NvFC?%ASYH+YAd$(lgqqFj9tlDEM$8yA@jUB5;>b0E7L6x$H#}Xzq$EudOlbu|` zI;(y!#@e|D#_i|oHaikYT(n9nXrFo%oJeW=u&T4xrahD9t^@e-bCjB|c*NN_!a+&W zS#oKAPxwcuq3V1EPXd{$3EQHgP%Mjdta|dHjdI`fir94(tta> zS0S?5+?SP*P@&EKovENw^}##QL_d3v&OY3SPR*nbQu-~;TCQ5u>GHotRCK=Mg(~s$ z_nHW!jQK^&^5(JK2Ldwv96@<1Q;~NqKXRogLdw4oEQJg5oEe4fA@7-yM$-hIdE)Pw 
z#`h#`#lT2)L^iKB$B{rni1(ehdv5c!3hlGPUk5=8@aRR|5n8meOIQkH8~ohp%EQv z1`^TXGmQRNxRn9sl({W(76@M$o8|nCS20PZjw&yoL;GCgq`(Qv$?93U`r_HW4o(br zwP#Pnr!=XJ;hI5POXoWp*cB|YbYl6d{SkR|g)G~;zr#HZFA86h)a`5ci(-MT!8_*i zZ>Qps#ry+IJiCnG}(N4zPkssI;sp4COsj5RvCP zeB&m6ypN~E@G3!moSzg$uw4@7xuM9igL^q1p2&WtmYTdsb)x_LCyu*Z5eN2%e}DhhJhy82koP9>ynZiQ0TO?Ry_I4WUPf2% z-1U#$C?W%2c!vXExMJWRIpb4ZX-w1=yXPhX5e=Gi_c7sp0#rtKDp*ysTB9Q;Y1gqoKkRZslX z!9Ql>ug%OGz-ntUW;>hdm5fmp`-$aOK|WBp`vA5JHVdb~8+`m)dJhV`W*?Fc^rrx7G;=M`mns|B~L8Gsf zPF@*`=uQg!&6+vdkFP#sA1tvwXFajQ*_zS-OAneE&0s6J797yaob{W5DDF4e!my9b z1zGgP@W(mYtBPSBb7RmP0`@q3DBo%D{1xQ4=aM6|-HR@`gVLIWq%hSf${n5$1$m5; zuIlzl+2fw?*eGL3`bmXP(hVFuGd(_IsCFZC{Uema*OqKCj{FE5Wf7tA9Ac(oOv!B4(|Sk!aW)op^ubboVVkc4-Cu~jIDEwFOu&d+i! zPbYGRi2ZF-XjEaTKOdon@oqkNI%kQP4o+{T|JZQ>Nj+x?RELw0c_|>CU z!Kz6uuR!%>c3I%nqHoDB1Uv4u_A_TOKZ`!%78cQwK4n4~;4_?4HN*C$@Nq^WLGB@?!I9%7b8eq7c1;eJ95!P(6;%5!9)}hEEMRfHfx+5_Q&9)*V@%e^N zG>%2{oYoc-%&>h9hx$c&P1O_=OMX^n2Qxs16nB2d#!>&hQ__BIZeY%bsvy;C0t`Ol z!ygV4Y@>ei*bzBM`!xC4Y=(XpW!^hWBwtnI-e)zz$`%w8!(|VmPio@6VZazelUzFN zX2UgEn4U;wzJG{Kf%-t$Hr=QN$XLWr2RH2kX==~?-Kp9#UB74Xah2WXvNh)gR3?Kx zpcZ_y*QOztUDNuG-4IH5tRrCFPTNY1q=_9Q3`*)iFIUw0kX|=qec}~b+tlR@{#`2)k@zn_8CDjE|2a6R!DZEM`-h;^1oqLdCk@*7RghmH>Q;Hy z`1`WDlNMEr>pE+FG$Db5<6MJ~3=sw&foz4W>;5c?M(O4$Su#}%J(w8_z#>; z<>({t=6tm63w80*vh_7jNzMwH)qIFpTwfJpPC|C7V+^}5)5){oi1)EebAv0aCajDB zf$WuO-ykd-L9*E16@W5?Uzfb^x|+X!dCr9y$q`g>!v_($XbgZ=m|>xRpA#?}tJBEl zP)ib{Lb5yJqrTp1({xr^Bt?%|j^E#f&rmi*_bO|P zzWg9mkC4VQ%xq#@SE#@FMfDlEa_Ovxu=m@cXRvyJ# z+x*mzsHkXh?Z2&dDjLn#y5g$niXX(`LrYG{xtWG+Jp~utWtX#4)3RTFo{JuD8N6c5 z*Z9Cx9`$1XE~bH7(MQxZ|M}6*Ct`@{gZNvS(m!wkz#Al|z=xq_=f8b(Aea#-Z z7E6HVSQWI4uba1!5N|+jHkOZ^lMkr0_QSF{(M#EQn#ca9TpP}dbw$}5gK(9Q%Z`e` zf8bbPYPXQ*&l*oY{sAY#=<@Hu8sWwxbt&e^yb*ns97**t!vZsl(%|ozBI1@+P*td%kP9v;n;7w zs#VZMlce!>VWSbH*D+PE$E|eaM#y8VoWRxhH}r`Bp8H4+OG4(GM(e^}yZTPfZ)W2{pj55ed!*4xKS-zvzxVFE)G9Hv^p%}_NdnIl_I4K!i){qFjzyefdS+8 
z3i9MIS4j!Shj10{7;{Cm3%Q4zH=;9DC0u!wQ_a?~%BjeWfYQT)d+@4+PHi^4Uo{j+kWEjxz@Q6RHQ)c?(q(=3_z4JU84_oN;$9Fky6ygOVJ)c z`xY6i9;YHgxoR>y z^7uIC<&q=r^P zz*poTDFYZt2AS?8*N-?HSjOaTDua3OlK;94X!}}>Gs(b2xq2Qk>G4)BBB-!jj!;a~u6kiv@3=oO^l+%u%^jlW&IzCn+E_TwI zzzubp&zf-!s}y^hyBXOn_m(LBLPw7Iehcqm#66#J6g4X$g8Ll=^`TH)UkiIO&!0o< zix&~y2=&nO-*_>oHC(yY$=Xu{mbI#?wzlrNE^R@=51}bC8cTF8fFVDmRD{fL{A!I^ z8xmzO6xC%`r(p1gu(&NeAOK8G-%pWm0GvA2p3a{Mk0$?eTr7~lpkZz4_`T)_GBR8Qy5gQfvH!UqF%@Vu~oU6J*V|jYDs9%71#Qn5A^jfC3*Q zLtXDCeU)3z$qnWoElcJXavQS@q9W!azQr5N4Foz2Nb#3wJAPD=bjRe-Y`zgV?51_{ zXyn|zJdJ*9w_LP#*S0C0Bp+5}^hB!3NzAYyDid@!ZaSu&InU#%msE^=F)Gm;xkER$ z-7p&o5%ri)LlQpDx#SG#Ft`V0K-|$KzM=NpnQ1ts--nD5kI~}K!Ju>W=>I2~njQg` zcvl2jIeT+1FAl&uRq(~=j`e3Z4StoF`z=?Trs?{`V*b9pZR(72LS}TQ4Vj0IB=b;rwVL1NoQ0vL(o@WItBC9$--Ar)lWlaWe-h7j}99Zo^M$xmKja@1X_QFnUB89GFC zlwITA_RHmVZ*XG~D)8%alujKF6$tN@vEL7?``u}Y9iKTX5h0vi2PO?T z-W1R7WGzI~$~neq!b9z>r*yb5-F?vsd4n{bg?3_qBF66~s*5+I&5XmyY{{WSPqW_Y z;fa|kNIQ%%Fc!tJO2vM1>zFtZd{jk7_C=}VB;M><@34j-vK_lMA?I>uQa9W^%PS)f zm+nxL%$>*JppC`n#J)J4@+e9tOkT!QI*N|EQ`eI8_+~GLd7K6osFZ=&A?k)ZvGC~b{($Et&Ru-| zq5{YdQ_VWOTy}}IEhZ)N=`by8ubEU~iHyYaSDz8s6EVr1;!n||daMUTkF|p6)Blvv z3{X&Dd=a-rgVudpAvC{V*U103RCWqGjeMv@Ekp<&;y)>pS(jC1jNP~M;rC0_>v+DY zuf0fqUE`ds{Rd9?AGkjdVSh%9pq)4m4A9LBx?O@W;f|kpia&`TGtN4ADE>U;n^; zP3oWv%YI?qBzdbB>7<^f@2B1<9h$8`>~A(BcG)jdUaK!b!C?2kG_d`hUpnQouDWA6 zcju&19luE$f|Jg$OwAvqv($Qc7B}P;#4{AzwSo)_cj-O3)znD6u%bMX+|qZohnq*a z<3Bds90`NADIXaHe50KTOJrQxXq#KWT%3>Am`bKWz4Ktk)-PjUhQ3H&GAuQb?USPx zSXCy@krMreL)xDhMlD%Fl{aVU�z}kBy(lpECAp?2$3?2Ce?C@=QTLjJV^{=f7xK z=)a2^#182G)Wm-WF@ccc&LCmd?GY0dg6p34!?ODQKPllf54~8IM^b;Aptxv@1)3s8 zY~^tf+97WOalb@X)8OEzCn7j-q~(;YFY0L3?~bi6=0%#NQ#y#$Pk2l~mymxNISR#& zx&1k9oS*@(&0f+)`hBuqW6IKKdhJv2(=-vKr?@TuMwC1RZ| z3yOf5VKN8tL2qgD7S`4>qb*4B8Fg}!vY8(djy5q+@SAAp1Xay!Gb1hHhx(S~GLE_) zI>C0z7}`Uha5!iK%&A2o+E0Lu@bN?+Z_gKC`1iOz3d9I1tV?091&&CCBTD82~h!FUUZDu2bvOc$H zp8TQ09Z`5h5v70?wEuh~JkTdy78+)=yN;#F_<4)Qp&tt}vo%BO29fkdu;WX@;e#wI 
zb=_PIJ~pe?DcZ9R(i`O#-loh*DDgz99^|&Umz)o1TjR|M0dk5|Npqe^a7et4tFIKp zw-q0x*@z#Omc$!>?1#o=D~?*QZ}TZn7%8HwZb$SYGxUcKRdIE43m!$o^XNL;&UyK z8Yxm!_Hs!Ep@;E}2Zi+=wR6N4i8hC|$KoX`O#GDC+pPkyl*YhG@oDriuo{iw?7fFc z)c4_GEtCOaP}dtHg3bu#FKQFuWO6lkA9jbr1}CRC@R!H6_^j)yz2fv2Z!Nn@8Wj$b z9X9GDsMxvYOy9Q*a#`?@tKbD5okpAG^pByFxaKXr<5?lZi}-#DfhLOYlEWi>ppt}; z7i6y|w#|u?>!L$Bf|6~j#(Ak&Yt$>%qP4+CSbmY|aDm0PykfhdF8ou(r=QYY>dGgG zekOfFuJ~R=STE==(W`tWz5HtRKC@T1$~rF-!WVQsqUattv`NG79R1XgTVu=?qQ(CD zqurRx^H^<;GyIGLU9BiER#N4aX#eQsb-t7TjFUt{F&?QRS5@?FSW(+0%y!QSxC?y@S80CWPmo6{GlnP zxmNgD62^+@JrnRwH!<@@&?INyj~*r-ZoZv5<4<{Hz?Lykz{xznft-}Kqg*vEP6T}g zMo$EzrR`pG{7|Df&97W=VM3bh2~KM@mdz6^VBleVt{v;a@lBd^!I10;XtkLQCqnS7 zN$Q0k2J@rkmw-}EDU|`qk@V!hrrMa@Ebc68XL>iD2FE7bc#9r9-Io4bqx8x;X~Nd? zcDe+XFoH%eb@6TPm5hlbqofQ``6h4j=jD21GM@q@2@h4L>~^}61Beq(Sbvsfw<)|x zDvdJv%yga{gJz<~hrsxV)NfGS<~QtCgjw`uZ>`fD(oC)K6ZipoWer4c{hfyULUo6J zw~d&wjBFz?V?kR%;d`m2s8-6GyUzvBfcTHUodN`qY)HZ@wqrF37b)Qw=%NQ2{!*hV zC;*$D8BBwuRAE#x&Pq;ElElpw!bTKi$`njWF~sFT&iGAgrv*2qrxnMUwLB@+$q6?fB)l_hB~_U%*NSESh4ns8EcIf>{&-9D%r#tBv% zA34@qZuo*2T37Mn1y_;#ap0qF($_asDM&yA0I60MnAU3aP>_|W@0M+p<57m&I4hFMuhpS;v_ zR2Y^2bsl~$DDx}`nUhW_W21~Xb!qpJEkxf#>oeq)e+301MDUVq^=;&6X-nB+*c|%M zO=$b~wvJgSHMh3Kblco>*0`>WG`*>xlCTCE<@6X;|7hM!Z}kP)TapW^%-4rQ=O7d^ z?ScS~WK$0tvP@aW`>YcIm>}6Zt-9=X00X9fgh3Om$|7%?DBr#{F;GXtl)N(^D>$s2>%hd1FOZP>6SlzXzbXK)_{EIIr2B;R>X z64bGm5T6@+jq{K??*acNl?ngWL+}sW6A`4k(+cwV(dZwzstnO5g!;>g-Yd~f{z**# zJ!;$a*J=aXx5<{z)s-H2{dr3=p0xER?l?5!&S9}M7+tP{PF&#Giwx-*=Qir3--@Hp z@}aDr3XB{pFSi9H6rZCanm+wfl0>|a7<#g#J0V{4<(d$ zogvl3%tdidLIXw_-VwKHSAYLqpZDV&scGse5MY4!(ANB2a zA;mpzD9T#L^1(Y%Rm|-V637FDAW;H8~wrOGZz3sEvI% zRc9`PJu5|Q6gXxf@E2TTUUx9;4FGr~y?E9P-Jgy)nLF=oG2KI_{}l9%UvnEBx~m_*H7lFVA^@zuQoh(< z(yy>?YxH&%K@za5iu?KeVO;QSx3lM0eQSg0<>?pG+s{1P*ZB8GE6aLpOGe!)UbiJU zYdwZ*E;5wV0L~x=sVKUFgEv~((a4o!VSRokwO^dXuC*IIHeq+uu>Ac4%_WM1BATED zxc&HKOs7j9Z)VwIk$|&Y$AwbT$uFoax-T2`+$h333L`xw*C34n1&2VG$0Ya(SRc9- z$wM_Pmzd22-Vho=M^+YbX{F${!+ttve0YA~K76trs|=wS#8xjF{R-WBaFCw|R0ek1g=qZ}Y?#WiJ} 
zvwA_0{YOn3Bp~H}%cUNkvy|hEy~SGwvAc+^OzqWxG8!W%AC7h`ou_JbRX{Apk6TEiR zOs*(MH7Fy(gTDaaq27P&90UzGD)p~B^DVI7*GzU>zGkp*^Fmvll_trAIr0lDvd61; z7kOre9u@rRUs;mZk0xRF#25SY;x<}M?FUy9I*{)+s$U9i^x%c3pZzP^1jhhG!;Z>|?KRok~^ zB*`$8rqVm?O;+^lg`)&?eHw`NtKXaFsmi|s=&-F71Dh~@7P zy3s87qSKoqk-HBqM;`2so2b6+NnXmI3#P0QBf{KAYT72XwpkYc3@U5V`iMm8PQK!( z*dVh12P@`$ELWb#s4{=I${_uNxw#0;g} z0akt_JBGO%J~2jWik8D!ZWm5$CpJOm2+SLR#(ib~ida2C@oJTCo_|6!JvizvSvhvE z2@bv}tv-n}B>pWE?Ao;Oj@hHviNEY>z;=2}g%?xOI=vxSnL#%MwTb%d z=6x}vq|e}6vzQSST%4Dp*<6Q1*dzjz|mBbRr!R!ukJ2MB$K z+bMIS!2s^&&p#!uZqBft?WY~4duHyK$Gcaa>~mi)g1xHh_3+7+Zx?yd+guUDGETBa zCSRv?LYsbmRVp?{ouEl;rvX!eoGA$H;I}%t{BZqdNU}p%s3zNtA^sNjvEX)Qb+a4|882=xQz+6RNXv^_3QHe`oLFV%i5>5iQ!ect-u@Yd^QKG ziuTI3o@#6kvE`AY*jn5t=Q4ovp?v-0NBi)+MUV<3WY>BN7RDBU`P`}^xS6GCTTP}q zUSg9pdvyBnm0PNJ=5s=!Q~hGY**~zDxQ0Q77DO8L>&Cu4eA+0JSP47|SW_j%W=8c+ z?V7RoojpZK3}3a^3IjDZrwQoZ$e?dgvhc%Hb2N!IdnkzUxe2 z{vESDS$gPL0r;!#N5)Z8L+R%3LtWK^QLiQ&lRu};M^;o#!`F(HgbFj}4Jpj2mYU94 zf1ZJQ_MZ4KEZexCdnM|$qs5fbrDJK`WU~ahW%>BO*hRQMFaz0S*Mk)F0J80CHKFPx z^YZHfNOHbg4SfD-5;*SP(+U6-+mjrVf;pVC2Z59mcBGiS%nohtkQs+$wN1Z~D^c?P z<)!h}^*zr9h~3=M&+4iH1*yq7TNi!mkg%%+V6<}b>P3r|Rm5c;$1!rD8t?L-}h5x7Q&JR$=}=xCl}j@Qr%8uymB={PS-Y;GMUo zG?7pF5nl{e*LJ7uLpHP%tqr(A_$*lfm#yf|K)!?QVlnM-^z#nE^-yaK{o(gk^t+XC z7oE(zSBfd83AKvLAW2G>5@DBb4cE6b;^Pox;y{#R03` zrxKlc+^J zyRyp!{8^GBo2{2=easuHJinZyagGV_%r{XxVcq*m!*W`BvTxVw?Qw5U;>=eWwG_8H)?ot^)<(o# zrA4{Ck)G;{GC8S4gGAVe;6&7>80zxhy$lE!R`XXQAD4c|c+Js7aXICUP*++*Dxd{7 zoHPrX#qu$V!c3xt5;ijQ30eH`kai`HuX@W>-oYxd4$~^t`}Xy|z-PYlGEnIVK%8+u zS^|>DbhN(MFI_iFix3bO%Cy)7{W5y#v{&d$$6T%Ox1Wm@9Zrl7;`|^%UaoC>wY*Q^ z&QEj;$+GK#3Mfc^lt%x^74d|-$zWdH`B{-$=zO(--R(Kz>N@*ghOnoebaW%x;*p|L z0uR`qT@AzPe*NFS)Kn+%>wXB0Vt+JCxp|)k{EMxU`_nTe8MhyVzM{}g9VN#_5rFoH zzSY_1*(O2uBpG=nW%bd9le`dkq9Y6kgpco;xd={360kC8)2QIn}fg{fB!i zckgqs&0%vo<*V@V(UnhL!Q`R$w%Q;J zQeYzAF$vg+PWp;f$q!%Pu=jw7Do{6x^}n2Xr~kW( z;Qv55l^6xdg(Em_JKh#IA0Yj3)&5bsoIBwXNZgDNANv5L;KeEK9X;*Z*wxRKvn$ZF zVYNj8uwG9-0j28F4I_d6hbpqU4kRK{CgMM(-3Z=2oEtEx;xb03>112TAu43p!ha#! 
zKs;*+{wt+D$`kJBWNwP6xr5<^Xp8!-JbhAH+$4S(|6M|Et4?3Da$LdSD@b$Pj*i<) z`9OWSJ#AL0wD?gIli|>MvdKM_qb{nVi~q%1IxJPi&7LMpHkl7EnQJ-fue|yLks&Xp z^(LJ{%bd)9Atf?z@XIuum1kRgmua3Y{lbQp)Z68wP-&5R?jNwq(LUPn2gbYNc-6w=J@EyQuIHlwLBVQemxsE*^CW2)gOSMfMG13WR0^$G_cI zcz|;(oC|b?BfH%t-EQ~cbD-_M)EIZM-y0fPm_4?=qdR=VDoy&e@v|V-hJhwJb;KG= zydROBkU$d?2lE3)daN<6()Ufg#394i$PqQ|It*O<``y0do`)K65#Q;?7`t9qE~KuD zcN;x+J$8i#q+d$&>cSS5P;TMVUb;Uwisl&&X`neQ{Eb?vl3ws>mpRvVOTJ!iSV(6% z)!j&Jd}$^wGg|x9t`iS6&@RwdUKp<=$Vl#}^g_%I{}gYdfVsWSJ>AVIE2aT!KqilR z3EeNQDARdv1!_?`$(+t}ugc*w0jN_PZbg8M0$ky<1_JF*B5>D4SbcL0V9D52Odbm1 z51_U3IQ`-2;P!>;)1lJ8e>9e<4Jib!$NW{zYGFI{xMTcBK$q4ms>&$Sbk>oR!EYuh zyyUN3JOhV8AUV?*%CF+=#Rav|yQVbwhrud;{4joHSJ)~SRF9w}q2ar(y))$GP$>^cot9kZw($yxdLL( ziT&Ki57AuIJ<W5t{Djdz;)3Y^*Rp)TVu_=j=5iHuP#%4b-my1TO2&g!Er7f*Lc( zIPP9R(=0B>EIfz}RgqFS+eAl>=X$POW+bv7wUedb`2{CTLjDR-+vk1%*%PN1fG$0f zoxVKPYP?4j=)<&O`cWtTgwaa;9D#xnA3?#)xv2^Yk-ga0SEuuA&B_#@7}LVP3lB@F zGC7T*Pw63abX5_wNHe{{Xb#)BhvL)B{qk{=z~NxJHnq6SX$8v;nAVKY#BL*rtVhXl z>J0>ajm>ohjwA0_ol(AS2JFX)M4c+v6?&^hbvtA4VAfaEX>cq%{fwlo@(w4K-wdFn z=zUa+lVj?tY4J4T-`UID>qn(Gq$aXq9{|a2#(Dex!IYBtGQ@-u_Z_Uc>VC?PyjGj{ z^W!}sMk#6+1}Y#?yCK{$l#lXq!i!){r(_L*SpAL4;|tEb3C!1QIEZJGqa>l*oLFEc zNfQ9acV@kwNL!aBqa1`v^Oi8vsCD|Yd$=YOW^{z8<7I8tGO|puU*)Op@_xvAD6!3t z!%F>%K!HlUm9QB0Rgue)CK1hRwm%Z35wN-Q%5V?@vi5}bWg{M}W4i$Vx?a&mXpGvF z)t~MF27cb3-oWtr>d%xOwelJ*7k4>Gd8E}0=nP;9`3rR1x(r{Y8#}n#q3OuI3T9k= z-QONFjg<8F5MJ^`BS*HSIqk}#)&vW_QK~x+ch`lN==eL!hXF|a7tIw4v1fS`>Pzm! 
zERxqKAB!?7p3ai)d`ZVWwgd~?qDnL#PNqAy?+-?{&t(-xjXrtza?8Tum6tEnxujBc z5sTCq1>*0n**bhd8<$ffCHz91KC6J#r$QNwwYp6m(h(vzojM=Vi=qwQUvKyeqb@{6 zoh{AOzAx2&mjTdWGzfYO3e3{8K3N4Mk9}-5~k|T`K#_nA;ou#r_@Qv$$<`tu(KsFV`|R^rMRW5UDH{fT0^8 z`$~{Rm1%8V!0R&6f0-hi;x(SAZXT|VP;IW{uPn7-D=!AzH9J~_tpg_(DQPk95eEk} zcf8fEB}mBg0V$C&{@QlhXkQ*vGZ8B-)K>$LoF}v!T|Wi(xKTsewK($Wn3mh6IB7Pd z&w#dy+#`-vV?8Qh&23W0h3A>0Z;ZWlM&7d;V?B)BpV2Zx2RsW=&SLg}u&Qj5xow>z z{n&gBy~tMIW0heomI9n-r~1gt@>XN>oh$yEl046t&1K=H@?C_n92F(^--5?KzA?m{ z_t^;IYEuSJwQcPTUThKtXEj&63Idm6yBSuH-Wh`;Tnxlly3J8Lw4tgJl5zRL^kNiH zZdxOg`N(+Sze3W05Ah}1C$=cp5a*C*MbSj3XJyr0xRAr!%3n}LYL9hINztnd|3qgD zukQd5jws2i9LjPL`W;L&y<>fwb3cP&dZ~5mpY`I|0YXwRvX`>Hk1*3hzS?rvMx=3D z=^OTkmEvIZrfnF2Ca`?I#bcq^l>sIapJaTJhYG147L}AF^~!f&;{Kc-L{1VyJMp2l zOIo6mp%*NC7k1w7pxx?Q`9#V+;)H~Jte=I07m8xYnx*S4|I;x>_;{z9YrkK{MXFhW z^$XQ=ybhE^*3^JgJ|o1a=aoZQzoOOIk(h=IjD3lTHEW7xUSJpw_ZDb1g&z2eZSxYUC5a#FVA1X zTcEFA?VjApZEEU^7Y^_iyVBCit?&^J&$(r(c{YE>ffan|BOt1%!DnfDIAs2EoqEGA zTeU9g-2;&1o=JTycDHE%QrY4XQ(Y9>s7Lqbl(?xqwnY-Q%rL=bqSC2y|LYlm4aos3 z)o74NmvO0_{rd}r>BFz1T*Q3a6ssWE{JF-)Bs*&+{C&9TZUD45m=pSqT4Mww4s;Y9 z5L>RTQPdoOWQGtAT^&Q>iw?NG)6y~+V&04%g`)vZW}}Me@tk5I$Z`rZ4Sd%8OKqC| zYa@nZx)c928_jsx!k04D42zx;Vqf8z3VkjU;FBd7&3*N!o^Mrwc^Q${uT%J^TC^^4$mc(W1oQgVW#Cn+M<_rdDMaWc!V1UTt_mvq7B+IOfmWP>U3bTnk~55R??MQuRdWH=LZ(iM{&iZPBy_Z~0#OvLO% zU5#kuo`r_;F62YMXF@EphR^HYV|{gP_QaWb!NVDcxCD2pbGF{-Fag58Z1iQ1Y>8B~ zQ4VIYKuc37eQz=KbeR--?OYc=goS7)kfW z?X6y0&XR`Xhl`vd@i<|~L?x4~x4Vrl)!Tz(_cq|0qZXl#QB&wbUEJfS3^QE!Q^ZMd zr|M{*)MEaVjiMuYK_AOrWCtggX0BSz&y<(`sOBXR`)T5`Mx4RO;o#lpw(G9q%F`6v ztWgG>)4gwwdI{PpvWntKZ2;P>_N|bHT7m-p-#B*g2rZ{X_|`J8yUnIMPOrTU=41JB zYuL?Vq=E@)D9~<;>&#K+%N2L>xe_C#D36nl=u`_V(1@7RJwZqM41&!@I%=F z1cQ#3-CUXC>+p8k<6n170h03&1M3RN#J-)F3Ay#k|8L*%IE zHzm9|;bPC5&D`pJ1&M8tMYFLfcex9Pg+h zzOeKBMd1i)OsF~wc$Mj|^?nFbML1=WhAm`TNPVhEG(;K9w-sxE?D0}2L)4OFoowo2 zG<#1RJcAK9f2e03g_P`$HepLOEcTe3gWwJ1Nx_y0!W#Ah#*3F591`xG4g3J3B2Pwt zV3im{5nx#PX$@@`;YGQu5>wLHd3|@;01N|#GS0>^5a1ZiRhD7qC&sR 
zeLx7=(*hpKsmU|v59Z+lwTohOC*LF9R5JT;4Ey8Yh^aV9Z{OIGYj&={0jt)Rg3*dy z#~{b7I_F=XWU;to+Xk3*0=%YWK;CYF3N+>jjv7WBHAEEvzrS&g!9TupCTVfzzYC>q z2XfLS1hT9FWeVM|Rly7WLT7?Gbo9`7)#D%Zf_TF=G6&+v?YcgE=y)J&##EX2&#aF_DV{_v;q3y~o$>5Im(zA-SLgGHjPLGG3 z2G{=?%Y+L!bz1LDSJy;_eU-Iw$9;YKmdukW)KD8$^Jze;$e8Ky8<#F1>L$n8rtva5 z4f_|=5mwL@v%QLz5$6TK?}xRP;fd5Mh`hMp2x)%aB$bv+dEyw5x#ea$b&!7UvcRVt zm=bVS2NS5$!(wZzlDbPPFEvk=RG9||sp`V76V^USU6g33cXaVegC$I0R3%ZEY4EnR z_q$JUZ4eef_HR2kXDxNux6*}2w+xpvs{tOv}RTVuc$zl?8SLn78W7VGiM|?c9`MOUEob{{zRIoYpfd zq-bVZ&c`a?{-(Z9k{o6Skqg5d+@W)zNL>BZrB-%Mo3p!hEvx1`UnWeVNW6SlyNnJ* zkmJk?K2Ts;%YP6{Y5Oa+%+Vq2$L ztFSJVd~Drg#HETjsp4aU(#bcnf4z{rdW{D+c18kbKA zX0nDjzSp`LEkAYy-RW&#IbNL&B*e)T9OX*(W^ycYGGo=wS-0!LMm|CR+;UgHX1h-h z^WPsnsmrk-+Sx@uRf(92GhhSg8awBzpyupRyl7rSZ}0_TYbW+UzcA1x*0*NthMT^C zT3f)vIdnd!{0Mi2VV{%m)%z$hV_jaYs=9?FW(LvNA7iUz`aSUGm%Ov`?__K&gc=&I z%EaU2@DhapJq@3ehNQ+wI{|%dey*8-#k0yF#*irF;~!0NThdvJ!`NBxK171qZ}fP@Gb+-BwVtDFLNy$&W#K=)&x#X=0}j9=0}n_WE|NtnFx*j%uV8)4aDPjBl6T? z9%(}7J+7149*Ume`;#mF_;1-wS zJ#H3u?R*2%*2j0m`Jpn@2?B8i3S#79_(S2nJ@ve(d98?%C84FI;`@9467)x~5^iyf zAc2owB(6TH&x|k56l_yl7i{5`A3+4t1mjY4t}~3Nl<@;M@R#^^#fJu{`aes4^hg~M z3F!&_pw)<}Qy9j0KR z(r}A>T|J+^Qvia>yP%w~GmzM~8cw>Ah5v)EzYL1&i^9B72n2##AUK4^HMqNb z(8k@}-Q61}xVtp&?(P=c-5mn_-=2AA?#!*abw6~S>hs}Lbyc6e_gd?Devh|uKb{Kg z!OHkv17ee2+~_|(R85IcK~FvL!65A4Uzmu43KA)(EMPLiI5~PFP0ZZQ5Z zk7bH-E*^h=yEKf()yw8F3e|1A&4@fwH3;YC=ci?7b2?NvQSP6Ysv$Q)dMCZ{Yh*!L zouO)(FFlA^(hge_#+yb{Xsrh1CpXc_VmQpX8Be(D_m^+=Ht(?>eLW5Xu-oyGWTh(m zdaE$5d|`L}Cq(ZFxcXFUG@15Fh*q~4ikD!S3WtyDaL$Nby4=x#B1io@rI)d_`Q@wA zk;IN0Zq<|~BM!>=kb+m#MM#YUFf4{Lj;6Yw3-7lRL5;lA_$isZrGrUqz@SxW82qeJ zH2 zVDuUPnR?o&?=%#YgL81M5bu1~Ij>!qfTWs(@cvqj7WqoCR2Y9LY=awwm?)@{sG8MC zTU9l6q!7`7vwGnt@y*NE`e;*Jj#gPYe1-?&4~YW~=Tm*gmoT=Lr5dAY zA?GaM@U2lyIplA#g{$f?s8+BX%hcOT&PnU!tO9R`{!CT~2j%=tFqR*N6;pwEz8j^I zq=moQs2`8a`H+MAp&H4USKzJ#w1QjVSQR0P1H*r=MK$_wH?s}aUuG@YRWehpQkQV6 zi2QR_QFyGR40#=)?_6ip0DkdJm5j}!*ND=TG|L3|n4|rb5F4V-boG;b?aP_T 
zWF1uOJbaCVg8M14$BPxEzkT6+c+tC$lC_Y-KdGxQ@0i=4aum8sXAYPJ%BEz4SXKzW zNvPXZ6_$2s$Asf5DXrVwx7(sDW~3iEQl8w#rJ$+XqCnZiNCmoO7vbN^hE2}oUPDBi zD?bHUTgc024)v%UC-3~b{jC1@o@Q0eCQIQQEg9<}*ZSo?AaTPAb& z^_+fDpX+*Lu~`j?rz=&QPNduTDw%tiT1ho%k2XjQ)J?OVUu)5J>#3H}dcll>|FHRb z|DL_#JC}S_+!B3;?C&ll_?qVq731o7Bgi4oPvG>C!}+|$yc&>I980$RR;gZ%({GVX zG#y8Uc+~v5)4&XyaE=;aIRLIOa&@FJ&ewWU8CbX0M8AHiL|(t2H|$#}K=FE*WAPXI zfH=Jpg|H%gk`GN`uOt#FZ;b^_E~CaI((1yu`LV$gi7kB$l3n#sH5zQvIo7}*YeP>j zBqF7>2%|bF-zqxI%}td7Ah)y&v7W&QoB79YsR5^$jrbo_C-tf^De^%Mtwz$bU47za zAOA^m;u1#jr4gn%a~i4vJfE zI5{T}-`&jRXM%`Ao|HFCMxv~^S(rhEOjeG7fnSrItUt@|k6+H}bIb$k3T|5X73%nN zk!e)b-nqe5v8fR8G&bCFbP}( zU%HIjkR603j|4^xTJx24M;s-KH>Q70IQzXoQb&P&6}FP1#2nT&(D)|`f9;hz$F zMsNMmEvJgDutY)ebtVGCZFyi)mY^>!xL~u}qacISFxJ){`FPM3Rx z#vN(9zP}aTe*T=uLBcs*m#m;0Hj-SOhR5T6x|XTOIdKwbEZLL`?RzTFwjCnU$%BdS zn(|6qaqd+);|9TH9>#ZO*i6PXGo9Q6=f=gFPx_i`LDC+zA~#h8(@(3O&m<(rZY#PYUxY zqAB-qgonIeY-+w=999ZQ;^nB7WyB3Fkb{$`1YRrpCVk)L$3@ES?q{miVN+(j9+v3n z0D~z&DmSaYka83YNm=r}2@*MG+3$>Oz63EcM?VaSdRupVdX?UT8;yD+p*~MO)$TUL zmQKM`uepUcRPSA{n99UqsR`YuZ@?*%eAb;FT%ML_S9KzRS^pHLq4Xo$&fqw@qT674 zKgy+VBzHGU6Oo8eXWW!MssrZ=Xk#R?>YwxwwHoBex;)LSR%0S-@jlWEQ%M|DHR%QA z;FvMm9c#DGVSgWXkh3%^>_pe0Nsa+PIdu0UZW8?+e#!i*6uZhjdaB&~CyZT6V-RX& zBFMMAuF5TBgs+VY)_I!WQN(RE?YpXWYF(3*n1nPpgxpb6l(UB*q^dj*eUHwga+Kpq z1ve=%3VZ<-n{+EhY{8&)_BtpQQ`TH5%EI2`C8)1S3}03OW0bY|)LT?Uohh%NM5*|4 z8p$anUiM9458{Pk-)=J!y*&=PN(%(c!q@xIs%9Xa58|Scz6D1;WShXv zU!}CEM+U;@6F(mHAZkfnf^CnL6R%MRYNx#P>i&ZwoydJVf(T2qYb>SLAcnVFw15kC zYOS*&u^-S*Uy$iY9QCNYjonR#30gBa2%VVqHA*+JQz|UH%)lgR-iSSgE`VDwM-EM? znJ($M9ldR7h? 
zsSMBNv?Rl_O&DMnIA(j#acl-*F5jaRt=l>7$ zChP*?tLJsBk0o3vjOH3o*M871H0A@>x5X{CTOR?EMrNVj&58It=a#K*i_VVLo;L4} zQYHWduX|ydZe2;CUhxdv^Let*^k&+a_q&;>j0BrmM~6WaCbEPXLmlxy`*!x(D>Zq7 zDqE>)zJY5?zc*`<5NKQfaT+IlP2ji>9%;SQbZ=4Gu#v_x{>euS0i5c9=&ZmK9eKn} zDK5Lso>{NPpz>?ISZ;>7gGJkv2;>md-i2N0u({=US#(9Zmk)ktdF9dX7m#T|%<^S= z5s2?)s5sq+2BlZ-74yn%WD|aIWo*94IK_Sd`$=-p(3dI%>=PUsDk_C*?(jr(zMtN$ zLrFJTi6nDWi}4hJu>y9Kr!Qihd(68sU8BK3B6Jxtv2f9nX!6{hn9wzsPOrYUHPJLm zWD$i^$=L)sZU8UBduL+2{D(kDqE4D*OPA=9zBVNERli^YPd*&ixxj-rG1|l_Xa7MV zbdy@^BX`QGH0S)BGt)F|6`ITsv{-L%=D|4CHG~BY%P_JyrXHU-*=$l>BdfV8F4sST)4ogQ|B^j zo;-l_dKrvZM)!_yV<>ScRy3@`w2iDotjwn^)m#afcJSpNbfbQfo!QY&wl?NjDj7OZ zLQ|@isKmn0s8DaYRza(id9TH?I6F+GJza_bMiz;RR>PbWb;o0DihbJzHC;OPKIJ|xk)qY~ zx|7MKmk0d0>Z5iTI&4gq3GoMflPfhv-53q8Zqjz+jM0oW9!nUE#lvG&o4Q43vwuRw z2Bh+xtL;jZn;>KP37LR9KUA$pR#Td>`UGIY+(TvxVhYk$A&gjm$fSaffGan&-^+kYU|qzS~ugm z9}6L=QgEAL+Akvek5#J&3=tuQQtxLUjW+c``vKxE3qVU^Q;VYP7|$Zh5`(!EM90<# zcG&d5m9b9&grZJ2?_rEYEGocO`Z1dmF$TK9g= znO=6as{%I|N2n}1=s2r~J-SAWX%;kcnBOc~efUF|zVqK^Aa3mHl2SV5b_$I46pAq@ z=bWO2LnAP*3M8HGVF*(KZ*qJv?7c#*Pwp!o(Wz4gRoPzedbiikN~HW}+ka47?3b1I zV!LUUeb*VcviZ@GlK&(+YI#$XG|}Vbsc}mV=92Wvr_8Mp6|G#Hq+zP+nAMfP0h%P2 zlkwB(3U*#yC2Ksi_sHueh{9S$hd1a~p~|1Z#?qJILQPv;6VXxAQ+<+V+30?J=Ym+#u{9oZ>1C+tx^lS57B5s3DngnFH6ya?o^Z zJ{AmITS7&&Dfbgm6+9s8-$GHnq#@(}h~1LKaF3P;E^bBgo0t{;W&)X^1I>6`k7qqU zv&z@+t=O4K-a+cdv@*shc?r{yMxbLy?Z$v5e;18@UGbHr<(%hRXs2I2(>*vZ4Fg2) zLWp?Fgw*#GYXu#Le#L%SWCE)83CEC^X*6E_kPJMf2+i77S87RfJxF+S231=UH%+dM z!$UMbg7=etVeQVrQyv-_b}TIkkGS~V^O^P?@5A%>?%0Rh=-n*uoGU zUZwh_olonWB=_iGJF@Mh?OgpBL_SGc0nLzvSL0n0bQF&J(N=P8IY0X+jpzTM@Lzy? 
z|3STfmz8g?OOtrfEaIEisnk8AF%u} z^WNmPhP%bTarID<)LiBWGz&|@Y|gh)ROA-?1Flbo%0un%=3L;LxunxbWpRcmIKtVV zM1sDC{hfi?cV~=ZPcR42CArpMbss+Fs0voc4nVE~ z@fyiYNAxiF3N+1{nU9c9Uc?Qi_yhqh*1Vw4L0XIklJPnyWqoef04}_uR!PQK^c|9n zM5AFiSi&c~Vc#-1l{k275D!A48eI^qf%D^~=48+^&2tSB(Fhgk@#^wWWNZO2=U2TK zD)XgWd&cr<_xM=w3O_5z3wbiypZoGak@fl(SM4uoymWy)UXmVbSZ1C2i-@Ih+Bki^ zn8LyxhXb>wBE|Fw1IwOhQ-Uy|AO8uA1fCuKl_pYJ0Cb}U2-6@Mbntit3oZQzbwB{= z#pxQ)escLMqCJ8laiHZgRjim<$S@Gl%PZ;P# z7%;70SpBXU;Xd}GUVh1%l>>qI=wO65{%y5??+-WRXD#E`<{fkMTk6lIZAqNNA`li< z+Bp7z%FK+{8*nEQMrd&iZ>Zn#{4V3QHvdEd%~4<82Q~VWJOlLMQ~DWX-SoF`yrQZa zuom0i4)5PSF`kpPQsqN0OjLZ}D`l9ZC(ANH?h!j~+zQrs(eARn79|>FIrzjrI2!v< z*9Xk*diz^S&V5Q&LCkx%$7hfx?f-8Nx{iO;ySC--x+ETo%b8E(j91g1KIUH!r^zXY z^&-PN(zD@iqa|0mYNIFFe*^83e9&`}d0-XBiI2vvV#V-#KDl)c)gN_5A6Fky_;LB3 z1U}9bELD^ghnfFvMo>1 zshz-?^-#TAS}KelLfE_okgGK1!(X*C(_)}5qMCAIeHFd>bvl8EihOtg z`_`_mXac#$l+@PkrxU`tj_XvO(;SU~(7947Y4UZJ+Ny}8AOMVQWNl4Uam$vudZz$| zTcZLbpo^CZlv?vftW69Ggh%F`!w*wFRD_52&T=m$7qEa>h{I_|Ud3p8AC?$q7tl(&_&Ug@HV zXMf%@apkiY>x0l^QE@LRx)YiVbJVNJOk2BGKIB|%@V_BX{{IKqc`q?loQNIvo!&iZ zq{(K6tqM8?*#=(GPy^q0YAUlKA2>=%y|Xmj-R)}L5kqg>W*`OrGa?~VDY+5ZXd z04%p;!BSFPN84SD!ySd3dGPlz*=Y`|imZbc4NOcK z>TJA-_&J8z1>Ay z@CJMv!c$0>a*>@^KnzGWS5 z(Ie`y^*CQTy}_!$m;lGsH}!|6M9Gw>XG0oK!odzLO+r_Z)#4cX=u{YoYalYQ_T+)@ z5xKWfUQ3LP1HzV9lfSm%#~Oy7s0PQ!6WaJCxe6!~U&sW-L@ zjzn-~b8A2OHYtJtEb%=bq^^;9!B=ZcpOeJmv6sw$%78UR4q?d&aZZqJT$vOgx*e|0 zYZV#ZLp?d>B3^o6jNBvQ)D4g|r%fH!;(MQX0gf26yv}xP0#0iGtS+msuJskdW#JMh zHyzPztGaP~iK6|HW;yheTwaYl%{X*KC91A9Fw3k${vQ3JLyDg@!xDw)I55etoG#%R zvnyaf>TWxH4wiNDbs_-1%gZo>@p~;?_YeN)RGQTf|B(H$v|%Qfb@Qw<{N>#Im&CJY zwYIqBIWh+FdP3SzJo3*k-KH-o|GeIR-h^QNeGxIgB`(JODJRvx$i)ZoF8+N(T>B_< z*%Z~gk&L)-F__o&B&7G;P(`lAJ-%Am$(l>f&`5WfYg300`gWeN!cx#iGc!6tGi0rf zu)S^mcN@dsPCuqwuq92d*;^O2Z)`1<_WYO1tY^owqyEn`;PYG8!nReKik^<_#28nWvH&sY(bAOh{bFfxMOr`#iK5?(ej{qb&9#y~>N%F@8pWLI z7r6wxzrA){kQ|KjT#Z~!%3Cug!d1t(5aJNveyviX^|j#NesF4`Dvm#m(tT% zt!U3JnCAN{A2?Sr(E67`VKP#0>St| 
z;elKnyTdwReFvM(d;O@DeH6=EKKS^Jc;eft^|58n_vGaRX5K_^UzQPdr4FhRAfuqH zqaGCq)enw1e18?RzxYUw627*r<aYYg7{c|lPNk>}qXnoH&Y*>AZ>s*+->W)ecusLV=+=Z=Kl)h(@Y3F?MLzu#&iO_6bG%vAyh=EqguuwEU1mVY8$(wE~f9+rtI3W&dV>giG zw?6sMLb4ts*Y~3w6a;j83xUwmv8naezQrn%|SM`Pvbyx>$p2Ikq(N`iN z7zjZAM%|2cwLgdc+HhGMAP}JL(6xykBCrF8w%lY&a&je#vz~~`tmz<*9JpmTdIH@? z*}l(PIFq=TPhW3tFigvJMXH%%SO5K4Z>jie(g<^oZ2=3{G2fdi>%h0xd#taDg0U|8 z`9X0-Z!YulN1}-230K2XGjQFax3)>z9$}x%1Z1AqfpN^2`r3_FiPZD}h9K@?>tlUl z*(X~&*JK13tk_caF`biWz94HwhSdFvV*)S21@$HSOy`EmeJI2eW%9O}OlUP_T%w(W z4?5Z>nyuZE^}|8Mi6D4^@N=DFbWwYT&OVb*v~%FdpRTc5Z7{UkjQ6&9gXt#xMcw1TT z0fEnT8yu{V8f(m~&AVjNq8hDd_hMpgWK}h-6Qoj*_B~}USK+`@*#jEk48K z+&`b#*(ySH2enXyJ2?<8kjLsc!$dam6%2++dCI1PI*ZUnWsV~-@%j)QZoMTqQG#Yg zPj4~N=jeVt+B!bN3A!I(^jtl7P{;ukq;WVn7PIsQjW>! z=z<-|<3e5yn}a;cAE_cd?OnrV&B$?%!Np3^Dho{!KCSkonq+*W1~pSB;xa*l{r&IV zd(3sMqlK$uoG$EI?1nSao1WP7S9oIy6sGjQDoSCn*&7o8wAfp{N!f#ra;**j=;@wR zCryIF@p}2oZ~nq{sm1&<45Z@!{YLE~_%Gzsc2fq2w-W<5l(>DS6S)DNlu#y@v@=vwHN9P=^6nP=EW)AN*BNsK(CZ zJ)ZzVv6ebNiAqP~Jex;N(+_p7Lr%j9KkWO|O1G7?6;9afxb=9f?7Rn(P_4F(_MYUf z#n_*M%*f8FOc)$jdYsUDKL`ydLi&|~9ZBvF+9b9+N2bU#tb;OZBDC8arsdS`j0o7g zI1paI!J7&u^8(wlIuNOKCuZev?61ZrC|PBd;Fz237M*KsNNfZ8bA?dF2%ejp@0l$U zsKlnXU;-H`|8hRpMi(JZ@;Vec;kPE^*8eGQEB-Tz=yR{b!BL?|<;uge+L9JC>0(lu z=x@q@P~DQ}3L4o+v?HA`=Yg+_^|#IeeotD7Hl0P`KNW>|fRFs#9TW$kdoX63)y&eT zsipcu;+KkaE*&TJa!KTbKxe~YqTzm2Z zmrZ6ne3)9KPhF7WLR6F8UHP|t_qK8}e||r^==o%3@>|g*BE;u*%2eJEO>GB@KMq|B z@78_SaE!$eiQG!N*tAgsiw3&y`G9?z>A}RwQHRqjqcuiEQd3m2lDIb&05a1H{GUoR z>T_usuA(RCK`*rl*?*e?;;nTl1<5Xe^+opz?ee~1=BSas+k4^?3XCUj@cp^ywoPm} zZ#nDUDbVibhw*Rs3#WQ{iR1w+gu$a^-M>%;QnoBe(61ZW|kYO#6m^G{q3E{Y6s;y zF6Cl^0TQXhiM8j@R#fZCUj-@RVc%(Qfx(@kJF!}}KL(ebG&jU9Dm&51q4u3}LWq~9 zZQ0+zw<80o5P?9WbU|0F{8R<&-%%(Of;g{d_*U=)-cHW^k+y8gUwX#5_BWO+{8yc0 z%E}E_!nsxKGlQ9c71QYKq>a?+=lJ~ z8y~x;Gw2_+xPNlRCy@QfI^p<$l|FbzJ(!IKsL-Cm&mRl_L1|r%T05D2HzhgJj|w1ecS`6YIy|{V_i7mzH?$#H&XmbU^!+uWzu9W5`88puS8sGo#XG5JKf#NmniO&Su}8D<<@q7Omz>g z0;%XT~%CHKG&+;DD&KDJI1?vJ|ZmFigXOeYo 
zK7Ou2UtsPXwYLEKnqB<%rRzoq_o`Sl=3I#>zJExSjX#HWCA=cgrT zgJt#{!WR(?^M8w}Q1AjTS7QQn_Sr@|YyW*mSL7bS zW-Y|-k4I~rf-)COvy0`)BQ_+c<@vCxYnDy3)ZnT3SeV6Md3Ce3^R?Ow;HDVpvSMmJ8%#9(3__(%_poCk4H`&15 z$GzH#h`^xJCouSr7dbe`1FrTeKZk2Gor4!0S~1hjYq_K5j>V}kyXf{!x!dk%46Mcz z#|2E%5R{i>|9RjOchxBGo=x*^aFSkhB?R8evH#iDOW1GLWrVj25YR z2u~no!O{!E6S<{AWIuF9YkQ&5+M3H}`S$`wR*r+!#ub^W8y-JG&A5hN)$6;P`IN`( zYPLa3+-e_wsGRYaqKM&ytT?Fa-<4mGzM#z)rpFJS2$RSCa@4I;U%lCQZA#rGN8j|h z@>9BwqqWu**%#^jk>UO7L3*JV18>BC&a8{SC50zTOTB}fQDXZ9?{G0-kGSornE(A) zUV*QE6=cr?K`mVT%awR>AMrA4iy+~6#{Cxynew>nHs}+GdVa>$uOnb8Z_~gefDZN2 zc5@@rGJGP0AFI~yR7WSNB|MC=~)~N zd23q@@@9@)V{|cSd0@8gC|Mer>8fJi$`eTs|Hx#*+dz%C=5X^0?av#Z9ilj1{6~k? z75MQP+9raIT)`O^0%Io91%;!~pqs%15}Az^Ho7i`pbf(W(D=lB?VrgC9P-Va?xoB5 zvf>K9v+}KZR=b3~$isxr8P(I=@rJ`h!Bnm}Nxc1PO9cvCsH_}R+Dc)3j{O0QIN$ST zs_zLBRGJtGSur_qtW#?7f21QW1==d-wW}~ZqlEEB;JMxwzZ$oLo&(_Zp!N2*+U0bk zseZ>+zc&iqL5I>qBfYK!c-&EdG-9Ke?~N(`l&Q_^>YaXVl{&cRF$WcUJP#3|u~)?? zsB%EuqeLFVbogbhIY5SF z!}v%{1g(=|&<}41Xgrv$ zre~N160PZKcCJ=gqf^xs?uvLfi;s*OaOA`G4~x{ds}#~ighlflwT|-VnxGzTe{)6r z7tla^gJnNu>)f?z94AVa#AOz?Kxxl>kAzwawY^ftOO_wP?wNL>pxoF!mofGBWi>8D zBkhn=1~Fqo|2Oxn7?w+7Qiox&#hT;-6%uv8M3*xaH?Z72H`g?ny2T55CA>X*O-TvV^S@7 z?QeBC4QNLxWA1$foV49ER_J=wc=G=&G+k*YYx6|fn%%=clkj(^ zuqcp)iE<-e2uQuWIvPg(x=dF0Z#C6i9kh$ zk-Ab9B4Ksnyp>X!WP?P>1nw5MnT!(cKVY*|Qq@SEN|$5yQv2%jIVnoZPs7=d>tnh0 zWY2oW`Qu5$!1atk1x^IGxwY@(8%;b^QZFxjnq_%818<2~kxyQ6#{dzoVo9?njK?tM)& z+;T=uRew)gSAz3%<5ylN%ZxeSnAUL$>$kb0VDky^r?WW0yK{847^n-EfxNPXT>!k^J? 
zYDmP2C=ugLWL8l7*Z5X%?bxK}Xct$9Fki~6EVvt^)7J8>cZkk`N%wOAb2 z@Y1oHe=*a#8lkX*nk;fCRT(?3kGzcQ_q_)mE2Utq!b8s3LJCN@>7wg{MAuFdJy)-A(z`w#vw$;USvW8lI}^pYO}KV4(wo z|9f~U!!xfWgrA`R_Ak)!E$EjIc~p7UKBL=sXWzFba@=ojhePURadLr5*<~ujkwG|qUQjg6DzMnQ+81iOu2&gPM;Wgb9#Nl~!n2^-e(v z(uW+b@OKS)8=Y|^{YC2VRNLd?o~iapWra}=)n+eTeJ(dPROS?$>K2aOE8rEhSJ{%FGClIBuP^ z5_3!fc^%diixC8e&FDE<^XyroUplJlnG!MQJYLNWP!BH)0I_1THJv$ljx^1fgH zamRnsEav$otxBW6rRty1-ER>`r5Y+QReqqYwpJ*I0o5Hh$yZV`_@MXiN>c@|c+o~z zOmCi*KF4lz68y7l+SK@898~{<;+ecnDbRT7P;Ufm8+iZyoRRm~-`bM`pzk*iXsTRV-4G%8(T zdZ<4GhpW%vLLkLUXjN6|`E8Nww^~#xeEe5A!tq`S4&xGUb=zzjIt#Ic?>^j6YIP*U ziw)$VLTT_@*$Cq)pGRNKDKGPj02^He5HT{Aww~Z&1?1PgMYv$!OT0P=o(Sk4V#)^Hp+c^;MbyZO|A72#@7H`KC7KUg~&Tv|^P2D2C?!QPrd`XKRew*u zrW9}RBVG>Cr+N1>?}L2MrUPXi%bnzCm1LXuI6%V6cx2z)YW^Y|Ae&>u1_&WfBbI++ zACo9nt6>#h=CE(T`9|hR*7&fR-7zYQbUerXSZMOX`Tz9PkGD~AmQs(-Xb8OPgTiAw zy?}XE|3TfMK!P7@Z1r$hp554rZ<$m*MSSNTO4ZC;!(dV9z7itms_52$3Wvz0T^(Ss zMv20gJHf`3ZS5qQQ9}#G-d%E=bW*9Wohu}5M84-uEsZVp5k+(X3X0VhWMn0azw_K+ zu=SL0B@|fWtm|XRTLi3sM5Uk7Lf#%*hi2$CJkW(t8M-dKDwM9$pXLIT9Z zczfZ$0s6Pt=lNtYw9QO%e({YS^a`pSC+Jg-&`4?Rki9C$kNA@l@1XWKgf57Btl(RZ zA;AAkPSCUCq%4l?OpSS}&!HSL&{5TA<0*Y}6L^A!3^DH^k(~lG@^lau>Ps}SRsiO% z>LbZEA85Gi3ZbAalqxWIgcP}|lr(q)IJ|eG=CE*3)F z#=Qdn_hz9a|5@HOBr7#uRaHG3EL2=~oHT?NU$GODQC^qGp{^72-ny5|X8U!5>IQ4% zMa9nC!We$S41N|^RC8&(f6Ee;DdC}RyZg#Tz`}#?XQo4*>E(~=U(I6T9f&1zL>7{^ z&)fBt#X5Yuby$c<8W*lz`O2})O4Qf*>$|PJL*TvvbFS7s3Hl4eC1}lNu~W#T%>EC_ z8Qs#_9*vruIefy56F+TKr^={UJvLHS7$3%^E}QjY@U!NW7DmFo>ebJ7jqRTrJ2zHk zO)ZwV;(*c81~)+v`(+N`W+{1||0o$MVTBwuBW8Y((S)A1I9`QP6niI3r z{ddGT2gNab+#fK@OQCV!|bYe(cL%F%=a_9YppXxTEAwAval^y ziM@lxC8%?dTi|~G{ssGVEhnocH7=PjMt&GF|%Mn=H;@K;?oDiXpuWkpe`b%;m&`eo-gi4$tS2*LrAMIUmk!g_AnTMPl zj+rnL&l_W{9YSk((Xgn6p~%>Zyu+8&fzCN!Dfaq)hh%M0)ROo;0UFZ;O5}+|1{HBN zY`b-e32t@3ykd2IJulMsNMxsFD&`SFf|BLL2KC2ubWx}Uo@HbqZiZL>1(+JHvb>ot&O(jC}VsR=X~q=1n%15E6BlI9G%kbaZMYHlD~=kiCzEY1X7WQlro8hZ zFSmv&=hsv`Jb@dPsEj`l=VIWA@TV%n9M-z%v9q!LulRJr+JtC`iKn6*LHSEosZCby 
zaLst4up_IOLp8wEG3#E;1T_y5J)<_`d{5KQNw+y>x5u@*Q$7PA%OFcG*jWEcUur>B zf~H5iF-M+26inH08PePq8%RAhviMl3aHI_C))!9*WJMooC87nu8X^`{{hUV7Yod>t z5)!STVin*Tn~}lA5nljPMErO-k8fLoKPIq?(EoQ}z<*fpICJgnkxg zGuTcWXuOA8m4Us-xq4nea5CLq{+Z7;I`|23ODXMOs^_OvA0AN!WxWJ);UT%P!@<(f z@SKWR==ztQ!kb6@63~vPQtYJ50vYZ$TOx5dRrx!e$hH(93o7_U61&*mq;^yzQ5N+h zRui+#m_(UQTHRJ->bG=?yY!CkAn(?@?QmRd~<8r>rZP0_h*uy|A!oUN%bU-wOOm*NI7zKo*6 zZxJq0#%qM(#kT)GJNY*J-+bA*|LdESx z6j^!QC?0c;ysG__i|m}n?T3GFOS@6iW+W%hM$0(Mc+YYxChMiX@_@^Ip7IxJ0ymJ% z?&Ed%trZi+Ry+AsU zA$aHdf_`(&+qO4qodcxn8`mR?Mx6pQ1*Iu!Q5U%zGvzxO6*;cq_LVJL={LpRX%~-z zDYvLw8(9bmeeae11Hft@^sJ#=fT+24w+3V6HBPv{{iJ`uGN{ zas$4w-Q$gK5ujVLz5Ee|^OQG!#g9ak?+;MP+&%j}EUy}yl+?0iKk$gd1S9|3XG&a` zL2IGzS|1f$*mHvu&gFF16x!;>ze)=GnXx z!08h>&NwTA!l>`D9bnW_0-1o0ko##h+7o5Q#;!}12EU0!^rnMUi6r;u3u)L!jzAMo z4^-?xmHQM&(`W4!-xZq8kVbi#2Gdlnx9$@M zr>86;&sTY??)PXpX>;)GRV5>~@qZO7o~Q#eG|^i^L6Xk3Am?qhPuGXx0sUVV_qzJ$ z&1+_X))bHDJn?lkhcV{YQl4vti#xy;rikkYU+KnNY;E?G@n#s;$dv|wJ3J(qdWGwi z@ETioU7W_+_+bmBS6#y@#gxRg$+#APt#uyl$@?DrUNPsAAexViCPG`h{q8;SKPr;l zq6N!b;Yh{5V;(1F1ByQnj$dL`j1ux4 zio3$KK4jrN=B3`#i2QATN{MV!!{x!^W#AA7w))bT2sHqDzBy+#FT=MGeVpD&T!@lq z{zesoA^)&0IuaGN<@LgvuaGuJzQZGN<^i>%vI3!{Az=H>9c4O+p=QF`omp5V8BqO} zxXDIG#I)N#EIBNxqTvp=<7ZFqXn*&}wK|qI$Rlsn7C|@oQ)0)YaTKz_G!&>*q5*i^7PGtC09mhH;g%U)YGhV?zk{ z3WNj}js`mlM*w-pT^gyo>F>z_p@77J`MjXti8vP7&&MeztU@5*i!WU#5OJ;oY9R0U z9xS?8;CWFJM*qWw)VxIA(GOaP!Q5UQ`-I%A(?;~UitXacQ!wIztML+QQ^tK77c20t z{DDMf=i&Ot84uqX7Ryi{t#%o|5ZLIp9&|w!^YH(W^_FdIu;H>cR$Ph|cPJ9vEoh4t zC@w{cySrPFQXse$cMtCFQarc^cXxSq*4pp;Y5#-dLvq}6&CEIMfP?hia5NHK#E(vB zXNK9jsKRXWM(UVK-edZ+Vu>?;^2H~(7HFF0I}c_zdV7EOkrJw2fnk2Bwxz(Vk=Dj_ z%+{A)ka(Cca`4XEC9huh^S_|zg3^nOu`UGtwP~4;RaNOnkJ@T2$Ebvoq;k|qUt4y| z3^{=l^2TK=MlzTx3BDQvpLI!~a!cix@=va}eEm!c!aNEI`fZrFb-|yVdJgDjvvAeU zud>F3#EopKZ0&TiVogVBlDq77Hp*AW*;3Pxi8~@l&NSl3(nMta2~J4N<9&ByJ39d) zrw=L47m?*5oL(e+V7{ipxDm}o*QQfD3bDp;%QQNvCyR4m*wu%#j;CxrB}L1NbJI6L zjxg1m8!!k!I7a4K>{gKGUATmM`E9xi+c=P=Nz$WYIzx&jx1>^0SeF8*C%AU)x{lLd 
zR}6#^nC|k~^N{<>^<|hj>+^o;TQ#D_=4lrnYpgnB#0_?l#$O~jQ2k2brBA3Zd1NJN zelW@!*Bc2%WN2@0^mpGX!hQUq6dj0!UUCfByZ!ndA=;!X@6u{G{Ivc*I9Lk0%C9<5 z%*Hvu^6-@*RLAB<^*9!0o`6%tF8}|n6XpaP6?q+&(ox15!tHn7$LN79 zf&N*-ll?W*1R&Aei2<>l=!wF#;8n|*j+dc|;lMk~6|gCGRvH&yA6Z0Q)Di;gFS3iO zR&@4F&|fn-kCngWnggjyzGHaBc`rJoA z+W~2!e^aj0CE!|zcJ!{f;Y9c8@#7oo<7|710+TEo9jvn1Gloo>e~pu2^FT~z>#I@? zZoY&WC5&cI+^7AFBpwb-rCaQ@1g8^o?|yX61nK2wR5cX_t;V^CcvP+8Vx}nJ*RZvt zh?-u3#hiCNtmd_%H-5oyC$LrY2PFTqCDjnlfOicVebAM`a(6_iFeVmwDWT14jnPNQ zuTK6Ul5ab=4`SYaj{U=*MaNn`JK(KNmwOAXL6;n>yPM?hSe#C6%64xX5XpApmb)GF z_KbWL5;Z#Vh&n9|Fz+3qQnB@Hb3G?)ExvmGtAU5m7XV$EU-g4&n36C zb))2-LYVNG*v-B>vfO5BYGMNB#Z>BEKZ8}5Nc~h;z@Wh0Gx$m;#~;>+5li*LtH@WN>3ns8h2|$iFQ;V#+oJd4kY9pJXY7 z-Ewb^wf4149iW|_4N-y}XO*5THR#yDmYvzwba%9$vHN5DA2?Xs@%J$M4{hc~`HJ&H zgi@?_wO7fNC6FG(8fE%cFl17`#+!TEb=-`2eo=YA&wHLPHPGL6{PAV;(jcWc<1XJQ z>?7;gciFhdJ78e4{|hcMFWpOi7j|uXmZ)Umpd&{rXIrEL`KOck9G{n_r9=e$MXL$F zMcno=oRK%O;1aI|kVQAG4{RRD>}4AE%Ru~Z?k)5Pme95amPvqTaHa_C!g|1;A5$+Q z`o;0szNOAQktV4oHMc3FPFW1A8aB$XYzs-=d>LvahJaW>BRQ0Qf>#{Ma$3J=IJAY$*X95p`R^2RgMSwZ?JIBrVL&R&G(yGmsR zK;Cs#uErbIY=QA)1Z2F0j!{ah$2N`uX#??@A%LL@`QD}H48bvXqV`sc{Ip;){2sWQ zBei+oA}o8_+8!3~%7w#~iU%^;DWBwPsoAue)7g)qn|x{;-nvWrru10tl8ARzfN`to zkr8E?P`ab{-?0gxaS9>>U#MaAv2d{Fj6GlKXzP+6-YImgRc!Qi& z3X^j%BxAU%Id#e+Sz#9e#c+Ox9#_VkR$}n?nyiG_R9 z>oa4MWU0~u)Fh@DSBjZ>Zq&Z?G7O37IqQW^H;j^FME>;xzRXvc3u60Z0DdAcX1ap%sXlt z+dFJ#YN>i84z9{}@phm7Tqg*LD}c<@ECjXT+^H8vb=TA6n{H|F%7;m~0JX#>UP4v) zn1whjd}yz-f(Z4WP!qlvu$Eke3k?aC?PQ>t0>~&z_7EwvlM3&qt+CoPRbe*JL6kGg z5FP3VuHUHSiKjM6_xBgrnuUx@>d@a%(1XKv;0i*6ij>1e8MkCJdwbkL9)*oZd&+!A zE8Qp^bz<;@L^GGMF^_kmfSpDvlXXx5<%&O%-6gW1vu7ezMq7%!vRI_0?ZRt%(UgWFj4+GHwy11Rv>Q{E|B7y4Cp=t_c z)+hI}#uG}qON^xV>FELnoljn_u-um>y(2*kLAfeqt86RONHJnas(j|yo3Knv!WW@LQx8en@*qY*ZZ`WfUwK?}g?a z5VA1-W(G!h|33Pjm?*dAR)k2GA!FhW=6~!oWxMF_)D!XgSNCMQS03GTb}aY`n7w~? 
zRU3%f0x9edZtoaImyTOhdhVqyZ(V6zeb;vO6vMjIK3%^e8*)L2mo|3{`c^L}IQ6Ym zXwVgIP&>$ykGe+>vIetfPnwLRO^ zFiuM*(e)a1_;Dg|>6Czs-p%BY{xE1JXPN|;zBL~frpCD}1a7Yg%QNJ!LV8nu@-Rx@ z;eM`e!Y+u00TLck|ASkxis3b8R1+XPlql4Z>D!r$lK4$vqHh)$!vauPjRX)s3@VXw z7JN|1{!L=Sm0YWtdeOebEK|;ax45`4!=>+vIHz z4YTj&)5y4^ZVOe85@_vg7qLIvR=XB)?oquvpm)}tFC%qwNvxA;D4i@!vaV>d9_7_q zIODA({BD-G(bPe+)s6cqlSN-6&-+HG+0Jv#e76&wRh^ac9~x{43EhLRFDh>h^AL zwSbMVNHcz}$TrE7=I~s%^t0BQa8`Cj@u<5swcFD29l4M8^z~Qr5z}PN`Ec|aB;0c> zFREyrgU%Hi(&-R|0S(9w2fa!ZgCCuvLVs~-_ZG-=^ zjp39S$5$^%!?xrJ3PW8e3yvl%4GhQMntd%xr4~Fdm5-e58F9$}#!6C+Lw?4h@?HpA z>*lICM`voEEujr*m@>5&2jGCB3k+2g>c3V%`zVec!arzfw{U$#*13wxBC@e_knQG8 z>17Dy=SfDPO7_eQ{V7q3?BWXv&=1)iMZ|QA`{;c;2>8)KL}^AWX8g8)F9_#@Jx`Nv zYJ!jeN8jwggy6{KVIEQXTJgD$BtJzZDa=H!ii)pDZnp`2YyiJyXBRl!C2Wz&`2Ahy zxX%o89|a6HxK{6Wn3pxqUADq;(z1`GHRVFa6j485>(6bDk+&(1To{$3gTo z(V=GDk08bMOtX(dM>$8DTF~RW$!?YoGKHWi^{+XK>+JT0)m)mA9gyV9k|1uGe43iD{JTG|Z!XAT}A|x%TFOQv# zM8va1i*+u^D$IhPEV{xUf3;5Q2R1d{JR~+QNCB2dyTbqvg0(Vcv-MdMy zlkInj#Fg0(+hwyuPnkSdlz&~?G-hl@XsOty0jk;*4`|$1#Gd1+P*29o^6ne8 zlH@ie11S_QCeSkmtgk1#CWw=|*zJ*J#YXoSzn`$1Z>Xj?Z0M|Z$Tiq{x|LS%2 zTKHW!2);r(enL13npC*He4$>LTN!59cnhl)2wtwoiXUY8^>t)CEA~EzC1ofzWhazn z6Th3C0|qDO+zK2HrgsT70AuGX6k!AwQBR?#2>)|nt98%v{We`SG5^{tf%JmDXjV(DCPgf7n#C<{YB0M*99PeZl`Rg{IuVrd;2xQG76^wtp!7_JAlTs0w+r4HsQ4 zXuR%@g!;*Dr?xg{Ttggks&Up|BaHewir@S0Cy~SNG`~gmaK~nPO!(u?DwFu_eXr!^ zTBvt75cvkVm-Q!e!Y1)WzbJ@JA_52=8z*JWJe8HzfWEiH{AT=TmowBP%C@yGJD^?u zotOWgcRJ)0?AG`w&M?&MS$2XRzWI;sLmk&iHU?#Pm`}`oGG$ir9~l5b@HJ`=oaHq7 zEuGr&L2_i(*cpCU&E-YJtANAh(N)7WVgHzR=9@pud-lU9+6#?_gd#;h%f@_d*~+k` zyFDvDUP7N{vh0{ACZ3EiWZd2rhS6^OM!j0B=b45eA(~atDt6~7dNkVX;kr`&*Tl}g z`h0SEl$N21;kzk>@27tqs(I83^Y~gTQmiscpxzN@k54I_Omw(R7h zl1R#TIywajYJo9)px~jco=10wF;nM^(yq}N+kq<=Cc!OGWWULL2CGmJyKw)9_jFrW z4-DF9q~NwFFk^!^S=%#6z$ByEodYFz)dZ&;*`ariPw<3D>i@)F&Cp-LwFM+TS{n<$ z=MV@mELRsWpuViC=)Y(m^3~8HVJ#wb14YRSPgK2flzv2hDIAN9MkYq^MZ~^%EaP;^% z*_JY^TqOgYCnD&>>4|0as56uUS%&RmT@jcNR4dotcCY5U*ryj0TamZ<7bt^PS)J>k 
z+AD+PNx=;q4493NmDRg;{2hqqCv-)nWCg~xcPH27};25>T#j{D>|g&Xu8Q)B*5AIQc~r2v8583ffLL72uNN8J@J>Y7+Nw4I3yJTAtlqcF7>tmS2>@9veg?w#4vyN=cKZ_zsz^9;&5JIn;L zs>0YL>kHNT`e>p{e)TEz;b#--pfgkZwx4Av7ajqwW8Bl`h6aH$Gp5@LjMITR6K1`sA{d-6sBI__(m2zM<5 z$_c_4Aqx~e9_?}R(_v!E)y-~R@ahK_+4B)){AOIq-_^7FqeG3-9#hG_7aacwC;g1l z^JPJVFmMmF|8E%`6$BS215X0YkMwIrNS*MSPj-^LBP4eiuqS-NToEsme~5E|Y>~a$ zaALrG4xI!dai6RNQm^JIt;99wToo6efb&iaFy#_9dP|C9Calxa#E3 z`{@<08A_9~6U|p(&xrXOQjC|Y*Qxj&%h3drk`=ty^c`$DL}TjEh4v|QFX(}+WBu`$ zDfpfbzZt}z=X=z&@6PP_ghy`*Uaw0c%M`SWr>a;RMX+qv9a#6|3Ebfhkc+YucC4mU z7P<^(?ke8!cJ*}?zMBzA1ZX4Hf_2lt&Lyo zNhRI-7%+A4q+NtH1=UZNQFm-JcWNr~MJm#MIehsFqQ`_G&4x!2vGzY4b7`Z=?6&X^ zmDhdVHAnY3c-HdniWVN4X@QGV7NNW=pktCaKu_#B}anorC>1l@`0U!E6LXv>?Ht`-}$TC_0X>?|7c1eEn>IIAe#E&Q&^ z4j)!D{yG$xOF4l-$&t5wsi8)y@rLEDqyL1E$S)}A#`MnSDnhSZrj;n z6=@LFD+Lzchhg5?e&Q0Dm#eymxUSf@ZGi0Vq>~(5C1a>&tO#i_|=84Z1Dq~xQZBAAHsJ7nt@$Gv-a!>Ct zVN%uu|K1;8{Js$?Bv-Bd;S`0LM!#0B9?hM#Ipeoo5!cLz${$_B8g6U#h$6#H`ldBy ziB@B=Lgi;Cuc!&svJHs$Uee~F_mf*>UVWxV72?^+vc0^L_l!c0QoDp8cncBxBIDV>N_cYdm_ z?kwqV19#|wVL*aF;q%)V9K~t7{5f7*oysZ`V!1qna$rE=17o7zF0`9{;xVL&U%&Rj zc=dH^^_rcq&m00NOe%LbMR?Ij3feEou-b|#N)DlAwy+R{ml&nb4nn`2ZM}V_ms5h< z@8$`#%6Vk+4DX^mu6tD3)@O8Z|#Kql^-xMbwE-E1tDX}M~^)RkiU zkD~o4%J9J8sz@(AcOveWu5ZuJQNQUqrkzjJ7N@GJebGv4p*e(ND32;o`|cHXI?{mJ zPng0nZA_`4?ZlF_h!)1!0ZQAO?x78|E2F+Q^M(t86@td(u@xKK-PN_=&@j^xhkzZw z{xw%4uzM&}9Mj5>b#`y2BMW7dn%Nf#WqZ8nMk=?17Xy2;F;tx0`SMtp(x zIr^h;=t2j}b%e}`cv83w7eVu zT|2MkHP=mPtNRuXU*Y!^?Oqb8cd>bH%N;4V8IwHlr-Ns!O1Zw!#BzI!a_H$SsL5Qb zywJl(Np;Q~ZDO0Z_+k`hHQUF*Igdg+X<^GaJb+$TQkfnADCbdrSf$Dbmv3hgHkpKb zg)B`KVfK?Jg7BupwQaW!fOxWcWF+}2<>%$|?dAp>aF9q%?QFQAYN>2kg8W&&ofadr z-Ej?Wh?6D!g01y{&&RNnC3^az7n|p3qm!>O)}vAnFZ36>(6KKhVIVYk6uBbuiwI`S zLy)*aCLvNY_~hFcN1owmOzd>ZyIHFh$UR;$N?I0rA`BK3OrI`I8U%mxs5aat){1mn zY11qByFcw-5u;>YdE8Ay{~FmTzwJa(6~W#PAF@ccMc_HNXCXc$$hQd}b?MBpQ7882 zLr7Fq-2D(TbJBMqLGp}UtNV+w>b~Nr4c%D$y9%wQ&#GN^rKdn!0~4Ka0+alcykTv8 z;zP~wQB#$)^UqCo=f(B$7R@fzov2&$qRw){`>iKT6E5}t+-+Q48BLc&fzXj^h-7Ko 
zFuN0RCUW)R#~$1fz8f`sgY#hMvVCFH&{MOFQ~7Wy7=Jb|7etcEHx;EiNELl^N2FYliXrSduOm8E^eJ(H5fnYftobL&&2q>-nmkjJ zy3**ggm&FdyuoDEwrxcqx37l9FD@AD8%`VVWUo~XaKf}0^VKT{N9(UjQkfUkm^;^= zllR&ORTw-|GMgANyRpIpL&}NgOh-&Vir>;|MDP|YlVG4CR_8ojd5-w>btyme0Yvhw zEx?k*;Cwxr^1M^zM#OpH87DC2Bu?7bE^oF{d(#Y_ZO>g8rk|g5uiyW4^ut4V{XAp) ztjK)$@*wp4xYZ}*Bu<9F&`$GAt_gP~8RtD;a=&v9>BCKIzc2M&jfx`qT5dg}TjUR; zV5x={SGNShF?-Pi>Vu6gj+?=B9?@(a^8K!eN7eF)s>9Fk@))_v`G{q2o9aeF>08#b z5qykb(}kV|mw>0^vwIMa|Kz;%13@Mp%TzkCG))X~>GqF(ZmQoNPK<@ zJMui;NmRn@jf(GX6A_+>aFU6bn*}B8L7=6Tk!KrMQ^bGBvwFif%O?E6K|KM)bG**gdBR4toa7tf*a?^h$uJ# z2Q5*rm))jkkBnPY$K5m#*gW0xgCtB{p}>|Re!=9|t@O`>c_*FigpaMp`@PdKDqJzz4!PW-ha8R!Oi7`asBn`M;6{jhyag@!$gyitbXZ=)ACsy zVB6p58-&+()XtsQ5bbQK0{MftAvccmIbfqiEt^&#_F&xPvXNE8in!;TAUi&xG#NR_;HGZ1&(Zxs^3;}5_4ll `>)#!#P|5Ci-PrOOx&3?kH z3h%IigtJ~atn~NBr<~tL=ePd4FH^zld&4HQ-N^x#!lRjxh0AK}^F~(ZaMsr|)x_9_ zDPE!{p60ho^w-c-1C^sebt!rT(Kx}v8;mBN;3KuM06&jGTs;!`L{G#AuVTiJF=B*FYX<%XNVt zFedRx*(iRuJ%(NWLjFYFXo;w+dT90cPw;GrM9wpT6G$iZiiNm%*H&HaPOE3FI)US-iY=45>(wdFdCMZ+O>i zt3^y)?DX6DRlr{eQdJUM=iy3WjDfcO>e3-kygpbYCncTiQbwqZ>gK=`=_pf6O%~2~10i z+~l?w+TG!E_95x~2;oGX&ok(70UkwuvPR_WL1~?|@fIze+b1@?233i!+G|Qi2QTfH ztIbIR$5HpS`hwt&RTN49imqwesG~bDmcik`HLYn&NJ~^G4ZN#{X#DiM5(K(ZH0{uyq@k<-RAC?RdiDnBv;1GQxj zG97+5W@3xCZ;uQrK7^S_-E*#{|H~%OAL$w7^iwZc^%r?-_Y`Rjf^BxOF z78r-1ZMBnQ_uPPLUYWNxaUMC8-v3RreeJOG92rPSkR1{b$<}$Z-Kno;j-@o5Lcd_| z1kDIK2A4ytj7W;a){UMheYdDWg;gRLLiHBcY@P?$x|sD%0P070w>vKriP>IYJ5>0) zJZ-B%;ckIJ=rPnrI5}%o+_HD;(Ab(q(5rNKgt=0Jgtstw6#S>#I%id&#|WO_F<=xY z((ebal-%u!CmL2;E1BW3-N&F1T~%owQzWLJ^VLQvQ0K;ih`#;|mFFk?n4y`zS@()| zdViWLGN@-r0O4bR3ZB=2R?}?doRc)qnRh0F$CpsnQ|B2;APkK9AKV$vi<_U~;|$+s z_$#;BCHPlnN1$bsOLJ;Xiy^!DF{k8{=?bc#-Hnm3pW0*Z710iVM>g%_pYkl_bENB^ z7^KpeT)SDdLqRNeyE}}neEUTLc#`DU-rz$qg3Al6ROu7q+Y;@7D^N+G_I2&!dEgLV z?W9Ic)lka%-Dlan*{`%Seuc813GhIoq*Fa2RwHJY(H?l{fh!><96 z?^8LKDuG0P^3$|epn>)>QI29Gy~%rig;q%Vx<@bXNWYnC=*z^f3*(?wZ*WGJ*!#b3 
z+-^&MbNg2nqWjoC-T>K>6OY=Qx~T?-W$IB*rRE&ADbZ6)^a1}5*vw{@qfi0|EhFzQ(63-5n$z0%c{44}Ti2CSVmNjj6w0a`vIT0SjS z_5}L1C+ZLG`ZpsNM&M_yvXYYw`|7G9{XtJlUv&K+Qy9PLQkaod=Od6fcS+x6uM!}26^Z~7;r?%#3B;ZQI7$D%Y`hB zDD zL!GI}3)VeA3v2D}9eG8WncYX??X?~fE zmW9_pLUK(3T5)~}GdL%r z)-$6raojFqFBPa^Mw{pFD$#{ssZA|XYQCQ0gJjs*TbI6}Mp>JM*P87y4hjl*ufhxc#ne+p^0E-qSI#t3#@o9DNtJj2GfY5TkE&Z_fT<{5&$=|-@8s?=G$g|__TGuQyVgf}p z#O+e(@uxrx+WYi)%Z%jewR9~nV*O}{DU+u;v!NoCwU;mBr2f4!dxfKvcZZt&kDvII zgte$PtDzmRHXAWH$^fu?$*xFQ5!ztjt3e6)+;Ev>fJbC!_L;Mm*bS$hD@!7^RB^o}kCBlFfuTsxOEV=Zh$YiN5X>dFbfv zbbNxZ&k^4wTw&!NrPd%1NWPih?3gOb6Vsu=`y3xqj)&v3+|(eKlmh>EaW>~e%WugG zE$!$<@@N3`uY_RH$wER(;davb0B!6CzBQ)G)xUM{8Ydgc@eS0=Hb06Y{W@*P0I4|| z4Ol^#<(vSLw3esx^Q1tit>u4k1}W9nOg!^_#!qI2{-&q{awz7646;PejnC)_U1;Y4 zIV#(*H?!(pZO3g6!XK+_ zHT-h}X3lH;vUE4#a>o(oMlrkWuEtB)zb2mUtAQ1<3!C>F+`Hy&t68R{ZBF*i-dN_W zh={+XGUW8@MD!7D%_!V!zgT{mEQaXcQF?#d11zc^ozo5r$D{Z052++tl)p$HaBv>G zl@<|XbbhQ`9>lb$zaPQq@y|N-{h1dQZrW7@xX6Ys{Uudq#QSYdv@2^jp%ad%1@dxn z$SCm{V9!$P6%9vcE*-d?Y_w;s%fDa7&?++j)pV!z1F^B3_0EevW$N)UUGIY8cC|d3 zcaU2=K*}aj1AEL1OnsJA$cc7N`xd0dcitUp1n2jZ$7$%KRweSD`_` zl8@tQ;;%-fzdh0drz)qa%F1U5CE=~gV}JsdEC1|=4ajh4)v<_9NnOWRH!)`wXDvAiFm#pZ0cZTI92;(oMhVKgAdM^k2^pE)?1k($Lg zTMYdapLcSF1(|&IIt-lRyzte-Wz422KU4B+iMQ7>Oow_(F%U8SHbvdEE z%;#SXl2&|avDk8;j|DJS?PgfCrbWZY6cpQlV1hDT zA+ZB~fsGO+F!y_$IVJ5JiTO`<{wKt%q~KhHF2~3c?_Lkt#Y;tdEkoODD%fJ=B$XdbK*9yWrdR$SK?$p> z{g1FD9;sUBsD}#l1!DQh)mXWI`Dl>)HovC$n;^&u?L7A;>drX9>4U4@WVdaJ#43qd z(j@_<>_TI~)WkXYQjW#e+SJ$ufgkl%bGUnb{@N1nYT6Ue7f)GQ%h2pKcZ!NbP#EQh zj!T2TYla5xN*?8gYgFq)fn{L(?ygkT)QOe@k`)}gMWH~S6l?m3geW=0vU_lO9*Ks$ zyBKz(6<|EC*c>TNh4&2KFjT$OUkq<*FGO(%&a}yQG5$E4qcepD$%d9KTUOuSw&u~6 z1@%05?@{O?dQq7UUM-nKBSX+?=LiBOb+W7|q@iN+0tPhdEBA>+B!)%alZdDKG?KUU z(!tH*ZR?k!3nwmu^sdy;ea<z2pY9N} zDrPC0B`%c^NXa(~^-$*w)x4So^U4le5JKRtnApR#1xKN+7bLmDnC-rG@8MmkznE_< zi5;(oZ^Y<8$Ey*wot>H8bVK2Ch51qMZPrvhT=rJ%SPbk+4AhJNi(V-*^>;O9f{m~W zo&%O&xppYT)p1vB@VvEULvz}O|APY?;v^~*hPKLM3L&{ylWU<9nq0KbNPj%4Kaqj2 z8*^daG>=j=?abiHtW^&UzA&WNo+4Y

BleOu!O-<4iCWZ(G;i3}JnJvDT7{oZRkt zv)#KAv{swXULeCIaWLYNtu>3@a~?DsGfR<3_?oQV;=Axm`D=5^>(7rkclj%LzZ-Y0 zff%OZMa~sj+@3Ssg#tHgRIF!y?$p5`8#Ab>D@h~@s*sul`NFUGFD zyJN(rRq3Y;hbr^yMMpY_22Us;@ctEs7HGN(eIr!4u&5fwbdDKYcJZ;OwJ*pADAFYK zN!erE_@LA3$Dwn_FY@K>dbe7*6a|KdrHK)?2v&9~>3@7wm0-bkRkp&oS!L_{=H0O%m&E-Oo8OJ0;y)J1^=oJ~W0ifh;ok7XUl3n17qy8J_cY37xtouXR} z423LiwLOWN7t&~_(kZr%i}!^No&YWvE*BFHrARSxHtU-S3g2}MzyHlOMK+IaTNk** z%Ex`Es(i9(!6Uw*UWukJpL&%3?SmJ1m}-S%ySc{z8|JrvI>zlPy*~qngYf@@D>jT% zC{+!u#fT;$wkH-GB?k9)gK`>62E{?10Q(AkEHbGWboTg?_DVblVpA?gx1vtsWW-!> z?RGZCkj0#uVnXVTSXb8OmS30mppJeQJKBxLX`4@uWx!nE<@e$F%HdzIFZVR|!$9h4 z^Td337v3O9?P+yu2tRD14{C3rEw$orHj4a2!wSk5!?1XMlctjb<;IqAO_qGL5w)+)25x zbE#_eu@Dwnp$CmR>qHL&2aBk@Zxzko4NLxs<8;77FQUYl8!okj8y+ByzO!GPLmLTL zQY_-9#-a%DbR)^N&t*@$U$7sgw{AVe_^&!l^{#4Gg?WGdvFBRmH2sac%-}wXtTiIu zS1~HC@yhxrvc(bFMcH(D5PVz8S`e}!$k4WgH9zw$;buH$^51UlXb)o$GK?28zAA!Y z4R*AM-Pl+r)h$l$lO?y}{&za?K@*Wa0gmu-^OIL_)C5`ocITMU6VF}qE@;lnH`L1f ze!v8H+#T}Xo}%i+X_Hb^v}CiOu4BW=FhGQ=CbvtXr49+a8>VXv&<>a}x>Wr`g>55# z8S97sX=R|M_D84I@hASCin3M58+ayj%%Z3u;c73wTaMHD7h5l!oh0W+UxNfH?8gpH zqthoNJE9*;zh3BLH#_X?F(YtVUh~48w@g|T`TMu&V$0-9fo|*Gjcq_5zptrU108r7 z`>$eepg)1uRF8}S6}eN6)$db2{zC=Dr!Br`?9d}h=vvt()$;h6HEdJ~X_?x#bj6r}BT6*uo*&FRk}7Mt4I zj107dy|AMx(y7tWb!^~eP6y5hWIx0I`!=~I#vq!4HCjQtHi%PxwQC12^+Nj9Vo6=ywOH=Q8bvm8N;a(>qdf6DZs1aut_9LrJP>@pGU>Ix6QM)6P}va zY*K3L@X9_Jd@0)x&N^VJbPD(mxWDLm7rl%a2*l}!f{>0ER)+ra@R~rEf2-0`rIW~E zMsssl!uO&OQ|BC<({4vQsA$UKlJ%Z3z{+KeyHiVbhSYr)Zcn8m@{W`huZ!tRxEUp$ z=Jwu5Db1?%(_=;SF%a&yY*gw?Nyt;N%>L7+Dn{juT_ zzb_C^{p_$nVWB{cuB%y}Xv$jSr{JWJIOnPDxeFy%Rk%$o^9JyR3O?sjiB;YGvQ4o$ z@LF8hxxpl^O_1JI+fRKPOz;YXq)J^if|m_t+?rZZIQ7e&TgjbA4y^le6nb=s#r}f> zhEPfn4+g+8?W%*bnpR_nuYX8_!C|FG+A(P=`^r*<=Bhp^g8#wY%|b=|ROo+wc#=^j zNV&heEpnoExo{eglh8Ebder)ZnzGg~8Ygs=ZCkVBOPRR!*hWYG3sV2Voq<$ z+U5Rq_X6%9w|ZZ(G&x%Te3)Dtr1!dS2Nk1Pm*Pc{>UTB{-st$W{>iUSu7sju&Ce>h zL090+!vYR|5-^zAkD9O^U?_>1-x*ydZgI8-bqGOd~hVXk#ZXV20be zlJ@wL|L>ETl(}>8tJs|bz0=q)_an{Iy%ZNy*9InfDutAP6TCS*H9wUFkMz&lZQLph 
zbZ$ZDHcom9cIN?KlVYka4Er6_$I&x!jEsDq`o6|zP#f(l7h+npbutb#`?%#PADP-A zRLT#xfB}Ru&joFS=C+heJ?bh&mdiF;&C`|fBWQFClDh$ffz+6bNzgfnRy!tyM}ZDj zbEx1u9vb*eMwmzs2Bf+|ZcmwA>YGx?($|`S{qncv9q;axN+f*wQMHFs8IhEYt_uIb z#Wm)ZZ}H3bOq_=Di;@f`BaO0jo~t?TY;Oq}ROimOJDTKaemcQXd+_;m zq01M~iGL}ai>ildj`&;+A}0Fa=?CPJQdpFjaeD4YXfvSObb^tVD%Dz$Uc3K4xCW6> z=!=D&s|{@3pkAbXIC}x|w{AH| zimDCL4Q=n3REmT~nPQSEJezsR-L8{CfqP*!P2J70docWqpK<-FXO(l?LTeyc!Lq|I z)+2XY!ykYb)n)P&H;u}B$s_+&o?|4_l6-NA3SK_3w~wcfwN$Cxch#O#lWd#`fcRwS1jvhgu8KAKmVU=%P7?ZAn=LIbY5klH*N8A${*X*~e z`vVf1L`CMrp@epLx=pDuCap%^iTtk7{^@c?vK_IlwlZw(Qpd{GG)OOC7r)Tl^)GUh z2~|>v+C?^xFRm@Exy|cC>!+iK!b5J`)jtZtRBT%OkUX5f6h-}ddt0V)TsH`SpsVj$ z9az<}PSvMZpNOeADTkW?ohdr!)nmFAoI9ANV|6cEm#(*9fvdfyD$8v)mmljAqeQ7M zEtl}a5Y65-jR!G~1VNe=mxEXg`NSua+G@sl{rpToK%MX~qfOYxA8AQEfL zf2v>C=xdG(d~T*lD7?X~V6ztCpr$&@wwpy}r_2!Q?H5OlJlnZd*g$j(ed2d%tE)A8$e0lt6mw)&eEerlN*J8_iltf7`EHND-k(O ziK+_Y7eIMiw;hJ(o%sHP3k*2 z^AE-XQQ`~q*=l}(-|tw|oImH&CCN;Vrizy>>2#a!S17m^U6V0J2(+tt9(w-JY0*m+;kg0vYi1 zJHV+b!`QWdxfSKXn@-^R{yoT%x`M*#W2KC6k?uMWnWV5al`Z<7GNe;{8}e&izxtF( zHSuV|d%#t{&p;m#!b$v5?zLiGj3j8&%9_A0Ph-o2F=}@rxu#B-yJ>@k^&Z1~BRavs z+~HR=0?QIQ-E}BCSF~mGY&v=UD1a`APCe|dtawE;g(hMc>2|8q43Zu#{FDF#vvxY_(zcC%fkSTAmIX;{c? 
zK?vQb7B34iy&*`1L}R)Y(-MwQlz+khEW=6EN@0EInd0ZGz1yTEUI;ml%!}Qz6-_Xf z-*%q6%EPUb6l)(A}7^4F`AEk9lk7GVahN4-!1~IkAPCKQ)Pdo_XgvlatT?%%+D$08mr3knXs?c zwA%s{pu7euJiU_|)PmIUWal=X=y%G-<-NQx85j8>+uc>~!Vt}r&rIVZ)LT(^7ubFh zQF1$TcbWipsmkRM$11;b9zYVs(J@FJD@skj4yddnIxNfTMFH{&m!nURN*BCPZi?sH zXU50|XNxtw=w@Xoof<&mTjC^d!KA9TGCqHjLCdtsQ5=6oId97-IXDtSd`wArr`Y!l zEHQsiiNt!nGP9eTlZ7IS4U4MDkt@_mR1Yw%w2-@%pIGC@@q7@`2+p>#W>4|-05o!< zFeXNU?YKJ4U>Dv(Hy&qcwMAF+mO<;zv0jIGPf^kkL8xR8I?pS0&Q-am>e%&pLCEtuMT->}W#EyW833MD|G6u075+TsO@Yw;8bQlNMURy=rdE5$WX z+})wLyB7~0An@D#ci)|z&CK%wGMRaDU)Oz{$9a=gs8Mlvea(N%@8C7Gp$6OASZ!I{ z(@fV$7;W}z>^h5XS03SIi8j>K&nj#zt=p#$vIR+J9?s>4YR?vqSe#w)ldL@1{b<%e zJ51vDvdt`fn=V?EO^7`J(~VW8lAUt3Z-@b2KR2q08vR`ax?GF)ntqRFSqW&KYh}^{ za>>auv_IH=lQ&k8zz6-J$5N`hdK>*CX@bSlYtUBuE86Z^!mDPd`$^21(X?m#lCV`t z%Ky+%T&JKAaS->fjiEOF?&SS5hfUFh=7GInoc{nbrg~6Glw3YaU!mmiUP$$u)$KNr zgVaW&iQbR=wsf~Ti!nDSGGf*@=Fi3nY42B6rNWnfqmK$no}}xm#hYJ)jL#yJ3NXcU zQ({Vc8aIX(<>&15N4N+dHM#*UFVti-SL*wiysMiK*C=A$uVQqvUUqGK~ z0fWwNj8QH<93|U3sVV+|L6*&i95u~&IMtGrm>OTCi@V}MW)H?G%TS2jF5GT=y0j;U za&{I8lBRgZO9ojrdV0kxW?lKmcO+Q2Z>=ShFr$>L=-o1%C?e`VfUV!sQeQTO8^#Ft z49(S;{vPQSgNdnxyiz_3626t}Sjq{XbDYMqqCb&524=>p^M=4eh4 zLnlmFYS1rvZ64ho*0A$sqY^0QmwcN^07rk>^aG-@ZhBYwrweD?jP_QzM~9uDSiy2f zML}5FW&l1w8H8Bo^9&+q@-p4l$;t7HMd0GfOC@mJUn(7K3MDCgF@zb4A4#_f?Dr!t z2wgA5zn8QpUd^mQAE|I8efN1tTub!d0m{L(OlJq7ca3zwe7ua^M|tb z;l{JjBg~h=JrA&$LVKNUNY15Uy)~g>N*HjLVQ+DB$~ZnG5r%J^%Tr(Vt5?-bIWSdL zg)`RI%SFH0!qOm@r_v??V3+fKZv2M_CBD~wTr#0XNthRT%5>-n-_^%ZnV)ZZE^c-W zD0W5~|B*D0Nunb!4PDmG;8BIqnbM}6F^I&~vrEOJEbmx%Vdd(iN!uwf2j=E0>*;J7 zn4stL^_U-yEnl{LdZU=5qH2HIXG2YaCIKp+}rnmV(v9jCb8XBfK%0I0dy@6D;4Q=4Y zyTN+_zs?U=5YA>^T&P#9iQy=-tkWi+u+$%iMZ8UBppXl}X@hzE6%ArRzLMTw2K(rAei_VRd6v#28=mk5x_q8m~F0@8@ zyFE~cdajn!l+Y{VwxsGoS28-iPbdRa@_ER) zEd>k>b>otGGDiQJ+cjV#-_4LQ)c5#~IF&vA4}f)x=(*46XX&thD^+q>lYCW-4$A== zUbhDUfse1DL~}XQEmS4SN%(GQ7fOi>Wln$&a~6XRi}p9M=5s}ibNe^PAMOFRus2FA zU?lS-v7nKyp-MT?+i&_SUO%jsk`|Nx7INuH8)3iy!HAExo2}vY>3*5t7rv?TEK2Zm 
zP2BIS#a-cO#qAFvnlx@&oi~V;KP`Kln_QqmtIJt|+PUF#$b}5k$e%KIl?a{Zw%mDj z)|cRWvB%j}tcRSOhRVtHbh)`Q4XcJJUY4lHTx(v(gN~7Kf7&`G&3By9kb@nG5XdcX zqW0LS>#jFDtVP5}I=y)Rg6q28{jX&bJ9UmOLoQ=bH?}M;1?Sb$wb_4wz~JuUuol|0 zyTjq>6UTUrSR!P_vfi0JT6C+4ylVb_m_OO#BN@^96}mnWDqbG^vr5dur4Zdb=y7El zen(2`JSr(yi8^LQuK*EWVRafh+&nqW;cWGB;P5`3i6Hhn@?Imv(4y@^_V3qeO=2;W zo*%li_@`JVB~{Hya%|aNizT~1`KNuQDM5y@5CS7iv7qY6P5GHis`M|&eb@6o_?$3* zv_fjsss&a?L%)5rt7*ZHudyY65XM!1_#3@rdWwrmt`gmp3 z+XC3?Z1!*x8TTlwB@;eUQjo}h`^InT1hsrf<3`K!V@&PN_{k-2<5fNEw#!>rH?orf zw=NV?KNJ3n&vlR7h&x=S>9*QRXBNHhWvOd?AN@r1pLfiAQK5Shm7Wy)UPGSABN%)3 zw^cUoGB&J=~l27q4JCZ7?I@7Uyai!oFl%t)qga*F_{4l+>_Iv-;+5!&tFkC%5c zJecEx(7Cm;T%^yldyRO!7&zjbgo-Ow?aemue~YU4TPBrJD>BCR7USVBFY`Vt^nD-U4N}}7vr2&L znHSnXX!V+~TII|&OCrWg*DOJwCJI{cZDrgeJO>WH{Lr0vA7J@e%I;*mg%?0X2>FhX zu30|ntFgKkne&FGA7?vyBP>mV^D*AMOD|X0fyVr$Q@&RwZNC&}U>gI~i}b84YW`xm zweLROm1a*)RZK+k(eG-j=Nwx2xw;HaT&omVELU6M-x17h4Brj4jIy-?M{F$;zTY`= z>oqN0dx<=(6bf0Cekqlz32RkO>veS3iCUl7)fA8?>?Qxud`b?FKt*1MrpUmLFJe$v zI9G@|o(e-@<`1zXqPuN055JaGwj{isi{A4c@UONda|A`nuUTG{4c5!AT8D|@o_+cq zamV6m^Z$yTzDboHo%(xWQp?0`W_fGIMD?Gr`Aum^YOkF-Vo3v1y}zC~?Hi+_n7%v7 zvVoSMNdcdJYbQ&+7gj8~3IFB0Oeu-!XNa@`Jq@^t5-_rkG7{AWWIe%1@Luv7RnOY> zb29ZEAvrUf8i)m!R%t$R3E)3e_&zC5-8F!sl5D!kgO|Ty-Beg=PdW4?T`d0|Kt0J< z5@o|waS$$6Z||Ngk{R13C6C%nSg^(TET=wYk6u&0Sc^z{)C(4R%1l>2TID3@1vc|` zo@{;bA3!cp6gK(3YK!Am^v-N93##2}AW2=aBs<Afa#MmhTChEgB^~3*n&Dk%@C07zyVB1YoOLJYN?c2uO z3-=WC0=|xsl)Cp&2Evugq4g*Cu2R7PHMwWInlCe;>?X`=dAsiPUBkYq8xt#KRs$^p z{+-j4dFCx7;q`5y)xE4H;lS|PzNt0avgt2G#wOnV{2kD|Q-;_{=l}Mz?u|A5KWWzS z|NrFA{8cPk4Y*2C2=2=NO;~`mqk<^8ZcMkQjvTw_*CpXO4dp$Y6S4D-@j--~f#3gt?pU zr(IYNSp0AihwCka8<_*I6Pv!vNRCt=#6PSsH2*=TCFK-7=blD@7}T~hp6_eZDzX7G z?mt%yy`wT`jm?!Cmw{T%P4%*r#iaerxS?g((_(lxIBN2OT#g>RU_gH^aX+giKgdK| z92ZM;%-ZUs@n*T`pG|G=Pe3=oLXFR;)(6;i(I(NL96Lf4x(@02t;lC0w+vZ?k)_-0 zwOCsS1`|@o-Cj~{?R}sA7L_%tI5+z{_2`ERoJYhGJ{>3LeiqCVXnd|07n4&cdELFo zO$m}dgF^b} z!t`t5JH2k_AtxMKncWB4yZRq$x3%(f^L}@lf+v7y@$X8M4@AQ%#r6Us+L_COm1uh< 
zdkTMT&uvfSvrB;P)R*MRcN6|_G>7(>L3_6H^+WK(f?xTvTh=YPG9aB}AzwCvX{5Jk z*l@B)_e{+bz6E=qYxVNzKXRelBU?1Oz-QkD?zM~Bwg+Zc)J-02>lbx0j~G;YEIw|V zMYNQWKRt|2WG%fttG3MPg=HtM@0*z)Ri6Ek$r8-Cp-Y=~b53v?{(Y$s=$YO@{nFEC zh3$JK@5scJnN;a$_1`D`vu)imWG(Ri(Cdn4y)fL$+2rFpckUBDCBk1Up=;y!bJJ$7 z4#u3&)bqsi8<i2Op9nJLNC-U3PuIRmW! zqshTimh!>WC`>^t|BSbo7uo99HaJO7#Qe>}?qFzm-8!c*{k`xO5qCX1*jjtT^ z4+*E1(hQ;|vZlG(g^)f`#;r9B0eY3(XLqLHNJsT%&{gnY*o{kB5bVn#QEJm&6!32F zwloSk7R}`bH(HLVUddX+w8ko*^d}uB3=`fOP9JDwvrL@ImswS$e(uIn&xC>ZHS9IZ z_+wX!r}~axKTp`t(7IN_GT_4@LiSixNQCFzypdzw95|EzQEFBUDv;0xKO$bFGl_7y z9K3^jN8}(--9w}KDPQ`Tvd3eTFMQJPBM%YM>nEaAYayLOYUh$f-6C*aLZ7skb&YPm zDcfvjVj7cG%%p69nbtO&<2wvhc^nM7#LGX1B6BXz61`y2O!2=E4Upm`P9`A=$JWCh`*(3nVJP+ zQ3m6>k=Ss0YGXn=C7=9jiM!cTXmX{t84j=ZZr^Xl( zNHXtqXp9Pr*3HX2ngU>ZUn)3YP3mqB42gVUj{ z>g`TimmjXV9;#ssw%)Kzq-CGDTxd`dGxL-ioz{-Nu^_vb;nBcaQ<4&nF#4y(y18@0 z-47=X0l0#wPWbK5N5g|9sFQ{e;;(q%<`PDiH5m%t53( zyLD9rdAt^Z<}UVx4Qn?|6t>lQTC`O&s^U~KVoZ+j*kTnhcv5=IBHOhUD&bz{kY6Se z`@m4G*jZrVPB={9bLYDUUeutXid813R{P|Uyf4MZ<54)hCh(X+baWtv@+(Gc>s zB!-SFE<-@lg`IUQ}Cd;i~+EvEjJMCwPrz1Yn3+e6 z3P~^2Gv5-@@@pd(c4=5WHRk5jaL;uL>fS}cz}&E5!`xe1931>i)8g!!8}#!nL*$mJ z=iz!-vffl^rnB?L#f7O{W3rX|Q+mI)kDC3KKt2xt=J~mu(V6m@sOed~%0VY*Cy?A* zfgB}J+*4g$T|kH~E}6V6E(7F_lWCLw1w>(SgSBq|q)lP-woM~^*v?1tqsKyd+xg2k zzgwqI##mg&=l}Vn9YL{>_kG!vk_x0h(0F-cWp=KCx-{=}bZ}0UK#7XueqLd;Tat4p zen%7cgr3hgUM`@T`^nkDpk^TXkJ1*IKfL}Gubuk zl?{@~ID_nloVshezEqp1)Si%zEc&)H(@3X{$;(Xf*J+yvmceqSvVjIMayob^NGu`r z?c=;(O?C*zwQc%*e~y6$dA>iJ#+O7nT~X;Vhz`j8L5xOM(btNVpKzJ5|S?@8l zF+zRLv{31Ndd#9-VaGLx7Khdj6k9tVxfTX*Bl%ep($3oXI8I%z^L>0Tevzq zH)DOwqk&vQW{Hy2;9pgrL?JbH*-;o?c+Yf?PNH|kc|tSH{vm->p3=52QOKs8tVTo5 z%$|uBA&81ZrgyGU98Ge==BU3}Et!4@ZU_zog*>wEoz%&BJ5c3wOy#p`rdP`gAukU* zCrc?e?*uOlF=PSb!xS?jM5#G6iScTu482h%UkOOVoY8AD)mYLCa!)_doEoJ&Pr4U&e^YQqdkC zLxs$OnA4|Apl^b+lfboSKEn09jb+{74Fm4~V7E|Wbdf1H@zM3i2Sv^2zs?HWq4-RCSA9Y9L#KzI76dP1k4cxVCByq` z2|6qGXj7eFNOA0~WA|IYT8WUpS5 zaOh}O^Ixd`Ko`AI;O#o$r8>!{mosryvEJT#nF5_VyFv??w+mygRd)k(W0gj$@ck=K 
znN1e=+fzMmu7Z|W7Leq*FQ>w%iP3>IQLx&w-WLa_S53!oGhfcmVx96Br-C24-d|eN zh~DrxCta03UN%SbD;c{uZqStn59YDG)io(Ew{2oAU9}T@GJ*dMU;lx?km*@7W2R!S z$)}0!8#qEKF~A ztn<$V^uuKJ!|tQbovJBkZ_PA6U&K=KmGp4H&S+>U8+t8=Ro1or=-kWC$F^5p=cL{@ zOSKS}9g$>|4UuqZ&eC$DGurYRJb`Qi7YXJ3Xx|Z-R%6cLPG3@Ar<9WZ{4QP4Mu&Q+ ze|)arAz9u4EJoxF73}D&Ro(W1(QL52P==bHyN9ZXT!pZ}|D)c=dj&b>^>^-P3#?*f zc{bQUz>s(b_RU1Hc6m|-h;n=I0rOog5Q8}rq3&^~f_daI%Y74DZAUZ0NiTwLtcEC$ zukR*Yx%+?S%g^D^I0ts`_lj@#I+(Lm#c{z?&Aw&aSc|bCO)FPknWjTBp53H$tR;Z<()9lvF)6B0Qla*>I~v zStXsf*&5;EqIgS-{?z-ip}i$u|0HDr>+c3yrw6SVNZazPl)Eb060aHipY*p8Zw7I_ z0t}J+ToeQsSs)QsOzHN!I}Aca?H2mimiov)-R{vA%rbh!o1Z;TP&sxf`)ncNF_9apF#s@j}C|DM#{g+J|%w960Yd%D!ygVi@l zN)^VN5;_{SMs-F6G67{fnns`*Wq#8Z>rfO>VaCnZLp}XdKwk8-l$DhbDu9%F<*6xH zwXlw5vK(YUahPOlL@RZ@RjZnZ(j4j=J~~;md`B%oO{DlpW~Y+wb@bLY)iOLwK=Z{) zooZ;7{_M|5h3xR2d+p!3R-q`%kIg|ncRICqRDr8H=ei2J=G>63HH5A{n7>X`R}#7>GjL3*cE)$lJ(9c zUkr>d!p`6Gse^@f>#04c1!#Hb>tWu6A)6561uP1$<(Ix7|qn>S5CBr2nziqQ^ zaO>3R??@6y9o?N)`9ab(g4mw>rVc^FOdlM^e)R3SR;<*~{{=lP%hd(V3DIqwFEGWl zig{mribc;H7WvdkCUYJr{w=2{q5&WH+Syov-wTdvY9$gMY880nbpG@G_eajOibEi_ z^VY(9l9xU1P~%tBrgKZUSjQ_nYo6GW;BF6sRuxON_3}Q>g-43sCY3%Mo)pf6^NW5^ zt7nVGVB9WoBf7w_ok=KA|*CTdC z3AHgo_JJeNc;MvxoE)Q3r~mB0%Yr*?%i~2d8y>MG68oev^M4TJT)RW z?0YwcU(UkjATywK69VBvgz_~gjInwVJVyn@j3(r$FXt{eKIjjE;AHXw;@XZ?*@+zv zE$bJWk~TDecK&~MIdxxC^9Pr6zY6-xB65Al#htaL^ZSQ1Yt!`9*LMD@vsjcp^^mKT2kdJqQoML_+D%eWs^mso z|6ai!@j50_`%N2sWV+^yh(RU)0l6a!83O*tCjzPJ+Es5Tv7@g}{q>!<&)g)ZC_k+j zsLmI{Y^_9-7w2eA!${msFs)Y{(661_){oEe16*~!giXR;dnC<@{3t+Lv#$t(f^+v0 z4mby22XDAX%PkIlQ)u;-2U5K(`FB>L@zbOzB;rj)E*VD60r=mO|H}5wucx!}Q9oiE z<+Wlv)`LtEHrm6eh2cp})wHp(FYM97k2ggK-uzdP_%@PEozrsFVx4au3Nul~wx^JX z-(dOo;vpDaa=p=+&dl1KRE35f$X!kKTbgmqPD=f&)W6GT+4*+3JYGV(AFy*c=~K{u z^Rii9VNi&tuJJ}^#hM~oW6b4!FNLtwrVe$LFIuI+Cko|m{}O+Dkrkf$^toc;=1hd? 
zF`8WqmF_&re78pt%Okdac$O`Z;-}F;#Ou8Vwu$_n zNZlfy3yl6VXWS8kEEQjhQIl?7D6jqv{9{yKwYy4%bZp)#O>oGOx2VDmcVok|hPk+?|o)!?#Z^X8?YWJ6wxIpwZy~Ln!%~gE}2n@L#POR(?LUT&2n|;o^GDE@S)JQ zlC`{skV5n~od%^lPOT{|6{d+hbFN?ce3CQgzCgCCLGllEV9V+$x|tAwgAI30Q1m?g z#QS&3gjr!qIB7FMeqD4-==S(t6}Iw7h|(fH{Q+YzmfZwW<0j$T(D@dm3-`Xc5MGJT z&L9ixKZ)%jkGNI8_Qc#dMSJF|%&LW+UF*(VHZSCoA6$Gk3t()dc_0jCQ0~dCsG9Za(diNvO zWX^00f6?MW((@wzWrLAwc^9mM#?H=ddh*pe9wursC7c%;%#>U|N{QfB%&AxU)qVZlh7e{UQ`ZZ z$;yWzRj5?e&Xe{}2sz!^WutdVSMl@s-erXYek{HtWuWOC6L)U8N(9chMq$xs9`_}MVMqO1e(v)g+qN0v#Tm;Ft@y0U3 z&x?V68C!PoGJO5%lrfY86N7Y=-_1U7vUh5dG6r=L#6s@wz{CSW$o^xtdlIM&YP??h zHS&+#Mhx{RZ?@3U6V=uWGLL5|0XVMo&pG_>8yyzJ9zOhN`IPjkOxnSNA;Ab!-{*CD z00okLduvT?%7gjkd2fNCK%zE2=m*pX#0P{YwNcB|kg=RpZ<$N|f)nl+icJb-#e5~> zxI}^-#`PEsW|4c{ixlaP5*|~U0oRU8T63z<*^~=kXAi0K4caSe{?lSPOJoYQpIi6q zi3v6(gHb@@J$*Jw!kzP0IGg0hE;_|??vW$8RuSF1G!p)xZzhQy^9P2}3BE51b4`r( zP2MR*Y|iS1*bHMALUM>2HDx>hDI%K28nk%FkcZ@Q`BKw!rfd##n=$Y~(NMadI^XR~ zlkR#iFOU38=5=ZR%0ad6m0`W6q;2B84^?^g$o>dsC3lkrqrTWL&F9kd5jhzrI5+;u z8d-D!ru2^^n5;&tit-ETC#dZyJ0+>{2wrmHh}knGf9dInUrjlKJh|>9Pec*(54^fl zLd{RyR2dHFmL!7Lh}lw1ImAOReia!B(GX)f6lQO>XYr!um}_ z9)9s`&N@MlpLI49h3eb55$HFs0xwcaU_;F@Stq zP}y6GZ0JtO1K}30!=rOWMQI6eUVTk3ux%3hA?}y}!Hddlda_Yp7iAH`@`{Qapu^B% z(N<5jSt0Dn3sy^uQfO({Zj#a3<9lQnk7l1PmA3PxzO5dHk)cUYu1GD}NI9f%f|D)p z&$t!Sl+M-`&Y6OAx^%YPXI<=UHdF^5H`j(biG8`n>aS|c%LLTlPSy=ziv!?SY94*>7lkPciry9~EFl{hU!Rjzvt`K;oNwtUP4rTCw>huZDw3=T6#W6qMno^*LG`djsq?q&1a9kYkK4h(l=rskW(bCiree+jHS;tl*p~jJd++;Mf4>fq;9@Gyd_p&u( z?wl~0oGQ=dZt)%$gTdDLf9(}}0Vwq@2DY=+nk+h`W2$S}K|DLSrW!uXGyaad){F2N zQC5)9J>;Gr*3m|Z@*~cUlICKOW;!FP>rDQX#PmAT;zVgsNl)-ok&fEclk#S)HeH|r zl=30#krw)F;`bDqHm{?;ncubKQ^2Vsiy1C_GFG{ijHI{gUa#rI)DRsL36<;l%Dz#_ z-j(cE%WE<`@IV^cz)AXJr^ll8AhU@V#q#2M+;FKM0>S7e-cX2ceJ%>iB`nGH2drat zP+hEH#-T;ajkyj|(orwOa`|J8r9$-tH#p|Fp26~nIMj?nXeV_E$dO^S4$S4_JZHP^ zd8*Zqa74M#!HA%AQp3%!5y*9Tr6LH_~P3);!K+Zc`Q+Jq;#E1>L9#xUC? 
zHtH7)1`d)UTXkk?0x?jZOtI(b{f*I~BP6H~UKx$$Zk}YZD z(is-|z`_Q^j+W+>)1}Ws&4b~r4=2W5#w9{tqX{CT!B@ZvUJu0d%J$a;{uH|k(KVRJToXpc4Z5qxeQAhlzczkApn1=(yS}jL>VM~@TsR`NGIb z$at|$Hh=}*s-n97W(Gk*Vc2biEtRO0Ma2gvte715@6z$asaKe>i*}KU?w4JBgf2cJ z=i;W<$$ybL#(t-Dq9qe#Xkompz(BE{<;+KeiN&8?ka*ASVRorv2u}%3U%Ebb3fts} zk&?o_XdSJnvnF&|o>`y_J^kF#j&qg}k{}qyXzRBi{iaH#A%L5XLRS$p&rkRhEz=@pya(08_SjiI^;V!ZRLih4o z>E>tCI>wTSwI&&&(en&7?CRVw3_vNs&Ln{&BGo^OFf;*Ha|!N*v+c4K&L)N&4^V&8 ziYC_eQ6iay=gwzArX}6&3|ROxcU5mM325)8a#AD|mCwtZH4NGp#``6XOqHu~AzZxFl(fx#U@2PTxCmIxi}~Kq%Kbq6 zg_3sB*zj!gTIfv^ckA}q0gt9GM{#rTUg4IpQ;NSmfDpo@k*z#bCbqt{JR7B^#11s< z8Tcp6XC}q2*bDs+u+dIq_fnP>w=5jwxtB--n@w<2U7aTw-Ih!)4u z5`H$lCv@m4>=kw3?a?Jdd`6%qx-@JCqqCi+&%8- z%t(caS29{2_8I@(^{8R|%uDL!oqOclA;R(R#aJrT#i+K6 zC9)9QiGSbw3@h&Hv)t0Q{lh_2mbiS8LvQ-5UE!pG1m+xw-ZI%o`%P-WG(ay+jM zv9@1p1?ySx=`%DSbP1P$yl*(`hMxa@!OmJ=FVJ+3O;imuo|9mJFYacuw&m%R?^*82 z%YPs}iM4xJ>6tj-pPxUD=5CIB>Jv23>p1wFXZcec2q)H)CEI)9kG7_VSh@5wRUXfp z6`n~>h|S+g=68*ke+guwzE^|r!0fb+%8p{RS7l4hKEwLCJ|YHB{ZhyT6lPoQ`yPcS z-iBSPels~%xC=9i^;?uG(r6y9TUx$U%z`wuw8~n>X>SW;Y?pss`BWE0uZ9#iw*K~$ z{tdcxCrlayNvn_kZVr)5a-Uq0d-g7>TmB>`;A&vh*Vf(sCM8o_ltrFpo&OR)TZD7R zm#|bNp76`?B9ZQC$$n)&^}=Ukh5H^>9i$KWn9vmlJYlpkgq1Tin!BHq$#wCEdt+t! 
z&&_pge{FM;@hE-K41{!Q*imJtp@T8@zE0`&|KUTJHsBx@thAllY>Fj)_7Gqe^bxx; z%CgthSg;fP6Mlwa*KWa4WTW3U-W`lMr#+YwCl7g7pS3tTQ3|_eIxdozG%zKDfl@m!WiP{i74;b zK3>YSfyxUCuXO9G7ooE5HJe_0ET>9P1_Y1&s&Kp+abL;Yz9Z87^D|DiPk%#Uvf!(i zl_p`uYQDU5?G>im#1=2-T!=%voO9Yc^^93LpT#V6dB!97w?lJ1sfMB8lRUAgcR2oe z(C!1fyfXcwK4G}-nb?7B;je$@9{uf9^k*W&(fw(a2`S~POlWIFzI+tBO!Jrj02*l% zb-hC&T#^-HgQc2AO?8vfr^Y%Kh3@obS@WetZ1A|}W_Y+U#?#8W%kFeL@6}Ik9HsLt zVrALVn!CmK;?}5GJct>!pOgj-V~y<@x>N9_3(()s>TpQdMD0&aBcw89JEJ19KO~q; zAb)-!n$xxKO_L*$AQ8Xn^^$CZsDpC)jCMeittuON9Ctt4d;;#bh_VZX+>g-v41muPm(nvkCZPI= zuKOFaz+1~B-_lJRt?TH$#W@5A$6l3TnX7y?-u;T8o^n(Jf)RNGXXtw!c+X2i^PT{7 zXp#ypF61Gx@?IKn5l?SUJAQ}>}c;MD= z@%4W@G&_7&?YO~TqLu;b8dOYO%x1}hJF(4WO9PjfVE7W0Pb+v zzRM@~Vz?p#dKIX-0MI3)r|htowmMETE9_A zrLWEPrM;*t|824__~%oojRg*goTb^^1a<77$MgIi*2IQwy<`XjWfIX?VGy&&Cl{H% zHVlGVY%ADbD%YDK&#$C80FSB-z@;a%4N&zi{Q_r5kNSUCUWIQ?RSOZyG+F(sxpFw~ zPC>60)(PjMg=|I3qDTZRzXDe)V$AM`r}glZ3>DIK88CbL;82`=;b#W#DU1Ui7Hg@x z6GBb!6fa~cB)1by3O!_v|D=VKIY+lHQ1r!>3c}N7C*Edi^)vQbSBct)@?Rk}*vqU>4cBO_-m*$cl;d8OD}7Z$gip4c~B( zdRvY^!|f33J2UuIYGFQpq`#3R4Yhm``DQ{bntC-G7Kt94^o01AT9k+N-NzpP?@DxU zjr5W9Uls+w-D2z)1>>h7O>T6|Y+_^xB%Izl-cR`&@Jp)*F>*MUX%}6t=LM`WLw^5O zlSnV=WzkH$r1m>vV81e@!mn4FZVLTn<5;2FFjpdRUdm9Bv70@rCYqDWrqEMx<@qNfU(76P-C*a#_01T zFTxj3tmJ(=oXk?0VBb+nNUcp^AA7sOEI zTDoSZSw4C>a*|YF#-HL4*(4X-^};lDM$bbwrciK{iKbU`nu~FUt6=KnQy|?Bn>=zw zWTOHu84J8Lr*pj&TjC{piz=ctqMT!9Wrxjutf6~jBebAsLRZCdi*zyQdKAgl@z1e^ zwSZ|3YA%&4`9dR>yBs~PE9S@w26NCn->KM@__idA-!6sGG>|TJYamD_MGQT^ zD*5v&#>Weel6Olj^f3cc5`7Gu6EzMh-A(ccAM5->&X;Qooq~BH%&&>EEO0_PUwN-5 z-sc7RuL%U~tFs}ii9>qD8yfEw3r|XIolZS2m(D?gTSxNDf%&1v@jh+sJpG4bM{EB9 z@LnjIr~fZIEHv6P`L@VGXc%H;r9V_cvsYgqWg3Eq?jcb-qWb(Rd(KDYP~!f3)1Nf4 zjaWy$sGwqo9tvVVCN-?jGWou8_hPV_H8MYINci#FuqnfFCvdb;6vw%OwyqlO@jL}n zT*3~Ut!~!|f!$VmWQ$;-Xp5S5BY5@|u+T}hl>{CX!zx!o8?+m9;bzi0_D^K9z4Jc+ z0R0ttiUyg;Og-1Al^5t(g1dCGYvydl!N4FXF|B}D2=c>q^aH#NgvQPK@^!(qr|mgc zfLo;J?CXH-<&Rn&U@vMjt@{olD~H>lx}k@8l;GLn#$3NCX_RY6Q^Y8jMxT@YYmVe3 zO~t1RZ_VOK32?z`0#E7Z5M#I=9&uf0@%?z> 
ze*lcMqrkV$DRx~jf}^B%b<%Wi1~^~^#auD&`(e1K>tAQ5M>i#d!eR3HxnBn9)@%hU zpNfX@VT%zq22`$jY45>8gozP;f=f#Nfcbuorr2m&5ot1HkBet$Z{6CAwH> zjqV9eA?>5z+fjAh@l(0SNFUe2eOv;D06E~<=D@q5S(oAn? zqlL4xoBFs!+ohmZ19IW3cXfCv1eNVme|Gd!Gqa8NT;VrV0hPJs7isc3ySGhpdT08I z_9vrtSDr?NqGIl6inq8K9091gtTGA}=5E5b@e+6+)eKUQm}-YRy*wx%N@Hsc&hVxl zne6|4%GZmJ7gV)CM9w3k#P!2LhxV14nkRm{2gX}5HecTX#D=z0*pdDRcuE~<;5E%- zp&dQ41C&6hU%;eLTMm?lT-W0(`v1Ur)LOK%ZjktjEJdBgPh1uH{CZD@^tU0J-XMurg%&IhS2~$1B7I{#JO2eR8D!PL$R%7&oRL7@3XK zK!h8vMS&Vn9|v)k=mmyA0OS+w3AQ&u%)geYjz>XXOH;=X)~>NBh}8$6|1!rNFW5c*&Ivs5gslwp+~NPw%L@2Sc%! z2l_!hZ(~(B*sAC&vApT%#N9PIro7$ObpEizkHLYbHk~hx6mF$R!R?OcV{Y8+X05MA&Ny z{`^423mbBdTb?H`}VD57|VbiH+jUK1E2w`miSnWliKdbLgMLU-AjXT~drL)A-)6N(Mp?47vuaA46WtO7P7gZw=Fz}2}v7E>*? z5=$;;yGb+k9f^naq*abCUn^?WW%r$HgS4#akT4FtCZ~g>-r?;j$$GV+^cXczzTO0m z-owP6L5{3yj1!=)g$o4eq? zmxj#%pSN2t&DA0=2a7$u55P&M^3AkC?((;cj6sxTjkE+1`WFP)?S7AhXfKrS*;04E zDN^nx%{=Z`V@)cwGzI)FKS+Mwp}{-^v-`Rzr2OV6$d)a5voDpGytj~DV`Z%MV?;9$ zS}P$l)p)F}*V$f6aNM{-8fbCa1tW`JGClj_papJ7hrGI{a~m#HAa8IBBQq_~c9DLD z()iz>Bnq*CpjsX~4m`=|pHZ<%>C(>^Y zj?7hGQ9ig6^XBc((O=~nM%F$AD`EOl6;6eyI?zZIQ{ahI z7$Url_i-5FKIs{{VVwXLynhdzHHIE9Pp=0z5f7NaeJA0XVYeLyJInJ^r-d?wS6hFX})UOQV(ST^Lgjn#zc~)!XDsWT~(?6pJMH7{Je# zov%cT)h4$X(L>Krn@#sNqHz=9MUkSYFi)xlPrK)_S`F=cQ`^X{LFqtyKN%Dezw%P3 zTgMA`bUS4%y(YFdWSOwRT)^!7$rfdv7V~+=PLu#S=o8r`<4krMrn8<+>Oen5B5?iG z1XWCHT=FKi|H>F4+cAesTo(f4qtQ_eIK zl?`j9VpP5M8Ga^J@Z)wgD2qfSTgzDL$mJa%lF2BlJ1%XiHDOR)D0E1)OsLU?;a$TC z7QOCI+x$EoIAl7(F!k4;D!tf-S9W20`Ye&Lbh_7 zq&NMZNe&N3Q*o3AB$(o(-MUemA0DOwdTZ8u?-gkXg5bMtIH=tY)ULBX$;7w?Ci3M` z^g#Oe!f69KM&%r&TG(y~_N9=d{2wy(zlTR%`ToN?-@-aA4ZX#+Wmdaq1gb%AVGSGT z6zCrN1Tt8i5S>y1Rqt@ZaC5b}19^gNNS}a+Mpve!CH|;hugk))+)Toc2P_(i1dW#^ zF)VFbITHHLCLFd>>R0eTgmGk-U@gwMcj|V^9)sLF|f{G}w2>URiiYNpB)BCef z)h#r|Epp&H0%JkRWBwK9I>(5wHCIS(f~0Yrgi(NV-1?e^Lf5kqb9KH${{Ss8IX-}Y z2&^Wznk`OMxaBs2ta=<*^o}0`7<@?nTE*rvttR5Ehu)R6kIYV4jxpDo#B=1##w5YU z;>vJ&>spr=61M1F$7N1LB|6kt|-HrO_KeNGZd2yjhu|~4M0=6;eXx#0QJ?6GUH){ 
zACtZYY7rcx=GaKjekysM=De5Ko#Jw;sGxu|yS*0ZCz)B>3%6+=wV;c>7h(Z1u6BZR zO}ATlG310&Ily8`$E9Ycb68UMtY=;O=t&jgzBc{NzrGNwgxi8RKT7$0 z7sP!oX&G{Ab8hPbVY~BJ^nGq^E>AK7S|&dhm6 zwCJfvQ|0MZlwZ1g){)|S4NDP*DK0Wp@N24!t9$0YUyA+hU1x_XIM%L<;^&|Gu3x`p`o8RBZ`Hi9%lS=R#CUEcCzwEBhr+BY7@4IE>@5+QGt_H3adGd ziOZby;-~WGj%Sn1+>MQ~fYPUck%=@;(zgU{aU^{wJ^x2B~jJcwKOWu2UK$-?{8 zGO0-Sbw5yll?xrxpDA%FHzo0c2hyThk`yuQS0taG7$kP$x@o%{RW)ZjsOUDDurm+c zP6uJZuQj`|n#x9wP)5>5dsn($LXj#m$fu4#Jm#=(EUm8ZvQ7?23w*sKKafRPG?BuS#Vo>rIL5cK#d~ zwsW^RZgX832#y@!aCrLHJD@R+&72N1$?aZ~;NJpke-JKq*ngwxgK(Eoak?WDO351GGR#gks-uO6G?~an1?HuD7LTLbwdSF5pC$;t^JA$b z@Txnp;8)b2555T5X>r?YJ~GCksO$_aKjqd2KEwTE0fCmt^cCtJ3-AYryc1$Aw3q>% z)kODBH%~EM?A>wb1_10Ux&T+@xz`g`#jj^e-H+nW+Bj!{@cDhQIFEXxTRoSd5LEyG zxC5{l6(WpM_32COE9G^2pLUW%sWL0h{v&)&@F&7w55I~&AZqYw8gOEY@{(Rz0+#aP zjX{!G2vR~Kvm72VUy9!sf8e4&6+A`bn;lExu#dyq9*K7&+uM@Tok~1~{{T%5wOTe& zkcz5Pci!9s52VOAPcqIqrG$sNzSG+My?-vJhnsN@6E8_ue)H(J^*?*4uZ6#6zuSAj z9~rc}9TUb^c77W1w33Tg)18_-Tgh7!BXU? zIRlFQEd8cGX1y2ojquDmMgFaOqv{p~p2F%09qpBvjlz3f>lDD_sl%`Euo(yW@A$Lu zQ{oT7KM^;FynCg|V$4ajxyrr0++!GcU0H(;Mn-a{h5(BF8^esJCdE{8#lCvWXg#gp z``@Q-r^NB5WtrjEE*)~ye`#<300a8e{eb2lSy_5%jO*DmG}`IyM+|AmES2zWn{0{{U}a8hj;&KMDLm zj&BtBZ_JHinpxc6Y4Pxf%XShOVpJIs5!(vN39na+JTuke@{PK~{{WGHqxn9EpToR! 
zWU%><%3tUHXYbbs@T0Fv;(QVD55eCNbXfdD;GJ66S<|E#^X+9>0=WCgz!Fv0xIHVh zQ=V(_nv~^EPBT_(x%z!6!lXIi?w?dU8OODJANxvv!u|sI#cel;JQW?cg}hd~r1Rj1 z7T3&n4Yv`g0dN%UWB~4BTX#eEkgqNHsqvfP-@%^`zli*QtHY;3f*^ELle+%C z^ZD~IqlblgM+q)y-$!C9QP#ej{h$8;Wj~1j0JH|HZ8n>0Hk09hF~!cmaS{745#?DS zQpjhMcguu3RD->SFn>_IYySWQ*!WGWcoM@w@MZnpkFRNBLYlq6S=#xeAfL9Ph{J#e z6U*7NIRNqxhnsQMRhQR}9=w~A-rDl|{{X3>7e%MpG~<(ltC*gYkRn( zZN7E9ply~jka93rIA9wfd7_4m!^TRokW{Wg1RR10=t1dNXM8t+;tC$sz1O=nzt6hz zG|Kb5R!>&ZS}o%L0IkpK{DA9UF8xFHZs+@B!fS{ZRFz%t^yJUV%*mIyg*gREw4h)s zVM3qUZv^=B!T$gev@3rT_$OAi)->CW65`T9B8LDHj5uU<$lQzrDd2%$g=RS}X~Wp1 zS6JEmvs*te^GyA70h(p`<{80SORQe&`tFbBljE1{@9`7%e%BjL@twJSGHZz;)3t;F zZLfr2Hc)S7M%qCr*dv@1f@|ErW-r)}#lIZIuZ(weGsU66I9OtRq@{R&FX;QD&%POaF7PkH zPYpxiKLP1b>6Q^Hh6pn(&Z8UVkmaOua!F&&59v=0?V@+{gV#Nnlp zP!{uToQ3n_DhXf!c*r4!HnV+cg>}OWgX`Y1%QHO7hjEhST(a6xdq3-=MPl%oetm>i zmE+N_kNW7(3HK(6GK=yAbd5`X@vx__it!nx3=F50K}vs$UDE3Oj4~>CDwxPe zA1hP*-)H1~vEt9zN8$Ff@IJ%gK8>nh+g|9eadoHabIl}+_PdSbnXSVJjCo)-qNxEv z2PBjDpYfC6XT(o{ek6~>e-LcqiLVTjSY2DlC4%-C$OcIN0ElC73>X3l?m4gO55s;V z@b`;!du@Be8jY^2rrs|2F0AEQp^ySW+UQ6)AcN>Z2C}?w@bkfc5%d}SHREj_*I3i1 zQVdryhT2FYayQlGDN(dnS_0WJl4B@heyKP!D$zpMe^X<_eg5R}Y>__8&hZnlf z#9swX{k!2HX1}}hWX1NLfjr4tV&@XZtQhZ8xG7LnoV9*CbbiARb4 z9_o^5moMd|(kKlL+%dRu6v)7aQb$|{0Gtt%U%Y>_KkcF5kB-pUcw16O(R^yq;!Qq9 zEe+el8F{6)kwF_kMs1IRtAUn0*Yp1X;;#sJr@-3W{u}Y8mvf`)wxoHsmq=1Y3Z}MVM+6W-!K`R*ZfxByY~^?+jDnIz#a&3~K^*43`#s@2w>+S#!P;$mvtNbRRsAM> zj!(oGyrQd(uNAys^|}3bBW@J>{AvA`zu=&^)|VFk3HbYVl35iI+IWbeSA?hvk=>+G zw)JoM?r_V=kgz#l*c}&J(tJUtTI*UJ#EbsY{)6*8Zdm<*h8QIu?gSPyKGnsXArjcQ;jvqv6ZmOW|8 z$lM;ZM;YA3&OzzzNbr^!iyMK-`U(wOGqobls~+AI=hm_a^9PorX#+K@dn3-jAS;eI zKi0EEY=utju+CWLrEg7XbCRbwQ#%cBDu(%k4s%pyVkLdO3F5S_qIrO52$TcK2c=ZF zj6|r+r!V<&U6DyGPHEqvog=Zr`W)t*9OIrbO_&nFhfb6r^CR8$7^_a^Egk;=!#0op z6HPVb{{WuK0rw`qO$Xf{L0^~O4p=vh^r*WD)c!`lP)FS#L0_HmxBJB}Mt+&VpTMi? 
zg{rrzt!pB%$SP{pR=fWIk7)<{CcZv9qwXp@L9@z|{p89+mDor*85tGuf5!sNQ zr58M7+P>>WDwzjp$?eyg_&ei!gX7zb9Q=a5mnyTU?b!U=F_-Szea|9(w90dJrq|G; z$3aCDPytE-G-ABh#orS&zYYnktpK^c0H)vIMm@>(6%(3@xurT&o3v5V_?N{RXND|g zxV)5k>MC+Ad1sY32_rf2y*iiyCk1#{KVCDi~VZt z?(gI-%yY>mzVQ8m5;;6e;#k~dG#2BIy;{7yMj^{>k4~Nu!F$`Er^y&dRh7(**?1qV zO*FAV3z=|Ciat;|R{GQ;AjBj}(W>Np-F~$U_D|)K(&c;c)A6n(lX{!ItaFVX?DyuP z@|raRac)WHwra9TbqhRmg>WAcx}Uy>x%4@!LVU{66k)gck3;mYUN1xDZqUb#rH=%3 z^{+Shqa=gCTF4|hTXKvSouHA{y&8DaDBZzr}3jWoXb1TZbt-`9Q$UXl6DqYDc6k-5C2>fapWQyHDOiCQ!`x@wp zymmG`8dXMqE4Pyk|aUzz?5n|F=0LAU;2t^KQuM z#<{BdwWV(&>o46x{$-yI!yD_itso3Uz^+Di{SRvTYQA&O;tQ-=mE-q|Uq)BXXKwR_ z`yXGy-vw9ZzaXNDE5ts~D58o0|J3*BEWDQlq-;OB2~ql*lI5i(!5&;ufwg-K^}(WN zbMrV1zKNgfQJYqgl)JjbdaLnY)T`drpUG7t_EDPx8D+_Dkb_nJdv_%pYi`YHPN@?F z{ts%#mv(veJl9+*X=92|u!bdOCQOjWClzW~t(wovgcZXEJC1)ULimg3>OtG}6-p+Q z@&HyH@sLNRDxRhjX>L}LSPXh_D#9%A!z>_@8w3pZsljDC*ajn&wKrFkTgCEiNyDmWkHQ((5bw2(EFh0X}vF^{J;Bp6AIP1KG^ z@0yyyRv6mZ!wt?`Ipg!Fe(7or>NX@}zB`DCex#wn{OhB#M0l1nBkq2tPH;Vl;gF^NErlIuKYTUP0h*c zwFc8(UvC7Yw62?mFmsNT>Hh!@C57+qA&ng{gFCt&dsolq@BN-f(`5;LVkfRe8#0~j zNzWOt4e{Rg*7u0<7+eNDN%~j3yLntH7SE?@@SliMV_n#Lp#5v+@DB6P_H&t9o~IS( z1yC?DPBT$W^8AKC`Sk5cuy-{Xfn#4dbm?D0qU~mUg0+f=aBLoxVH^%KQ8r2G)|D0# zu5pK62iBJ=DOx(6c7x&>w2PE!T1SMCHU=Noy}L}-Y_&DrZ29g~k&$0B$s)3eY?(_7jP$Wn^{a5PzL;Vz6G(^CQy2W-^A7)gM#mAW>d_;m;aDeF{OSowpq~Z~?CE zrbGaf!LJ6k9(3NQ_bFF}Plf6uw>xoAPlZs#g(L2h&=d5lK&uSmp>QPa#bTMGNh2{O zo)i1_-=9uKPo+mCqB?n*DgnnF9C}uqjT#{1*i~t_$s0!i>H+*~YJJS6uB>7;vq%WH z2jx8E)mKG$oM3Mm0GifhlOa_=KgW}UjCRL-)_Rpdog1 zpO}-|+M{S7ak=E%=xXGxEC>A}J&H&1cgo>MJ5CA|;4fOMz9$ zeZ@~fR%0Rp?=O7fqq~fql0;4l@+t;2)98|*UX{Yz8mCgf>MS56#Ol6>_4pScnn9`~`MgvJBoFr(anUA0YlU)~t z{3qg{AL|XF_)-|;k}hs8X2`dNtOdv00Np{{U@&hra=&Zw}ZY@h6I& z0Ww%yKHGRh5)|BA0Lq6e_W)TMuT%RHy8kNPtvO(+H1G)7moZ%@YlswJ~Z%@(Lt(d z>{Wun=6Ef?cs^53i~)ty{So4)UNWAsz@ zRsDeVUlYls{7m?@D74VsW4yA_;&~G4C=w*w)zhVuh12m?9&vPIa6xy}F? 
zKGpDl?92Oacq`*NTlizevP0qj02vtfJ+zFmSzMnnV?k{!kCu``;}IX5t}+FE;toJH z{J+K+{MQqXf7-F-lG$5Cyu14KKYHPOE?-9ozq8%b-SuzJ{DrOy;vd6b1^iU- zT#{4`;wDz)1W>p*zyJ#UqlG+5Q&Ga_8|909l}dvYi)NRwzQ2= zNMe)T@17lv{^YE2O{4 zo{9TVe%!tj_$tzPL&7kB!b9Sln>iXcf)m3AR;0aK0 zr{@4EbNeHwYMOSdX%+UFaTV3H>%J+WaU^cJ##NM%MnM()h>$ap*1nne3Hw+4R`?S= zj+3NnO?TmFZv6Y58g)mBqjEgeg_(CkSPcB6F6^9Sl7AuM-wmUP=Edc%)7{6T^z8h$ z^gl)6&NX;wsu=6aJsP&2zDM*k@lWA5f&MV~W={+F>rIbQwIs<2a_aC*0Sh8DY>o=& zI1B@HJXhe??HT(R{6P2-yTlf&}#2#FW0&J!TbsRoxUM_fAJyF zJShj*^y_%U_qx5(vf9BBV37a+Bo&U z6|6g2=&)@+Pm)#2v$CJxI;#fq)PQ&l%U>FP(to$#jQ%8CYCaD5T_b2-54nNuq_}95 zYB$i#in1hyi-=b{fb#IgTa%m-`QL;9D8K`tudv}C4BVV?d8+>aw?FwV-iznh`EDZP znAzd7=dbgBm-(G9i+pF|&xk%Cy76C)^-1-8Uh@K5n}*3MoGPk}Dk6@EfG8x8IW>W6 zWp#IR9i@~K+|Ma0kwT_P9D$M*R1!`Ga0PnL!OwyJ01&=8c!BgE0a%HAg)MP=bG|z$ z2;?M_DmYhes=Y|&zj(d?e#l=6{s-IGYgYCPsrb4ZUnb{SxMp?|NEgogJZwrttXBZ2 z-M1q+CqHk*JYRsw=`1}pd99-!p1qgie7;M=ILz9ksV8Xllm4}5$Dgs^>@TTlT9%XX zi{j*x+f5Qff1_w{O7P7DtT#n9)UoYg%A`1nSYgg2#@0stc0d(C0|0P6F-Sn*_xjhs zpS1Vw%i%wS3#oV`z~ObD8bXMNAT^DXN)Ak}8|RS+K3MKXXRxuo_> zXs^+EpRUt}2P469x62jxxBh3=UlhJAcoX3FhORtS<>Tqf?(w@C*^ck*Zd{VuxeF^Xb z_LumT`#$R;e+^x=?uBu5E;S7f;m_HwzFmZ?VT=;(#vKPcS9-Fj75m}v6ZVz(d-3-` zj{g9|cLvu}i9o*6?pP(oyrg7+GyB!R$aKiZTpWS?;P?aK?~6Vrc$&|`ehky))HMs) zQZ*5y7~plnvAd9`%`{ z=sGutY$DP;KVfU7*ukGF*3J~Sfj1IP*y9SLB$9AC*1GU3=YNlXwBN&T+4E1flg0DP zaj9w^V6)R~!pU;`5VrRPhnE&WUF3B*0N3L994;djRyb@lT%|3N>VBh$uSX9DTDwUu zuc7of^Vh2X0Kr5*XsNtsCyBJ{>pNX2O}ZTq?9sibfMfS>Azw9%W0&2KNya#@ z*5BFN_OtkH`$6fZ4JLo=zY*G`X7fh7brXhUEX{6mJotfD1(aoR^C$!p^cl|u=6P); zTD)%_yt>=h`my5WoK=9v)O4x4HQ!76A5!4ajB`q-H0t?FpGbJ$#y^2y4}LiKaA-a^ z)7E#lSe}CiZXQLn z)~#Y9AQR>baDcJ5kg4dzsoL9FuuXqU6`QH)I&QOLai!|k(Oy~I#?ahb&Vop#jm8;W zSP(%SL9egjjxxq%)h!Qto8B+dK@KWIDI zZqWY#!ad=uLZahSyM(hyWCWtnhz-eiHmB@SnlI2HN;9!O%$!lm(?= z^UWxEr+wij^RSlhsh z_KSmUA(AOBowpXr+qxipv$O&ZPJYq-m49#j6XFNiyb1A|crZuFqPw+k$jb@FId&ry@UWMXli9QM>Y70&gkb6! 
zNFa=Y4nYQ;Ulx5?=l(tTYvE6b`f}KKn%B&DL;afOGqQVTZ~zOiuoNbH_Id~8^b&-Qdp&3uN#i4!1N&3 z?3pfqhsObKn7q2HjcEJ5A4Xn4RdVK$1a71-%APr@=_@q8Q+EXZ6=@I% z{I&iOUh2P7#L=TZ;Ib8jAVgD@E6#qkTIyjE22vHhd)2#;`BnprRar@wYK6!Gt{a%m z88Ddww!`<1TO+zl^E_fQKvraEA;QC z2cQ-CoACP5Rq>XRE(X#r5%KgnuhS56Pp4}9^Nl3_u}jgPqi`x}m0s6E)_$f{Z02bO zR3Xi1ReP|8+EF0Nl{ppiPq(t=U$c{_Ec(w?-H@1w|S!BN~z4W+x? zT)L!DjI%Mo6`QMT+E$4fZ9-tI6tGPE&0jD0ui_r5;slb*%-LupBr1W(9Xab<^()3Y z9W*e~sj54#AN)<83E{Eu^raVRGD@s6+4S_qcooIemlC}4F=unRn4FSNy+bU5L%1HC zQwXm*rDoozuS$$*uQE_c=D$FHV0PW&UlaL%d|NO0?VS^CqxBXyR|)5?;1435c%a+Nv2eQ&dbY8n4a1?TH<7Whx{r8JRE1&J z@veAlGYLZaqlfZ@ep12}QAyl6B!W1{zG^~|xyuZm)RC?b0&{`ZpNtV++;%=zHoFIp za30mdd`*N&;2mBk83qsPE6{6#_?%gM39GXI01O15pdz(^;(mDjj7oOw%78wVN?43e zNAM{G4z;(uNCa|&gU1IL&1Jd8%iEzWsouIGZ5@q21q}G(Lu?=T{NLPBI<4w^Iu5U&Sy{es@;#T;Qs*ORP_odqP%15f{G}h5C7Hr zFv)HL+O*8_s3Y!UpyMX1PQ|VKtA~)5IZ`@|Qcvbe-!KulKQSHss@g_nK^@X5QTKMA z>(;-hoRUA2n!9Mhf3x&?Bn3ab2eSNB-l1U%QmGE#t z4bwGXd6vH?+Dvf+g#c%h>0Lb97<)H;OzCk9j~P?bwOxWMhya=6A$FV-z%=HL1uh>qM(w_3zjJIY}#;0Jq=m1w-BVt8H%S&p0$-p7fvqvqE9_> z8je{{Vb?Y4zYftCP?(naigf&I$

1*>=a5&H={)y-VSw4<4g%(qQgPf)0Dvi<^JD zNcXavePT8}s=No{7*^Mt{gf5-2HrDX1Mw6H^}>6)3i;f#bbA@XUd;3IRoexzibrN# zBy#Ul)C2tLv#Bq^BAp`$wv{95E9lRUO|@d_+O7{}s*(V*GvMuGoa2gx-HtHS(iz7j zk@cWum)R<)g+XOMdp^F@1vc`M!zxD=3}j*VE^=x#Zf zo$P=p>CS35&eNU_Gmoua3=bWtlbFzM+!T@Cxnk_>&bFPdWR^fT79jd{J0vPJXHseqS8t(`ZpbC) zU_D9bIl%pE)Vw3`iuc8+x0>?3o|Q5NT#d8Nt^L8(6p!L=PhpO0)_e`{drr|=bw3d7 zzq~7f2^;O=KAIlUw6~O;x2Pz9v$XpERA>`aL+Ltb^W^xB!#d zqmoGCLnMq9R2E`-06Lod(fy_W0N|khANYGH@V~*aW8!^X<9|O=^SsL&Cjz|Gvv|qzNfZ8y=@tyu} z?jvWi(yhmtd3Choexrru*SlBX29Dp*qk-<_9L9SQFUl05x@gG9D@b`=K z%Uwp&<_Vx5Xo5LjA3X{)!qO{wasYCy20;M!uf=$4k8=#NxBEn#>AT%Gt^23k{XaS3 z3??$4HW?^R+pU^ElSX~AEU}zAFl9L9N}Pf{2t5x`U#MTR5BwAtUGSXtz7F`~b8)Na zg&7lED)%~!61E_#R&RV&;rC|UW+5_P z*sA2^$_=u{7a5RkV%hls8Lt}soquG{h@T&|dv{1D(L5~5&o;X}4$!bd0Z0eRB$VfY zkT~Sm>zS5e!};D+e0AJM;sd{yD! z8)z0@DDb8HI-ZrN#IsvnTej%rM)dR`@XC2o2p|webT#CE1%3{AQ{ZQUE&L7PA35z} zcA9DK3Yn*!@x9!lsOPsC0|O@&>UURG_E1T6cPc{+N?u7MV5o|tB$hk?0Ldo5n=uNP zwv=$~xZT?NHo9nj;YM?Gl&d72wbw=SED{G=XPFs&x#qtre`#O%D3*tzoBsd=d=Lq% zY6joSlf)Z=vA2>?bM{~}=gAnsEy4nD2_b;_Q}%QHyS^a&EYwDq@iOxJ#r_()n|#t- z%4gKA2?Q2`NgFPN;IwXgWeDhg#|h!~abE*0Y+#m`QF=-Geu>)W&0;vqE~ATBSxWsb zwoLu_HZ}(H`#{^oZi4LdwlA5t({?kDY}J-zQIo}RpANnycrW6whm*w~1=Vf!Z8@Z9 znrnC55>&?M+;vx6jDuaY_VL_A(aNEbfdmndL9dFbPH>Z_SvbjWbVH}=Bk(Fa*r?A0hoxxu={eZklp^f)$hyK5k8ZT--r(X>lc`>`!kzF44&pidE6j;gx~p5O^ZL zqj(1clT*T~xpeueKZ(DK&0C`U`?K)wFRz-{#VUB{^1)w~Exm8tv0i}J0Ia}yFqMj5)&@e>uC(!zhjbbodB zn)sLbF7NW|w+B(mr-|goJJ0N=@c#hA`kn#%PJh8W65<=b8vF%TH}hRQ)A)7#h;4*Y zoPlI<62;{ck15zIDac?~5BU?TXu8LTbt&|35Ln!4xAv}5dt11g8RB8t<8T=p*nn&L z75$}uZC{6f1a!--A48ho#C|ajU|lW=a+a(BEcZ^lBCbFPGVFK*0f_t+_?z)7#NQXZ zZK-(w0L9M}YI>U?b%i!X8b+ZbjTe#?5kTaUdXtk`cyo-->fx`AkM4fVo|5@@UVoXV z7-90d72I&Lw7R9L{{VsgY-0FR#hxPYKEG$;e+z0C8fK+sAd)MKX&NUO>ZOo^Re>r- zc?636$o-gqZ$AlqP=`w8LpcEnEEIkX z+uB{;+(l)l!uJ<5y2lhzsb_a$096Adlh{}4{{Za4{{RH`pHzoV@&5qFHlDyoaMHu! 
zrA?AZu{!x}cOz$NKGw`ixhTqat7HzZ5HNWx@P5}Ye6>EyUlV<;)px$#(a*uv^ElSF z4_I|i;r{@D{SVoQ_~H*cc&np>+I-8;^GI{wx0&`BzV zJL5krv62*&I6Qo;q~^YWg%%W8qZ?5aDyZxS0DcwD_`}Bj7Vr*-H;FuBs#@tfWxhPi zi>U-?3E%;qr1k@v{&m4t#8SjUrWWd=dRu#+wPLAJof%WaS;4JuvHaA0ZT*@)Ec^kz zf-f9cMa&jpsJV<4K=>5v0}IF>fU50*qydE2`L*U}%eKkOOfZ;BTd-YWR14d;eD zD$)7Y@u?8%SG@d~jzlIHo0GY~W?U5;STURSrTx0KPm4&t67UY29+jck-bZwo_Zug@ zo_lA^az;LA;!nN7-d$Uqau3$Ovse5SC&J!2SUf55iLHD)ue(0YNq=*uT<&nOMgzM% zQ(>?{$~N4F1-6gfGfdNpR!F9(r9ZURvP+}o>F2$>E}xNLFc}2%R|iqJeU#OceO3Pe zEsxMI1^6GrzYjb=YvFGPX_nenn`nU&3s{mRVh1k400r3cPf~fUFB15lK=5vZed2Eu z-bJTrdUP*6)zsWa9BI^?WRi1*QIf=f0VfsWzaIYpX`hEb05o5=c$RT{t4|n`%SX17 zF?Vu>f|pS2phl$Sn>{)Lc(1{4j-Rxb#Gj9zAiKZu4x=@mi0&5CLbj3C#t4w*H>Pnk zx#gsAa(?Jg03Vz1zXoG-$-48rY4wxq>EH4`??2*PCRI&RhO^m4boKtOeRum={@-_= zE{^Bn_rVK^EVOr2TdxoymfrHSgH zPpLGLMmp4L3X(l5`XeRC@L5hBVQ{gxch$c$@vPG}#AX$Jt|C|Lx+B!S82lpmh4GVG ze-HdS(Wcd|p+%N58B3nUIVDm!i-VKK3FRT&ts-S3Bgw1>g}0EoT|x$(x4;!6lEG`lFTb%^e6BS|!+ zk})A$ub6}>lB_{1xDW}i%zQ`3XOwEHbk{8ojyJo0cGYXsPnq_-6N1WXr#hbXRJBss zdUR*wSM4?X4|w0;wD&$L@M7F}m%(EV=Bk->3jh}`Zq*b9k0T;vP@|)T;J3n~1Y~qI z`o;TOe&11Q_PU?LFM=(;*5)>~zwr=Y=`3;qxoozv7{*nkVd@o6-XD|EoQ`_e+VE~C zIIDn9Heca?X|FZ?KIhGGMjruB6yujl@#?xK*W<^O5EZa~hbDxxucs2b3d>Z|rei?jn(Lc2O5voTm+z`ht_MIqb?(EEi zqoV~Zp@C5%6S>_4WjrK?oS7N7#5x()!~v@ zbqzvz*{p3IHz;F>?q5DoRFXh30gN*eK>SzMb^U+D+JxHAiM1=uUr@L?mhSpzxSh}Y zQaLPt6JK_I)L*d2ihdPe9};{mhW`M>z8ID}i;3Y#Ot*S!!b1@$L5U;SavT8PD`j{D z{(U5YhaC-le*<80S~xXREPT>?$^2h6)B4=-xT71GRK&_SF#`@oVk>vqetyG z?7ja01qji#^-l(Tc)PXHui;C0Y<0b?miLG5xx8{p!znm#GYb(Lixj{I^*YM>%Tlz4 z`%a!an`?;HSZ0O9k;xl?%F4<}Wne)B91&m0Lnv-*?BCh9_RjJD0Kkj1(lv&k#J&@d zDn6faJ2du+ncMazk@iM8JEI^gb;{$P#<;J;ILXTult#_!;nV)z{{SQOz8>SrMXWw$ zs{U@jGx{~8Cjz{?;djO#0eoZd+IXYD8mu~%x5~4|r)>7pvWD`fjQr8K2caKI^yMYf zk;$*cD^jUOqorCcNp706KU1Yz6%v&J)|#-A1ZQFY?q9O_f*H@6Z<(X6h`4Xkd(gf{5J zh@w)*YA*m+>fh~S`vpa>-0MFTz7tI_)h=J{SAHIj1!TFAimSD$K)W3M;*1Dmxr+nE zeqi`B;8(@39eB>;z84V^J=OszxK=&M1Le;Z`|km)gAaxJ z)e4`yOW$(QuJ-cu`_IR?!g1xFsNBSpfB+dJ 
zlV7JFvnTu$<6iMw$Kub9cGGCuKknwU(h=1!Zb%`sbfp+Y3ojdY33Hr;z%}-_?9uxX zcpKoIxAxzPg|~`)Q97pUi;HxU*3704`Xb7u)wgFdk{FBttImGq3J1_v&T)5%QmxH| z%GILubL#y&eDyt^9N<+=tVTaq(m&=>()7C>A5ON?bjVWL(#|A;2xAyX<3WX09e@KB zcn}UN_Q?2s`#xGWleFvJDw``7lWQV_22*g7>O+wfeE=9|IpIzYKN@X+Z4cTO z-{MrC6Ay`Io=GQpKe6U{4v?@gDj*iB%DYug1O5Ds#FLu&Jd=VmJl2!MQ&x`Y%S-;c z_t^Su(~598eMr%kudTX&U61K#9MNARe$Ah?=fmHR_6w%RAklnN3YMQoxs^o26Pei|Bqr zc&EYsDDVcY9*^T2L2Yv)vA3N*Xf6>>3r66P{{R7OFwY=XWLR!Nuhzd3_*X&kewj9p z;u|P$ZKhJCWl&1zx;IsCLI@(iJ^m?t2k}3{d0STS=$~HDUvYL+&Bc;0PcSbGfqJ+D zk;onY09D91(lw&6_{V8<-GKtMVsEg&roqr0R9fuVFc>6GqNx?^;2;|#|AAfC?(4o^an5<3 zheMW;_tR*3d^Pj=A8%B+X)TfGx&zNZqW*!I`nZqzjN557Ea+;{X%xJPBAY1v$FXOg z5Sip0+C8X6pPa|Z-4kn7ui@KDS)Y~aHxaB-#|z>erk{I;l*{1>*!}%U=~rc!iP4CS zl(!Fjw98MyPJUdEgG`Dm02H1v(%e6w({=8Ao5%^weM%8fWiw^V;9AkZ&0NiyFEx*+ z%0TlyLk$P$oQL{?VvWVYqETCmS*tN;Y&+bOZ!;x{8%Wuxd${u-;Gh`#*RklL7K6O5m8 zzTIDtP-|b-D37#HA+y)Ufw|GuWDRpSa)>klnlF6kxoodrijta1X$!qB{u!^wQx~!| za=>>!wa9piV2Ni4RhV-1zi$j8teoiG+5Zjw3tfD^F5RIB3*EOLGXs-pYkF63nX!~L z-4Y#-Sah~hWtTJ0zwi1|Ax(v?Q1FO~WDmxZ*vt{&z}a1n5uec9e)0BSZMS;*-TWTq zu6&a(NMP);3hxSb;yeLbJQ_OX3NXwwBf{?;x`TpWXUjmoe6ho(Uz%HD zirX%DDRi?gLl_0Vt15RRZ~eh-kRTFg(_**uhaF6aa_w7LV1_E(UgM*Z|ar8 zP{i8d2e6e2^&U1cf?+?v;La5LFFADDjZ5_WHj^J=0-Rf_$aqz767n?#+XcXj2=t2T z)Cz+u7cGd{d7q5i>(a3bzjik!EymIRUkQ8*La5*E`4H?FL??hQJtwQhp`cd{+5N^e z-w50)Qh4znV9NHvc{NDhbaaA6l9{{A*AMdF{JqmrPmjG#OEcF`<2O7ln}$^l8q5Q_ z3~2zmwFjpL1n!+H=_6QI$qTOavWtEg^o!l;3r*!bjSXr-m1fz?EbctRS$9=|H*^!f zU9e*6u}RTzltrNmy9aaYl=Q@xK!ZBzBYm^l3Z^={$iM>SF^ZW(gLMXJU0B}k%)ah% z{MRS3{T|Jo1aH5sytL?{?IhgCO?=?Z$US|RLIh2}PYl0Z7zYLfJ15OuqaG|qvXp9XR5lICb(`@!+K>xEp|`(V z&24H_Do$CRx~88j+|m$EXZbB~TqsWZLv7+maGrF+zN(vF$X{B5;K>YXGth#~P{R*I z44sO5e?HnM1Xk7Ml`npXUyj^8`8N<7Zm>%k8YYmsOUfjg)?T+HUTiAu(yz%Z`Acz1 zx}3KB{nzy0P438Jq(PR-o!wVK0_pc2kbq|S)K{7x^qqYq4!)A7rH)bLv6KWf_kJ=F zbj@-(JL(FKezf1{Kli6=nW@+E_9<_{$&5HVeMY2(h}7{ zG44g{P>l>>UHhL;-w4UhhB>3j0`47LV+P88F#P6@&yuEAfKTwIyIfL}41LvcQkht3 z5BHRk{SC}iCuYd;6>?pl6)8}Wf2>4w#t{49WM%IH4yKC6ep)v4Z$UR()Y 
z^dy}bL)Y86WEg9tN;?KG<_g7kf*`5k58E!Rk?8a`~$KX>)?T3nj(IvHM-TB2nqR& zwHh;e&^S!VU2ThBJ4AKR{qR#=8J@1`PR%qli{3L)KN~t;{Ai@_=0@G&NJ3HVQAku? zi1%IwKtNl802S={uXEGiIq<2~VH$y^BQ0J}vQ@9<_R&)z*;asFgu04C?)dW#gT>o# z-$NM}$PffDod^XQr2X+gfy5l^%omjL(j|L}h`laEkZRbG@y1{TEe(Z)eA-yXTDu$F zgg<8li@uv9p00XoxRowtt}u@STuZpgtDbc~%G=!Xm;e)_9g&9!ob9OuhW>TQ4$@lu z=cX=IHkH`bt#177R*XntOgkQY|oNBJeD+T&O zH4dsLxvpDZ4z4?&Z7e}<=n?w^6K^;AE#>Q5;ZqU+BBM1Ug)KKi{nXh$i+x(%Q^iSi zAz)33^x=rr?dI*nH9B%TS;5|#p3b5r#T1?)4@^<&2FZLD_VbIur3-s2O@2kJ8bm(` zzBuo_)RP!CcDWN2E^uS)ynCY`955nu_G|yI`v5;~tlu=eEHp2^LVt%uFkuN%+G*P zHEJXcN);Q%B=2&ln9=L-YwUWuHVI#L^7*t`x~}V5#7;iEeHJ=i6A{sabFK6AM_Su| zfK$^2l-&I8o&P;LWy)`BFFC*5QLV}hjqWN9Dh_=#5KI7F)g6(p|EaCV>$J?RN2<(9HD(@DunE zW=a9~0U{@{EE%z#e|E0-d``$AXYJgT4cEU^WZm2$4WPmz_-Gi1`JPbD7fL)0{B$0X zJJrVkO_&F>H~$BSv>#wxGarUB^l=2y=F%8sjG8?n=W*cp{It~zi9+dvR|9Sbppv~n z65IBFa-4E+1u_2^H3R**PL~82fZ6n0K2e`{7z^gmr&;iM=X~qk@!ZUZ!?*W9Advm0 z+BGoS&e*S9Y_37KX=4X`4<}&a+kz~s-Mm10Aqu8+5j4>*Mey?^SH{((w;~vW)vo?* zVFyRDl+MCea+hgjBiJTT0yLH2tqz5s?Dlp4?Fh&3J{zbCy<4ILdBWLyDoff)ku}Zz zbFzOriP%n&uHLTj0-8pIP@FeT;6t&`;Z3fyn*6{5onUC&uK`?F&u&k^h*YJ&U|`yw zZdpu%0mH4i>ugO|37*d7m2n1xug$5m_84jRboEe&`V`b%Rblt zW19){7I-DqP2fbadZ8eWj6?aNy%1%IKM_JPjkxfEq{!-f@OKw_?bIqh?(>~gFIj52 z9o{oQWd#mP1y+eWz&IpfW?6m%W6&N?&y&Cu zW_`teVQg)!tuu9}%r##o_Hh1eP`ulzI|K4?dL?Ut1(1*)3@==;*<%Yz=jdyyQ0DY@ z#Lbs3Db~=IzwE5d)$q}5=4Zxdpg*97?9-{ukcgxTq*~9gIM#ewzEijITS3YED<<0L z%rqg1e1S?8?hC3ORis9mVD}2LL=XLFN?+>a#_IlZW%H;_=cngbeX_iAU*n66QRAqH+#k1diVI8%GCIjN*tmSP zMRb?pA(s>6MuPykAqiGN4MwLf@P(V;dI{6WGE)TqsKQi{af?WW-73^`jWsdxV_vxSTd~+1~9dXBBk>(qpADP{xRW03Hp$fK6ies*-93mLt5_qcXxsU>%WKYW% z>6F-_)gO;>X zD66n+NhUV4z2r=d!wHooNlcH@9V3^9enxVlM1npbPixj3!8E_k_~b>bNXyEP=;GHW zCOs2dDK&%$gn!l&oPTCqICnTXM~Nb7Zv5dW$90y=#;1D6PeUp#zRqZO*IHCoR?xrn ztfW?4Ng7DuLRKZe@_%PRB$27JYbIo1R0 zXJc;*Z5T&j@RJ082~yfLd)pu_F5&Jw8pxmd3d~FhTt|T0ur8rLq<%f$t@OwvhyGGM zYioLaOSnahWVhdQ$A!>6sm1n}D5%OrcS4GB0Y_Jq1J1pJ z9&H$Wm}`H4nnA`E`a+hfw#oW87`sBhOp#e0Hv5eC>9)=mrme4cv*PAhWZ1>pEhc0I 
zcc*0Y10BGW;9eHI4zf*uIeFNcaFAxzk;ER8nQpGh77dZ@<#Vw(4g>YSk=^^8ILyvx zI$W>Dj|xTeYH`xTl69sl*4IrOeWoopCYTyKUN_XgJ4*eTd@}y}@gGvwZS<6wZW+M| ztfts(?yhTuu@y-8(ero&b$Xrcc3gpEKp9~A3wV%bmRehLmjLqu{50oIjg z%pvQJ>{7r#0KB1zV@ofTBxaa}qElgYsB+}l0rNB5IPt(pY3%6o%BM+^e6^`tf(q97 z`>C4>EC)2SwARH7j@mTuHvkX(^d83rFS&zOFYQ;QFy}=XM;NvH*z!;)3bWG}9q<3d{#z9eak(&(uZMv=`qG zYCr^iKkFI2@Dxp4sJ1d&?-0UHO*CX(JA7&mH#0nSRX`n54`Vv_f0 z=w9?_+i!h3g7KAxZA8S)*N0~8KICQ*Kh=GywRKK(cq?)1lQqDGGGCCWdty`I@TlqQ zrnNXHfJNd2ap$VvN7v+Q=~+Jm%Yb&dp5C#9s7R4c5PuesehnJJa_hFY%DMw-PtQwT zK$d^PO6z9$>7^Ny3X3LJYN@ou#H?-fTaEWSc=Ip@|)Iu>SxtU{*+Pf$X%an3naDZb?H+&&N8FDttZN%otTe_VP)^ zUshEs7)f*>G6u14kz#~soDMGdw<{s>{GbA_ax5_{lx`Bhke4+SfOwG}im$OrW{OGg z{?H$6DmutfX|`Vf7cEJ1V&NEUh|U*k>3+ick^`dMc(#E9ba0E^V7^8+{0AU9yx>Zu zg2_Hu+wnPmBH$-x^iL*0*S~90pL~QRE};Z5-34sOIIjHdu7rP0sar-|pBg$cX2q#! zdG<)`*VpbwhBHGqS$X$w+ItHqu&IS`F=8Mw(sF|T%NSw*dqrmV%0JQ^X~7yCx#;tF zw0cpv8(CJPOgrdg2)&F-Y1+aNRmJJ{TR$u+K9JN;HaCxfA1_A%t<%O?q5cd|800jCN{#I2-Yc_un3#6fmK@OAP1e?!582o_Flfp5pHNEu!#7R~ z20#Zv%bFM>o^oWQ)|OD1wb@hYoV4Ho*emYV6fYh#KMS2qhH?_Dmh7i&V{2=RQOmZY zgOFRV&h2s|G*1424pROhT9a`&dyB6$Kh-Ru%wIeGAk8jYS=T97bSCjNGQ+(rcB-4$ zO{&9o}N%`4R^`XPQ^swr9= z5BGKJQ1`lLLq|IOA$WzeFOgWy|2bc28U2m0F9XDcrjKuL~O||<3MJw4bo2iP>@i#lX=-RKqDI0yI z%n}#!3Hz=uz-Ue>%1dhYz5q!~hdCSm)fDsNobX!uhZkw@bJIQgz=$`gaR-+BNcQ$Y zKdM8dL^+%8yR*QojF9Djfb8!;AdFci(`@TvjpouDoZlrfS>oYaVL4ImL?gPkx+?AO z(2`n`rHV!MuGT-h7Jnn+`H0T4bHfycbL0Dv(#akK#xZa9YXA}txPP6fu)x2x-xncP z4}I8XJsLL<>3-b*3N-)J?B>w}T%(#UCVj1G9m1T0L(!n50;yX3Xihd z))TzNGanMG#2Y8tzwO4W9E^5{lU>pgM1X3r3+_3ZihUBr9i|{}0TIttiRs6VRs+Qx zy{SF{rg%AZQ`%cRje{5v`!~kOe3Zh`*8!OER@b!K&L^zaGm)6OTDQOR3C_&p5w07? 
zxMcxXzwf%ic20)-`DG7ynfQ%ugT5DiN>gK+=u9CA6WV8c-QyaT)bXu2k$kwCt1sEc zZonO(%7-ODI1Rvx^DOSnT_!>V+q`wO_qCOkC40Anv*-mhOb<7*)`R8ARuYKYEtDpS zQ9MPh%`HiG(VxD_d8%&n9{j(%Y2?hOI^FO<@V4%s*!NG>C!YmijZMu_N-89;%GI}3 za2^@F29SnYH#*0uc+DPdy=cYeE9Y+gYmiPtbR>YCWX>}^-}+xb(@-<=5$b`8gMD#sBKbK0b| zwk)O(5O(oB5C!I$pEArf|HcKpShLEzVb>XX(h@ezO6>*%T-Ju^1;nG|A`%d^&nxDG z{-Sl^9hyB~JP_W$*H3k@dr<0~TC|@@=tp*W;e|uJs#i&qCcXeFNdocZAa$hUwosIB zs+sfVy0KQeTZI`rEp5cD?e?vnZ=bv3R*HC!_|6pVrxhrZD5(CU@Re{zrYWTsOYHUq zT9ve_C@vx~Rb7b+niDO!SOM-MfygkG1QL~Qeow2%UhQtsm7~OpX&;))I6|TVh?OVP z4<^fc;%%@Q+yLt%R^T&GF^Mb@dJ~=zD5qBP?#Z&-i$?+5{Crn2EW*$z*f~spq3D6^ zFOm=G9c*+YQL)ihQk=X7sLF&?x=4sbAE*T7NvbW)A?fz@CUe$UIRnzLzZUlJBgomy z3n6O}@h%xQN$pKCGiYt!%e&(sO~6S1;6?LT%oEWTxtTRc8q7D;$sNQBcNj>Lt`Sdg z6Wmg~aNWMMjcaF7Mn+hkQM6cMj|PnPtwB?L6B)&d=dj{0bXi%+rYX3_w;C-Ct zutzKIByKMy?owzI&7PufXbw^4wlRpp{_7pX;;af9%8zNHJ;P50{xoRJJA!^us{=kQmWh#~3gia?@!mJ(eJmTyKbMsum1lj9IyMpzaO*YzbZ*8Y3t zq-BZN99H2=^yQvCLed-pP$rKyTL$?HaB{=Qk>-!ehE+tM(G!Bg|K@6MoLj z$}IM-!gof~rZiDkKoaE}i^miEv!S%VG@F1OLmUfwGBeK!)9iCpnculeA17xDn>snl zOsDNym$Q0J0I4UOFkUKxpn{sq_iK;pv z@yq#ySfU+`?Su=j^79(&6_ZWdBX7}F}-s&21`On&Z&47YU~1uVTKCjCjluzo{KQ6u9V}Iiv3AO-ecQzwM+O zySO=wygUnh?DxQTbkR{-K$*GRv7z5R;Z@nPv~);1Bm{F?OHkpE99?AH`Di5Y$2kxb zZ9L9@;_hoji2B(&8O-;yCZpQD=dks^g?rzVaukOvVUM`@C2@E-2|fo-S87U$jtfC0L{1^^_x#2CfYE9yTnLfbtw2AbA31ZHP27%>TS=s4~$+R#eD>RJ7+ZZH15f6l_g zmlrs9ovd=F$WNf>#IvnIGb$%bLgJ0)b9~oA?0E)Et-a89Y+u z0ql(MNWohF13W`!tHDoX%f3zdhTBB>e^k5u!`I`z@%-45HC-~DIx=`Mdsh)j6bGX; zrT^E1QafkDk~i4>OEu-6C|yS9XLGZhETn76Es)yfkJTO-?@R~f9ILMc8gagzRIAV= zCJG@8i!H1bTjmpQoek{jZA`riyan;|PSCn+=aC1{U!h)ZoTd;GvAg^HmR_@TfG5BD zGIkfY%auZ$7(Fb0e9_2JQS&9qr%B=aiCvq$H<71&GtKjGcww|21ZrqPJ$yy&8X1}SPBo#k(qK#f#ebEC| zM(VR<`g(SC++_nhIIhJ938(*!x^kv)OUVTo{Rook3g6dniDY*(zTb|&xRjiV&ir@0 z>%KdnSTMBQ>5t5D;aI*Me$6F*j9mDOWG{q1LS~FS@2FB@8@jeFuvt`F`>UYm1 z{Bu{v-1@7@rYncTU;Fq}LBD{`4fu@!5uv1Uuj zhtf?5nz`=MP6hnPd`@T5jK@_b_?XHSNHSX@f>Pd2&e=;Jjq#aArGSNBlST;`vxs^z z29+^AJiNS^_4rr&RYk3SY<1Ktd9HZ4Y&5-e2iv#Asx^>lCdHj@dPqf)Sf#GB1LQ5+ 
ztxjVL_uGWje5(9eDy8snWcT6+yoJD;2XNnPTFa{Lev!XE>J?vEi_cmVE}HDO2jS38 zizIHzT#L_W5Nj=Q2pVP9G`!f5GQT$pEb zNYmhw=_ZODvWq2dmV1?P{ypuRcYc#%$Qv$V3utD-7jevLQ>}%Wb)7trQi%QA4wPP@ z(uBakIqZeUX4nIt2i8qyITZ*lo4NYg#ldmf0x9q$9RH-I1Cu@lX*uR9K#G=Yl58gX9)V%)L(3%qh2IZeo0= z%WMk=xs0mt1r+OV|9MYfs+gyji)@=KGYq7#y zk1vm;nwgsK`tFw4)#NY;Nw`I{AEj)qNlZFoA?VW_f!306xg~?hH1Y_>Jd!+bXv_aN zeaM#+rOm!9QDC`mVuw4hr=GYMr<#CJ4g@PFanQsc@O$^-N5iPkLw(~5Pa>0fM=eZE z8!=!X1*0c}OWr~HecX2c{y^@Sb5Mqu-ZtRC_iV&e?_!OZ4)qA%SosjUjM-6G2EZ(> z_U(7S7fPMlC(Jr9$>bnn7$ka~rxPBN*l;KIpre!dGfRR_byf(kw+rSmMak6 zv-Q`><%aJiMRP_bbz8iGi3$%<1@y)|-yuP7{8 zdJ+-H+3xJ^dR%h@6zg@Lw*pbY^Rm*`Q9`1ysW4EzQUAs=KE@Vq@l&N_X@tr*w`AsX zW90B^Xu$AMt94v@VnF;u3mkf)3@7W`Al-rEmldkiX!um#1`JG}&K7L>U%tozJ9|GJ z9ZYIw;X9|Bnq6mvcg3JNi~D{copqXKKD*5FD!Wkj=K(~N!m?OBhtk)Zo8r;jvB=|B za@p~OAd`P4^;sg9=R zq!+P8{HGjsj2&s}y9H{l_Igy$01y=oh zkV}1F4VC$q>;?L^QO|;=V(7!jxCnIvX@+hxly$yh(yofwX0vrT%MM|@8 zl+H7mkH#r8gz(-M9O7~3K#nL0@`fY5$8-=OaJjh5o+G|PP-s_O8CFMIetaz`#-K8T zmc{FL46L#YPvxc5yyI21bH@j~f6d{5!x1!DsS<2~k_RsUDLVmgzpP1To54?}2QLqe zlKa`XP7)lSC@9LtZMq1AD?)!wVWqhOBHhuno6e9j&3lMsx*Fc~w~jz3i1+;+OMM&g z#1T%cfwbc8BY+=Snwm>~mq!;r)j9)wOnWF%`Yn7eOOt^UUd9bf+GvP5i9irnM4TV;y22vlMGSo#Mq=yyG@bea$qk=|u-Llr_DdshbfxM*M&$8r5 zoyUiGX}?&dp79;9gkpW0ln{prr~UWv2t|Q{QfvC8%<;hO7IojxJ-AeMtBY30hpCo6 zZu|~&+}C(C`XyJO1)F9;V~5z3`IVhxm#JmoQu>V$T&;Ud(~ZI}nlCtHum>Ehu`u4; z@X?s8mhWeS?TxJ} zrLUcHYAHCf*=6ohQ~TJoK2}vFp9|8#I}ht5xVyK^=dx;9*dX?HkBK(3>$&)FA-U7a znAC`#u9Lf!8ZS|!pp!{}c`gItvSvHrwTGIF-ES`^Bi@qPHitmhgEvs>;3aJNJwRX~ z#-yt$=;MuxhhvEn!czThZVVS zseV0J*Ce~Po*G{+WF+s2%)uyVgUzsu@wHhi))|30x3@=MauvW-vrHID_kB6PZIYI@ zcSW-hqumgJ{aBN{+tju#6O}YgkrBQww>cu#!X|brSI}L+cM}8&zrgkyN+vX{SfWcB zJKBZ%(S72uAh2C*eBZhR_{eNys`UX6JHOi4)i;$Q6Wp1}I8x%*#%GXwtDcdX1$aU~ zw@9WSZY`oV{)hp4igoVU>jXS}16n}{&~(_#dVvVHF=DM!9cGX@kr|oJV`%(rm(<`1 zNO*zcS*YmHMpfi{nMVU)oPMf{0~-g%U)mqqyJp7feSo$jCCOQIny)l;#;z~f!sm&- zUPO!Oh>xc6)Zb+Qz*8H2F6n|sm zqx+Mfyjpn3a@h?df~LngFohW@0dw){eXQi56!q(>pC}^egp?z;31(~X2w#%KE~eYT 
zk>p22V8t+ci28vd8@@og*Q!?%@pfb4X6Ewptmy5?yrDUj*|2=u~#S;eeHIO^yRhw`aZUT z@nk~abl3N(rSqG1Bm_=+@l;{?NXx5t@Pl*G^TUhh;tcAInGxdcE`xwO5qzUxAzwPF zz%w9X1%4#)dt^khld|^je*ny4if+G%vBcU&$4+>nXl=}T&jD7!ZAppnSd#yMkpgL` z!MhwD*snfk$G`H=@E01Bb*4ey7hPOp8Pnn7WB|GjZ@Av_wjTV%y<<5#pFiBJii;F* zA)_H@Dc^9 zY8R)kNC=t%5tpv3-O2fU7w9pLfdV~~)hv5%j`R_Z`*cv7ySZ0L1^4$Wq#MEvwK0b} z0_zMs8?zJ`6Iy{f--TF4jMvi|t`*zI5sSRAHZb{38}|2ESe|!xdANGN3Q@gm-b(|1 z4ea>sPWmE|A{+T4@XO{yutKB#{NCOumfOiwu)ep}|8?RSmEW4bp&>5yM}J(#2#}S5 zthAcf-p`r*y`}pKlu$^IeAxm$3tbSnF>OPKhjCtpQS@`#fjW-;;n(xH$6raD3`R!y z3#~fqABuoh(CGgFR9|?+bpVd@ozD@fkWV(X&&dZ|tiFtla`vkiK30AIw)oZ4OE;oQ zInf(ytUepD=rnES$Gu%$B|gW>xR0C8rCquHqM8nghyUL$pZQQl92lt$~SB@j8*&ORR@ zpPv1)xp)K=%jvx=$h^Ns31Iq|u`vfXt8JaQ_X9P4lfVr%V(Q#J9ryl08>JTiZCV8@ zPJ>qM*EV-Lz|$B6CL~T6{Q>)RW~Lh^KX20Xj#KCr__ot`<0(NsN$Sz^^@%ie;e09O zW*rV1GW!n1JNBt5L=K!TO|-tK5wfHjFI~DIJ;4tVh*rWG`?&BA@kg5@8BZywBkT6z zyUX3i*vy`vIY*sZg45Gh(Axgnb;HYR1(?CI*Rkaj#P&@F(h;TLNMVVb-qV1d^#!ND ztJf7+MR}xAeJ$@gl{T0PK;GUIVjq8)BZlD5tO5&~@#6Za=Tf8*{Y+HWdWNCZ%ZFy)N1gGgt}*a=V`f%!9BmQmCj{Z*_F#ww?$d2p785)nuxa((R_c?BLs) zXJ_7y8y6x9?wsu!VcsYAs9_X1*vu9~J!E1&-wLIKG;;S(rQBB~1zev5$>Sz|ZZNYK zUjDhi>;HjeLo-ytVOT#uOi{MeL)wR$9$F*&)l)w2^CoHsO&7brW*Rzj z=nq*;-xNv-qHx?ImM0=3U3GMC@R2FaA@NwNqT*Hf+oRa3Hxw8n8#hU(Nf{yrv}akF z!k!?BRJtM+c;ts-gLvpqnp*&x<%@tPM2YI6LwQe=mndV|9j( zc^n0Oz$QiNMDPrX8`JF@5TUreks=|oywX1+o)WYBh$A&jZi9y*oP=!6qa7&8$dZAP z#Vl@6frb4lhx8i1eWKTnDi+3C8f z0;^5hHnaqybeFXTG3OEsn5NeIKsy`9GSfaQD_o!X`GeX4l9!ZXDzrqEg#f?-0ANIn z0K$gCI?0euA=G}C`UyJqkwLLw|9$WXiSk}*+zDP#4oVlZT0n`E#+r-~9VGDMb+MFN z!HB*3nH{OM{WeIh3c-Wvl`Cx`c?fwYPml>>V=&}da{`tIClKO?Kg>=mOA-Yt;%=%rJxP1cV*GBbYt=#fOg9r*jm`S*Z%pB$+g=l3(6}< z5m^CyS);(Rwu*Gy&ICvN}X_ac^NH*a|~Ka!4Odw9X}6n z2*XIdySZN(&#HB=UASme$8x`76lmrRKvO@S+;q+sXq)LqNC;xsz#y zv)j`=iWReR^&cupR9_5qL$O0!eB0--mJ~?4E$1m(yXlh*n%sA3<9;7hSVY^I zA18;9uWGp#|j~;6h&eu|l6EXb-C(JlzAI z>zX%5bU1E7So{KJ1}rINU(AWM?+W0J<7T)Nhf6;S!~Mcaf8@OB@I*=bnW6KS;3%2Y 
zybqgaIr){2xnQV`M1G~&kgu~$MiyO$m}^jV|sAX1f+_Pxg15-&_66qapIy&fbogi&n_z!Yy?CySNClbCl{QleiBk3!r^dYvqh7ys0(}}>}nR8Q7 z8G81DHHF5`xnHvAC-RJ54U7Dp4Ey__2Y$(9^MA)9Op7Iil5s{Myj65EWlS z?;rmmF;vC}x(tj5)k8uRSP!#lc$}G{s~33!id>Byh`Qa?j+`$pu^|+~4g>gtF=ldp zlWX1jov-diq73OG?!9-7^_Y(S$n&1>!sQfsj|*?PYQ|jn+>5xxh_7n~8v#|6#!#hz zS!n~COTbWbKw)6cjr$uLjzlTBK%)&%Enepe^tlFk_Y)^V=H05zbB>N2I+ppa*Z%>m z)@r9<<@Z_7yM8$7@xDD8I*1HWAR^g(^ub%jt@bU)`@)Ds#<@QSwiJUWWO5|dl^fe9 z(=nc4Sh8>cK$Zxs?HCVu46!^BQyWo{UG(6H|M*%)@8*~0AkA$X zFvgonZ&a^d@rmseD@%#2d6A~cLPHJy(l2k~SQv30AF~Ei=ruGq$Ptl-(l5}GRMvNM zPV|3ZN@*_fx2&jk3Ydj@luur&4{>{JEA;-(bk{z+?%*k@aH261YgobRO&(i!zc?I= zosp7qk@YO=LSCRBfrA!tLmWo?h8ZI)i9rfyJqB)8);hGrY}g`akUp|+pl(C+MoU61 zT51t}+~bC6B+4i;Sg~=ceVaj3(F*Hg9I^Ad5fC|1VPnaUc^^!5k6OqclWxmAEBo5s zI$|@zRqxP3*2p>d8g)90;i@E#^L0&<96B!Gn=Oe>6n7BYx$_tL;WEAK!^;))KuD$a zsk~(Q-SbUysml0z;nh4*3hj23k`keZ&|J%}vc-u%+B*H0ZlB)oWZ>GhFK|Pg4!ko` zbERz$6afL&wGz-0|He-CUlfM3|NXtc=zD49@JLxOSor#WZp8RykgEF0TFUhI8{;xV zjj(0KNo^-%WRz7CrvW~}bk|P?;CCt)26lzOZ>OirTa$B%&p+Q2!G)fq_0m+bhWd~x z9zA)XW*D5qYp4t2&1bQvN=78Tvi?ul%eQNS6#Tuet4p|Dv<~%jJWa}4+cL}AQj1e3 zO}?jT@!7bjXr>#lo+b+X&d^n1>&4dT>DU~kgd>XYT}wz}03&o$`<8t^cXh$?_)Hov zX3jHDC;Vx8ILS(nz(0*^k8YOd>s{y5{!7rM66bP`k;EP7q5cUaho{(&w_kmJPkb5Pa};Q1vM=3<&LnF0 zR>1s^)bX_NqR}id4|AU4>O7Gc3C)U2ZFjdC5Qi$qPNI#AYPWPw#$9*u0^1? 
z-PpespKh;b%;6 zcGS?@IO4GWeWCD)0g)cc<7zWBm8=W^hj*s&Bh^M%h z1s~%X=N&-rv$ToUmR-Um-O5+8633S`c80>g*+L2a2n*)J`a8l&X?{;AGWuF-WU2Di zc7oo$9~dA!dIozOh)}zvTs;^-bQMB`1S-O# zLVy1~&B9~#uk%i`*G-z1)YAbe76@BtOgW+Zq+xFtt7eMttdQubB~NC`zD>$1qm2rr zkji~*;iLF>)_HzZ9`Sc2UdMxavX$Y+p#)y$Wl0@gTc8*)OFKd*QJnD0sl+IEt_G*8 zv1&>DV8mYPFblR=yZ|r)?N3?uy184x@bLNRY%y#gefxB%CmJ96e^4IlYs8l`yrl_$ zRq4?uZw0@I5+u%73lgWMOOvQG$z|`+XSmo4ma5kf8?0pKc&$^ z@XZx$C`flz)6ug%&-Cz|f_G5$Ns|!?(H%CU#fQ^^h(Nk+cf~S`AhThz5Nf}T4=r=z z49)x(;~Tt*X%ysEx-?Qmg#f;sHwNY}yEi|60T@*Ty+po1&}{q0TElNc#pYYBy*&K6 z&rStwaPj@^+_HBGWL1r?0C_~1uu&;D{^3`XRn9p<`q|e%FR(Rj^^E6Cd2{d0zQnKG zU&>YB;9)gg0OT_^L#e97?D;fA6@nXL9TEv73gYuBKEC&cCWyPJ*eBnX?9UoCs}aYdPsCV&0O-WsV9jmQZfm3 z`BnKCy<>Za6r!vf*B0KIaW_-daSX|xf-jy8x~l@oqLchkQMINWLb zPpN9(0l1{Czw#A@_C@2chKmq#-x+o`w9Y`eX3n?LurkEB=Kya;nxuYsb@5}uEAbVy zG`9H4Ez0!XFS|Rp`rcskC8scBF`|gzVpqBE2m9B+XTu}EEcviGJ+g0Q@x7+vmTfLV z**XT7t-llp{St9*utY?bcuc$KeSLdFI6}!>HLdVPs5gm-3?t*Ee&RNMr|3f-R@oGa zfiR{qJw_?8AWZwG1R_Ja4wm*>G{{0{+X^oW z)mn(|;0PMFTj3g;c*{*|C9G|ec!_GXXW=T%Vv=}q#BUd7Xz&r_^EkN?S%fIM=|@%}%wTva31VaO@Kb2aIrF|^ zWAhVsQ(*?&ZqyvH{Y>(0w<#G06EkrS=49A{lQAE_H~+>6AtH4{_}&Iv9#Bj=9Hu??!?XE z57}LA5K`pe%3C%#7=CQ96}ukr;Oeb&l@IonMsAHmC$M;T(KtqcM_zmk+Zc6#Fzzq*Jn3kecH*}C zCq*aP%j_3Pv+cO`E%#zT4AWT*orba1!`R2s{1kLri!+mh54%&J1^87LBoF<3_R8WF zsT7@>1#vIlICK>^5!}ev3s1Na?Ag{sUAC0I?!fh2et4(HDL$(35SCB3c+E%l!@Y~# zVHa~#O4Up8Kq_58*jeLbC&(%2w~vAnb=2Ehnz+`o)mZ(dAq?{+a4m)fICRja9srXM zJY<^wKb1Y3zYC$iYolZqRthr=R@_@m`8|ZS(d4QC`~k>`O-D;3kBglYi(7Z|8O}&n z4w^`B8;HYnR~4f2-zUrTi0C`VuGvGU)==-(q~zD`V+p(yW(~r2`W)$!3*2xK|c!m&f>u_`4!eGW2O$kxI zzwB#seq3MrSJTEM=|Sf^_~PN&*`q+Yv)7D`g_qT+wVOleB*1%nfI3-MjUI?zcc5V& z!82R^GvH8J9vri($`hi^$w1WE^B_Q3?p5n@B*4y-0U3lFO{?Ls8wQ~ZJ z#KX1DSj3!bldqo}nDEMXg0KO~kDG!tCMDoxKQrXBp*FAN7(y2ULrcQ)Z7yeBR+pR2 zy2O+5Ib-9kH}k>IMr0t8f6!^~HH~g3)6atpa!#>H?PJZ+A|ojDrFTaDp_NnT5H6N3 zeVWppXb4F9$pqWM7Bsrp>&fG`ApCZ}&&6Y^FLF$L7u{WHi@HL$2Sy0-qwwz&brYUx zUmD%DUnE6;1~`|)lB;!y`5$2ql};J|9Y5PeFnf!07-TCpFfo*!& 
zCYESX-Asp;{?kWVkX))Zh~&Z}n9AcVlwL-YyR>cg^gobDCKfMFaJ`D0x)ojpdvyuf zhB`y1U_Yg!lsNhWkn9m|CcA7r^Jk-_1^BYSfIME9^0t?e$Rw<+(U<| zk~^WhJg@^K8pi&tG?BoJqE9O4^=g`z_=$W-?^4Usj*2!DNK}E2iRq?u%PkgrnnY`x zrDCI7ZLj2mehSn_qa$l6m+8OWRzB-~Z8^E6v68wHY~~!yzCuhGPfaoH#cq7u!5XNB zj_&ECUG#13_K#?k&iFOR%_TupLocqKDIO+0;QoRwVm*8ux(=QC5A@R|94D*_j>WO? zc19~$oN2hZju{&5YsxdE{Q*g5KJ^yl(HZ6u<~C8KC}Kxm5?)3Z5yX#f!->-SSuep$Ir`r#O)E(yd9v6L`8FMuGnb+jUpbY2hS20A2)B;wZW158|AwZty94SsTv3yxiuqjj?dg z5=zx;j0b6H7eIkOkS6KK4heO_@IV(bb5``nu2K&*`X1ct9#~yn-YB;_?!9%{?s)v% z8sh8R25sf1T*bsI7vk!v1^95`1>T65Y8W61R=2^Yy=a z9ouu~=;d%wHe;Z@Of~vTod@d7onlSOOug@3l<06dxPEmQjjBYbhKOJ(JS^G2teM6? zuJbD!sG18R?<)MVp)3A{nB!F4+2P>9^OiyFH^XyY?d-kTntxgVFp_2I!jFuNZ6B(B zY7zZU3Lt^W3Cw}S__bXirT(NM#QSQ^21v|w-%=0=AAJw&7p^|C0u1;?cM)D|5w6R= zQ;4=E|EtKqqms+w{8{JCO@X1`hCA%KWR_|83Cv<-Yiut+-Nh()psT;5rDI>=m{D;9 zY0CcSfz!3Hd9iM0wO<@h&xno1qZ_rg?SMmf$(f)RL)muay}VMNTF)?qH+*G#J68dE zCp2+0RQTrvw)JVfhJbhz{AmL7gfJG$6p4cOBZD@o?*W!oDCE!dVO_Vrt!(p`T=uNC z{+$f`2X|7D=0bj?m9*`p8hc9Sm;MlRlLm}bH=05a@w(~o3U?yye2HrW>$-rgwL^Sh z-4NqrFX+X_~K+!@hGes|cXsm(h3eN?d_d(-tj%H(>or ztnFDsNAPOPzF{|s^h9Dy1D{pjdtF3F$@kCUiy~~e>COjGb__K;`)4-phR$DY?7O0S zlS1SDORUXQ1=Oz65Bt`iD9mkuj~?7wVpT9_<$_dgz){^5?Mas8*t&Md|LoW;P1PeB z&XQjlD9Ng1-O}(o;c=MV|3S3ney&$O#iyD! 
zm_RF8!VnVLE(vMznY}JiYb$20YvwymDVFY4=H|PIRC}!Y{uGch9uh2a75r;%a7BKQ z5-a~36|_4i%qdN!%%mEt_MxIaMy@tA#ptUTbzsX5-7V>@7_O&-79-)~g4BMrEE1z& z_@!heTa2|Nx9?c#&3%m#nfC&0;aj9>u)pG!dzv!OjS@kd+jW0@{{xAQi5`$05iL$O z0W=bkSN7?y-lPw7W)0HzpwU~>M<%zR(pWeg${9!*ORA{lNo|Vpex_#8mMc9WNUadV zLO!H!4d(ud@AT>8TH>Rcl=5BU^x3#-6LtF_D}fhoE>D(t+<){6kbe({)PfrzSXvS= zIRITI#}T#}`0S>dtonYDYtyfD6|;*pdh98mzVNfG%gPO(VvzLC>iuj3X&%J5HT&k~ zSOvIk5o4*N+-yc!iRu7p%M(5usF>(MMq-qSTD%fL2Vz?xkS^>&0{wlNMAJw0aKILy z33G3J@R;EFu|LV4f*2y0%6G!J5p@#jd61y3)0XlvYITLQPTunm0q9DG?pF3z8ra%n zCD(fbHP8)C3Yrxou^f{j;`WzWz0}u(uqR*<&A2d?L-aYz>wX!QeD)*pw$*-9 znH>F}-rtSX`;Y#>1fn;cxQbQX>CCdiZ0&fGTQ=9dC;aa!@5%tYvp$=U2a-LMzR_Q$ z{H$R+rul_~_aB3Y_pG+MMkG!HYg48O;oD8U>3(*mRbM`}za>-KQ>=wEB9ae8 zO$>iEF3YE!wJ1_ai-D3wA9~m*IHV(gPEePQj#i~L%w1oboc)~>M^yC^n(64cWbHYZ zhENtE6>fB&aCWlHotFcQ4qc2bpxlfrDuEG{SD7yiJ5+v76JTgnzfKh`{-V|y@dS^b z0LUOk1TO+vk4JtF##SZoU`JL``6h~M(Qm)`ju3dK{@b)_6Ge{jP`c8{cA-Sg`L>$l z9e>VPjZ)O{)mO_i6!1 zFgIyn*Rsx;dr)3lwtaON8so=(LDF!#98Ekab~MvD4*LN3i5o>iI`HhT9_hvP*I;_D zH0Tp+hPaJ>Ybb;=@;}*e_K|)OzX_$re#9UFR8UeJFIpr4?o)Ox3LxgH$Rq_d>c zo1>m0SNKQ^_qGoY*AteEk04O?-9fak{DWj2uYJjn}9Tk^ad}|sHps{QOVh0!VjowkE!tlsC=^bqF`BRpAi&SKqN>*|L8PQx5E0gv+C9QGAc8(F^(irT zG*5ZSGn@(4RP%_A)2!F%4ZbbsSAAI&M#N*FN`+D)lr23fzfJd72TJn(wYlJ-QSOag zeS`P&TBoHmigTsc>~nS;r%xSDlP)cQ?jXv(gENQ^aWY_+sWtt@#4zxAO~LHeSE>#lKk;@v_fDF=WmN$*3bD7^07TY)TH*bxZe)t~%PL}gc3@WaM_j?&$P9sY ztJ}Q+Kbu(iHMT~YRC?q(*_o-gAJ;I18SFM^Mh|?wQH>F_Y74Ea|Jj$)WD5koF?w^v z9+biPX}EGC+PJa9$`Mh0!i_iH_?0$=`0&G-?g5GQRLN_3b@Pt&ecf`VXB&RsetSg9KK85n zS06%@IAE0@hliKyIX&DLzEgA?Va>rjW>|O)% z*03ENg-_2Rq{ARGPl9sO3CXrHSBmU;wl(8?jfR&Dt}bQ;4R5wJ6Wg;F|K6nx+jMat zWx+_(KXnKg8K&glp;s$UYh*XGspmwnj+wQ)JhrmMop_~*LtnY77J9Q`fw-}-qH|3w zF`ev)-oEOGErgr?-T6HiNEcTvq@j?}2U8N2y_+8(3ElUAa7}#vJA|5GQnzvx|Jh4X z24B5L6%ceNBYUtM9171>5`d0FSuvq5!j(V1oL5^Mrxd-q=5tZOYpJ%LQ&v-7dVdOI z<{^U~gN<)@E?LFyI{P_)e>ob$jt4p+lcHB2qv)$VVKm0-_~G9+2dt6<*rL}aJBIF} zt0U%c{5jr}wTqknN!fK@>!o$UFRdDsJ)8K|I9QU5oh!8Gi`ZuT80?Y7?`ArY)5sr$ 
z;7E@i=v}Pan}>xl=F`{@4!Q=dtVwaoeeQr-bS;dn({CgHeEu))yw$WH+&6)THlbJTRZ!G6Bj@{|Y66t_SCSb-WO5 z(-1oKb$BmL-mf%vy;wQLvXk@%^C1Wc@xmm&TxFc`Xx?5KAV$)m!fF+3jno$7=5(Rc zUv+fT6N@)#bzX}_(&{|VAN;q5&u#zg?k%^YC66_`*24hW+NBt!!Dn#6Q1=gPGe~y) zwkd})qEf>SYeee>+ypN+M6G z3}3}>F~}%7G-GVI^Nc-HyzBb&Iul%L<*z$iU#jnqpTb7k*%80XXO4MX@xufuOnu;d z5>ph9Cm@1v48OUbD<+rHon`h3SS#`5>mM{c$5!rX3G9GlIAF8`_XbCZ*h<<28#+G+ zWL?~)>`bblU)8)L%e@c1;2lYiWEzmrU4<~*O$i6>Hx6Xb8fK0y`Z4tf@pvmyXhq8G z7Kz`zKb+Js&nCzCVL6DFp#YDrt#)i-WpT4 z%6&pQAX}MCv@yh!hs5{)uxnu>OwP+H3h--h_OnlDXj9aZ$x{7Uu4tR)XhX86dTk*0)@id(yUzTxwX35jfgg7xbe0ehCH<`6gXpEKF zcb1Ks`6!2U`rR|Lv2I;Pp#B;@ti-vQ@kJg5wF@5Up2haJ_92-TV@H24g@V7tA zB-u5CPxN!qoRHGVxoVa#!+MwuZW6tzGur9UUs_20MO8LE3Xcu;jZIujr>y$VNiCUhF zkLrN&^d^(U*A2oj8Z!nb3Km%m-nxdt&=ky*y&je*|IyCZINS0-8_XSA1UcKlR*k$x z3(~tt&kI`Uur?IDS?GCkJ09}A)h1DHEgZVfzkyD&MIR(FwD z2s+fWli7Ex_-`PCXSH|GliuJ5SV7dZ5)JBc zCVhIzON}odhgZ0*JX*yB|IBZuWgn)>{NIoML+^tHNCtH8uR$o*`>v>i88@*gz0_vu z&pYi*l|8E7#0eKR^dZGk?c6JJ`5FL3er0t8XQRoVzW)r&Es2hR)#Wy{HGOlQtRODC z;b!0 z**o-{53sp`f?94x0hy-qaea;h1Ropd2j|rMx#XMT##7Ovr`I0OUsqA1p;cVO6FAbB z$<}CzD6x<1u6k1sGpwGw#&*(-_qcW3?|6#XI<83#0G}$g|1VkhBP5!gnI1?UxOr=< zjLsXTZY-r~%Jkv-{@Uc*pDlq%Jok_;atcX|0s1{cBoRuA`4vonIqh0BSHbIl<`-b{ zZQeGJIvKBQcYaHoFdJ8L?&P70CeI_U{;~R^%y7BP*V(Cvx|=^gK2=yR^a4M2Q;8Qi zcW^)Kp{Xxq@R3sFPL znorifEwKFSR`!}8onVN7 z%4-9<9(f+`n+*OYj3{kw9g{?haZy96@c$ z`jT<(i2hxemhg|4T2#>Up#mjhE^sS^#)f5!NJeG|z9@hYcYfUc^mVy_-CRHW>G?hs zqDqrA>}F7xHOS>fd-%Jlv6orA|bVFE8jO98! 
zu7kcAWt&t#f1xRi$j}x#h11sN7Nw{aJ#&A$JM!Y_Zwlm3=HcC~cHqmq{EHUezq|~K zLdYJM^PE&;Cf&z>xU67xLmz&whQucuK|lt88MsS(F#rJ{yaUo}rQ@7xx}E3-3Jb98d{1%viPvL=y(6c z`7u-pHw~+$h{X{HRSqbTpEXPn__ei+Wj5X$=Kk~H9htz?*dgJ>@Oa=O*Urv$yzPr3G20r|K#tD!l!gITJ*vd7p^In?=ieLGB=SMe%RlZKwJAeSy|vIKIL^@5id)mV1)rP|S^P9p_XKaHa+=$`YB7_C`8aK}3uc0Yp$ z69@E}WqUWXFsI5rY0=dMb7j{4_VL!LpW|c|Ol;D+dg)&Q7c|GF!5K6teuJz@;^vQ> z(}&#?--ZQGhtZSiyVj>YHjLQL{qus{fS}5&Q>g8k#&_HZghg^9HVyj(*VSA*?PQ&- z?dL@nS6W0o=_VIe&AjQjsLq1~Md4j)tn(eJqec;`(abn%6PY&k4FHqlV5Wi#>Jg*c zG}r5I|4qQm>B;NvzUd_kZ|Hd<-^Ki8!0nGxLr#rDHaN;Nt~$WHJzQ(5%1Xj?#Z0YM znap7(Z$9#!qIyl>Sz?{SEY%OF+Tpb4b}($Hsx-n?HgLter3hZRn=~8@kg13PO2_EN z+5!$-r-rBQ^?gE2G?%(>v6aBak%ee@DH8BtI&1|m3|?|!lF$_wl1Vh(Zirx#g_WFu zVPT}5OAVRd8Fv^H1GfXH>JWs#ZXKQ!bXN>SL;lF@;EuiOX2%P>18ZeShavk!ev_8> zI_nD2M2EUHWax9fRA;7fHrawNtX5S69l+eu#Tx5{F}`72AlvCf-E zT+c;UvVz}?{OEV!&E9?!{0ZPd{yqr0Y`9Qjm++&wta4!({oe)|+X`#Y{|{6<*Tw7- z^$RPU)T`#1Rg!qDZfp;ozN&p?sgdyw@Ck_Nd|{M-&X?$bUIGUE2@3Xr?=Q%<=bgE4 zcz2UK)q^p&XIE0zTuqst+>mtjwO? z|8>HW^_*~QvEJ8m`fD!(rsYBWz^nT;kNtol$h+2^op$&fZuK0SxVTDGNZ{j0%+e zj*)*^iyDj){5a)VEkzeLJ4+RjD9Lg!H|=Of@7wJ}d@b|RGq*CB~ybH=>w zZQhsJn>zKG^p%vHulw987501a{ILh2r+{1QS3V0`DspVw%@2%Zk~L^l!4;P2V?_;;?h-wCbq7ICj%DFrIaE(ZG+mY}b^*GXC=6(HEB z@ZiHH)Y`ZSbbMEpX=io5^bQ_Wi+ni}zvcQ$t>#R38`-1Ig)(ER?Dr zAJX0NGB(v>Oti~=G^a)C6IAB(ryd75j6OxSG-WAaJzjPH8*mq|y2nIV4DH4N`!v+M zuQR2!D84~^tETpROBbgNk(F-8?kGz0Cm@x6REJ%UJLyrcBvcs_@O)0epnB2Qqt`fS zpu7Syyk!h5Evz%_MYZTuO+o64ApBgACUhD)#*Wgyqv}_HG++y0rewS18YKAdqevJ=ZVk9> zqrq>FoGsb`JjBARoy|M+P=vUg=#UL}7||xxY+DYJy{j`C!ctFFfH`@qQ5q(jI)YX= z$>Ga-F8;DB>6-^he z-WthUX98=~D<$rpsz}p5wmr+cRl)UlQg#sRyPDbQnE6)rHvMYzp7TytXcA5C;#yIC zoToNt*$XG6NRrqni#ySF<6o|{(U0WE!V6FfxB3WOMaB!f zOHHJ~Z5P$9%4@Rl>Y!QfzWBuw0vVan^!?n7lcv(4YZfrx)^x24^~{{wx#~zK6W9tc z&cK)g!M4Twavt^!{+b0w_L(;xe(sAV{UNmxjsv@(-TfesZ0r*Z`c?+}>JBE^ET~|H z@3D8at!!DgSf`FZ%1qS|qqiWuET-kkXpz6x1g;N0f}zgV$((*H7GTo)ZNrOAw5Xg^ z?yYz|F*pU5y8%AU=j@VD3amb60(p{<|D**CNxUEyw*I=Y`cK`&MCJpwwwF=<-Mwj~ 
z;KC=KRENVrvBLZny=uM-2XO51EmrPo?Z;W5n_zjs-{6kZJ8iK)ukAn9 z*<7k*t;s)^OQV>E6doyPB7z79jOk9I0<~@E3yb$f>lyUx68?=c$I~fdy|c!eM%_eCL^;<%K0OIQRhNW*qlSM_qTT~NOu8IV_eXN(t|Pa^4mA+e_$$2 z=H*WN(9smMIkQ*^!tBYgbN<8>HYp1HS$m$~peJPwSdy(*xtr}#3CYM4Gkg zPmq{@0M+szwLD)>>4zHs2K9-`niE~Y|FJJlT<*_w2wdkAV%|$0$01ygn;6hTmsiH$ z*q`7F&&J*-9&>6Q?zU)Ng5+5pBWvWd@9IJ`oPTZIRUG5SaXi2f8~*+j$Cq!8c9!-3 zb%1R;rHn)_-OMbJk{)TDvVt`lvrqRC5w6|%PJ+K=cPnCvc`oml>m7mLbxgWAU1*B? zIXNHks;}oWh>g;Gadru)cEbVy;p74e5Y_Z? zZTs~MJN$UfU1iV2L~s-}D>5`@bg>j;)m~q{tSB?DwXNt zbNAyvkPCa%8rd`WAw0UtJ6{ydyZHiP!WbM*+RSqVDcT}|7DRyM=y6}tp zAFeS8ZL9dWuX)!_DBjJ>rt^s<@wdIaJr?aXkJJSmf_RQr*aJ_V??)tzrxK{j91J40 zJjsbTqyiq{Od8gb{sYkia)=nWK*M(QgAv~-;*Y6J)e~v78VvU(W1u=r_?19vshaFC zkO#F8cX@|e>yvO;pNNAy0na|_wtUTY6oHW3f1spH;lAp&!IA+^7Nq4U(ey;1%-Aoi zcwc{c5nLb$c4qF+km1%r$PAfRcVBb3kOePl-Oxfbr9NwVMEadlQJYOkg9x#>PdP|dMfJQ7o@IB@$fbpb+Q2oQ{us&jT}xG};|2ax(}Rp=OGL-M zt0GRAF|vUDqpgKx8C$G?Bi9RNvGM21-dF*U#GJMI;c- z(eWsZ%x@X}Vx>Cy&4`!hH`Hhe)&dix z$~71ok&4$FSDL)lpB6e#;GYdxIvB>6x@`QXFr~@usrC;~qV^E55X=C#0giaDe0jgo zTmiF3*;pI$!pr9Spn%`$+oVp0UW*Se>Y;l^akk_J%-V;|@t@xJ8VZ|~3E0!2fsp7b z>|zwV_=KCn>^AOfft?RJ-ua6?G1yL{tRb{j4HY5%ly$b8qgN|5BwgqpzQPxoWtA@9 z%`n2_wQhI_5z=VQx9nAQ_gs3d2ntDJeKa?nB>GNL~8B`|KlM|8z( z?XcBKrzFO6~6n<1^Y1PRQwbDq{+~!AxE8L ztU%$bI%OKm7h-O*HfJ7>SS%}M>sBG~^&PD3dA4-?k-mn;&k3t?87O3ciKoR5$`CMN<<@8a5llT$cMsq{5983I>;>@6mO7GM45Z_8RL6lFF zf?L5j;)G$(J9{9k4~BN+uzM&W^2+Ay&+q0 zPOCI%wc$2_WI>f`{fMWOt+EQToqck{{#AqGoArFVNkpc30<;WQg`*ENLV*V;NEUsY zx^ChPGMdCxPL^SKLhI!kig4hIBUW)Eh66Piw{3Sc+hgETXFm3v zsUa)Gv^8knsIB`i!|eQ;EuEvVHfka`LWMm z6moTfmFAIw+UxBieNLBkIc6`o7wDEv)e2?rwTg$&c7_q+$w0~nP) ztbYFwMn$ip@B z<@AzL`?96$jCfYH9d3N~mk*)g;e$HW51uGa4A-&zY9x22<38KfONR;0ZjnW}HCyX> z#%_^hWl75(7BW?)s1)VR)1`U&@kkT!a(oA3S#+rt9<3~V4j8VUHhHx$JPd~H>7-sU z8uc_%;DzcGER8p;@h&5@dbiPplEyprSyqzFAN^dmK^ml!IQuvnE_`rrI;k%EL7nNFhz| z<@@3lxAJLsTml=H9FuZd?e*`+Q~9rnBI=V?lSgg60J>9fHTti9uG*4MC(lKBK?TzE0&=4K`eDU*b6CpYt{OCvHIUs8X)TBRsL6hwGzg1(~vv(Cpu)b;0SBj_f%y zLxXP!t;Bwcjo9nWRkJpPL=g(5}3i`ml9=4 
zc5g!VAp7c`=d61Qv*ZYTgz!0qKYAkcips?PccDD!i05<2`7mMj#f6hDDgKw&B%^J0 zO8)V4Z`qXx;lAVbSC3oTqf`@kt>c@P9}Wb)mUP-^Y-YF3JYnsk$xc*}Ht-&IbClxK zXHBZMTkU0v8&CM4c5iWI;PXy#3YGJCdZB$mo2|E?*ul+Q1y(vlyK>oFp5Q|?QDM9o zQt1NuQ`4Q1`Rjmd%LU!h+CywogDI&;+D~Wf!Pt1}O`W1+%o=r#|0)sPS#aOV8Y+!b zIf_U$=wR>h9z+_es)^$#^XIWhvNER!WX=8ovR9}HrDm$*vFbWS-xJMSrDY2VZn71>3qQM#M=0^#vG5UyUS@hnkgZ1_?96X*2ve@bljT#sXl%xZhQ6G_Dg>C7gdM9ubE zHCLNhvC#i8(=$!gB9Jh;5*>A4wbqCnG;b;Ow6QGK&LD|*CM2An9=>O&tFr7ZJn5fp zm_DZQ`?)td{+^bb_!*U~qNv*k_+Oq$18eb*4-CAy{dx%Y2dC)H8G7Mi*-;(u4bX-; zTD6NM@;2AD&4j6jJ7RFH1;yIgMktY;IjCK`$y~GzP12}YeGGQx=H3YTEKb;CCc7swcpC0UII+hGl}zd@j>;vIPIzLiL0x0 zBq;+wKHNLU1s!O9_?VjCE^xNJT(q=&rzEChTC4w(s-IW3a)V+q_;L~z+@;rs>WF6X zUrXRr%eht3+;efP`{C)gqAF`v#Fd-uAA9aT2B%A^`8luV0jL)Zf>K5k2mkiyc-2yG zsRGti`VMg{7$brxMN21nsU2DhZ7p@@%`v=p66B5&JCmR+I`0~M+({5@FS5h{81LDg zdH){9?Xpou*;3m7RqHV++I@8Td_Jw@C5TQP98m4c&<`*&ejjuPLuA2`P{5{a&Te^n@KMZOt7*L*JPu26BV?Q#Y>Mu^ ztzTOm8NoM&eYK@KpF6y`l?3)D-xXfal|(x)Lc^IUnhr9l=$o2e3{%A?$-R%~e>wm~ zXYE^qAIFWpop3Tu?X;<{8#BvYF!r4YQu)!mqi*-pA@cBZA?ATcU z1qu}|?y|Ag{%?w%{MyCAF-)dVOY44;t~y^iA7A#FOlw~(qn!q6y3*$y4^Q(vU{3Y^ z?~pH>Z~7z5g`JCY75XGJxNfd@B4m}-jJ-#H&8|IH0P;DLzhwVdk0tea4+qcD>vi&? ze20~yRGC718;dU?brLKO0AoUa2+2kVC-x;OsRvGC?mIQe(5pQu`0~2$s~`D}%xjX6 zRHJ4Vt7*V(XM6hwXi20n<4)j+KsGpHf>g~W(JSUZ&}0MAH^i$6EYbTDu6%V_9yiE0 z>o#zf8~xMqrcZZ!od*V+op{G*rYCr5|K4R>P72&iP2WN;3-3CgxkOzV01@o3gd`^@ z2Y3=s-g%4ImdbR!`{CQ131s?&l`&cUW$$OF^MwQ9e={X>ZI%^g4v!K>>xSL@1m0|Y z+dLZX5%%W~sgMDI9(dfxSB`jB_?L_%r%~XBIoj6%FYW!Mq3^1TU;Em%2oF>?&y7Gt z9Cl(>_)jPTkG5qaR$wm`Vu#vToW6N~8GKiv#EqLUn5qfkntkRl#jtqZdTo6=(%;E$ z>&16?s1Z0u)s1(21z)2Gh1ElPVM%h-pOvUPUrm1ulXa=j`bIh`m24etiX9>1_TSQy zn>)QSa|u_l-w2Lg<*7rvgkLb7x9bjv#`-ej*SxTHg&L@IEmslAu(%4G9Sg53JpP>L zD7RU?@;^l!&UViAg#M%J!}n#^<(CJi;3|h!0f$>ZAz`HY_pCSSEU`6-rD05llrPS? 
z>8_n_wE^|6XEl@*{s>DLr6_GMI^p$09#&^AFJ>Fy$C{MJ$4B)as8ZpQo^P(=KhWb~F8_|qvR%1qK>EC-VH0f2nHb>rx2X2#;fQPS zKo!QNUx^E)-|Dd8Lh_m6k@F9GLH9w)wgjffkICtNs%#ByvONq|53XlFY)7pjFJn;$ zvvH*d|3YT7x2ir17ubLvq}BhUl=>RD9yCcr(1{PW11}&4!F4CGFjchRW29PZbE+b@ zzFC<3?mwO3`6^?ncL+mKc2Sw3Zup9fPlBlIA+%hVU-Hqc$cNjd&SLmAENv%OENv}ES^XqO!3Cx^giLph>Eyq&>18cIu33MPQXzrK$$QJ0IXes?^|Jf9bxBG zAtXJXd|pDBJg|j1RkW%_61{AK_*xifu3NtXJkfwyZjsj7^FY}_=l!UUcf7dZZD>!R z_gx;8JunD`np7}wx(vXO9|#%G=(srL+JSfejHa6f^!8c^d zHwxV73q7`?&#D@@-|rPkrpzI>J$gZ&OoUY9_qkRI_1Z(q9M0VQj;Ap<%6H_5F+Pn7h%;$KoKqZ00G-d(tOAvj8xj&W za9}*y2iU3#rFs<;$`Xo2SQ1A*z2Y|@6dAcYb~vMh@O4nUMLArVDv1ClHkxSuT)EOV z$S!tbsPf;K`&*l}R^bH4x&7exYlJ}d9w{;@5{U+fy0o3&)8C%d`L#c(E1PI{mi%H} zbfi(7m~Q_{Kci{()Y!b2wU4@Hfh%3PFiAxppXvg>xzh!sQgB1H_9bH~yZqY+Qp#SA zEFl~RpP0YYU>Z@&g{6nGjrj&yUj))21q?k!=rtM~Eh6*PyttCv>KK1f zWDM1|rhme77F8c&k9UKBHpB-7wHQ_l8qSj7*YK>HObWL|c2-9v?v*m!&K~Y93jRMi z`T^aTe^OG%1C{L?^~L?2-sq37HdHJ-uBmDlst-aHGHGV@@5Vbg6ua<*gKoHA z17E`IZR4z)x|>AfbS$s%;cDn}p6r_dWpo&*x^e=ukgS`wbVDmr^S$HXh66un6If^?G0o=|hR}b~=F7L8PLaaj7IaSt6(WUr<8Xbeu)*T&1X=pH1syFohm_w6>!c#w7tJTg8Ui}1w%TlvU@{C| z?;HTSgBH6K?|G|tIK*WVh5serN_tMnGlyU}dV1^aJAowCMrhv`zo7aIaEFx8a@dZV=0mOKp=eBL4nIkQn1_+b?isu4_oXxib6pP_ioO}SQ<1MOBuF@X2ze`l zbjN>jNBXX>+kX$Kt>^kBM7pGiFVI~w90F!*{JXh8!%H@4%nY-=@$*#q948v?BZi-3 zDVGJMqJx~>B-QQ7xkm~HhfR?RoCrAHu9P_=DnAc6AAiq}0uBydiz#J{H>qx;)anYU znn81PzDcS?`OE@=GjgN8aIF)Rl5#ZWapU7R9kT9IyT3b6RU@Gj!N0D0;R|PPQo-b} z7NeTHE1G_fFDBKG5u!MTDF@T87~p97uE8mmC%j2`shXcY`wyfPsL~D8202;Mo~b&~ zth`OGN*i%w`D_~UXD3o;SW!TYaa;EuSt)-&k{cw_$}JtOXUI$edM`t2#QJXy9V;>T83OSCfXn!!ouhp*VZ0s!|iT=j5E`~n9YpAQ^&h@%%IWSS8FFzY8^AJLaO+=>@EP-|_{+MZ#~lEgCuM>T8*zO? 
z8;h>vE)=MI%K1M*2zT-eCVWr$6o8b#k#JZ0z67{5=;0d4b}O<3YWj+R!98q!J$Zr0Kl5-3xSvjbֽUZ#?zv6#s zxF9y&I$jL~Pv-fV0z)k=x7=i3_)-P>SeuM`fWN78;nv{DWsm*adxXLtLqOJ2X9gW_ zUB>iY`rTzJaYFwtw%NXlyZwJ0orPOdkK4wFq_hIk3<&`#0qGb9EueG=6Da}d8Zb%e z1_?F#E9ZlEw=+xI=c_b=E5*SXGlp69;r&n?;OGE8{Y?DBPG;79Bs3BzUK zvcDbyIl`THuhXOBk^k_2AR-(cTE>~g=##;64K$A_Wxn{KnYOQ~^jaI;YsQ|Rgdn`E z#?dc#%9C{gl!ucCw-m7**}xbOwLH%q#x*q5!w!0=#&s>1QM}zrbD%&1oE)|%XwAr2 zc{GB~oQ2fof_dLsXO-EHE3#zaLqlC9nfHvxptc&TgtI-EV+bE_tYq-hO@_;hHNIl>9VTk zz6%E%r;s}sEP4A~8xqn}Trk!4tY(B@IYv9WT z;aZE#r_BxBE9v{Uv)}nV!0^2I20}9;e0zKw{Qtrud+DuB_Lnky315Y@2RT=`ZGO{F z;l@w0`&@rq+k7CN&r2G<;Q!>z9jw>bZ1?DXhJ?|sh~wowX=bC}yKBK-XN&_%5{%Z3 zsnVxAp9&&V5J?hG502c*%AUQtCrZK|0)3}bC};8;=-oileqN~48ayRFX#^euTn)0h z26a71(!W>F66|0`Xlzx}$RO)KcY(WAbm`U$)sYrVBJRLVL8?`Honx;i$_^6xx8ffu zJyfC0vtwJaOBHTOlQ<5ws7s;zA%n{9)_2$rSPH?BQk28|k4CySScSM|{}x+&M)!#Z z4EBHX6OEP`_&ORCCTR&HmwxODt!r`oms4)(Tzq&DSzfWNww3atcaWXDOe`drL>#l! zw+PI~c~GI#PrZbRfK5s<&Z}&*2Kz1;nIus7!k0Xi6bQw-|RqwyG$FScovh;-{~EK zDXlbKyH3Ukrw>yJ*10g?H0|lHm&;EF3W`@XyuCd+HRLX(eYm`iMhCf)p=v$mO!|H5 zQdu)=zyIwXVf};!u~gs)^mZz%jI`fg?xhEG&)2ZOGd6(QUkXS<&;qji+q>7h@FqbE zB3X@3L0l@dK2o40wU49Px`(?_sKSRV2XNNS^M`1_a@!IR0%!iXoKsu;XUfz> zdAO|qU3R4H(;rXyMWi?Jp-lkz#P)=2izkz5oozazQKL{)%gh@`NRvVljb{ohunu8+Kg}u zdK3EaHO{1go`;<69fm{eyFnM010pE`OHyRrmCG9d46N6mcuEj1y``x@A*NLu`!S5n zgCy@QA>*Kc3z5-I&Rz?_%Qm3zH~f_GP7@8j@|_&bK!~rmd*fxknVx#joFy?f#;X$% z<$tB+hxbPu&1RRgm!%Y?x3;#?N?3E_?}$@5Ve~f)8bVfR4GHet{s%IP_d(G|K8M^t zxR$w&6#g+&Ciw1Gs$V3*r;;DZL-H1BPH&*cPc3I0S3FY_9a3uQr3N;sM$ddk znt8IwDfzqU?g1s>TU3;vKJ2m>SbaMZ%%2L##86GZ;$!zCwc-db25oLTFTky%jGP(& zAYY+`a$iu5eJHeLR|`Cis);OfKu1sj@rV3UEe0euX&u>hucH*39$*2XgIiTMq6XuH za(itk3(yr zCw`mIrQ2c6mp;T17!5D#iK4Y`1|xM_GW)M0kv=MgzxSvujdtc2X zpKIesjtzN5{o`B9~d0f%m~*fi^IlAm%lCa2tOc8sR&- zT<~X|Jtj16eOZDlVk;7)uRkv8b^=-*Rr2ZYt2J9y^K|DNdpwz~U0J;^vd zNlh+!XfJ_H1loZdB>GVEiLc*X|M*F#&>u{={5IUcT9Q%0J#CEvK|eM$%nC&Uq8Q>1>MTesU!*>;)B(WL25*F-!YTfU}&LKAEmcO=!ZyV$Hw z2OuC@N15X6>?miV5|xV2eHQw>wcurJea555@PiiWE+SCua3Onu_~u?+ohH-(xnAi| 
z`AIU_?l$(@PX~ugZ;=t2n3z&ZAA*p@;1`<<-6(DREiogRs`vGsnVpsY+>b>m`TDI- zb=wdBY5#>(7%Uoc=2cE?EZ1JKPx^~7!tFMR>j5rWG)J&5*GuZuma;5b8aj5RC*D-g z`JMhyb^SqU1GVR*kL_R?-a-CyVoE-t!Bh`a*mS(tLg`)EZ;V3XMTqX>^Czido%sQR z7mX=J269ha+MLEzA=bb2b`ooo0w?kg_2pA(1o-B!ZUj-@AsAiYC;R9X=up42B~nV3 zn$~>x@z0yKWoSYVC9!4MGXtKIl z8|q3D@Y;t0owMG0r* zBEZl~g>>w9z%6ES9ct5nv>jBln)9%+j@mHS2isRGy)8*2iIO`ABR+@BQE6h`pmNIGbt%!wP04me%IP(Loc5##FHuKS!KVl4H&>LGy4fXC%>bHiJGmF1H zJ<5M{%{uUpl^L5iQQ@$3Q1G(tU?d~6mRy-F`;&|(-HZZl3vP1Ssk63z#8{x7kYxbN z;3mmWNg{QsZyoq&CH>$2Wo22%vo157+8YkKLdsVi4mS{A&Qu|WN-AeMjmxwX?EOa8rT@iC_jCsG>DJm7r*RYQ7yvo<6ZlU8Alz198-_4ahwT&q{DNdE8e` zmH3S^sHsG!ZC4)=AgI>Oa2)0Ru_{TOqbW{;tFC0x+(or$yZO`j$gc~v@yYuZQFR|0 zlVE3DaG4lTK!J==j0RUlcaR*p2QAajf>{c=#2SUkuFEVdeZSz9~I-~OZ z+S)j%a$4 z@>CcN5Okwv2TE#w&UZWWajNxicZ*83LK*U7e|Zv>O6qEgsSai=W!LbGc`-8%Mkk+)ue(NP$yl90@}7B=TH4o{ege@JLnK1QVqO zG`s;NrwmL@H2b`7#B$zLzs9tfU8_X{f(fobe=#3I&_lIS7X}Lh&5@+Jn?$YX4n+~eUK@F{r0%P=amSN?a*Oq~nlc0VBltT6mdXE$5=tsqik#KchUp(V zZ!{k32)I5(eRv)v1%ayH8cRBWDWulEd#l@8>7T!Sv7gE=LRxq@T{kMySJIczb(y@L zYS!#N6QHY4L;o=BCA|%fL&N|iTEvOgvLh$8G5|2+p#2cEZnSEk5(%DB?AU6&`jV`D zOd+!N<>YEFm$$^UoY>24;B=AVVZB4$n+Vl_%c9p9L=OC=58JD6hz#U}G_tuzUG&z%0J3H19ISG;=#Cet^+R+(dqQ|e1^t0@$xhy9CJ!KoZz z=pSe^Bt4;w5iR>g-Sw9}CirPTFvSN%v;Zd3v;4Zqj-_8tvL~yUsp8`ukuBE+yPGc} zP)nHMfDC=~UsTbj#(PRE8Z;EOFaHCrdB_#tF6{bLC`ERu|D!_y@c&%PYc25h8zBd+ zziWka0cq}~oZ=VpfX1-IyyZO@?Xafuww?(R^(2f2oi6obrg$0H|EGsAyqwgIvG8V@w5DwzvcaV83q@O$V;Bvn7v);Z(i z-cpZzPO)Q)Df2HE)%$#U97^5fF(q9wo5cnSQu0*0(0Onl1b|7Rv$sq)h|mL*P&@A< zg2bItem-*Gn{KF#_bu+LA>cJV)x4vGm2S!RPsA!P_q$fY6qC88lJ_?3mI$i9f#m+E zykp<>eRiOI{+@4}o5d}Tnf1(I8Ms$LcQzb5IZ^9eo5PEIe()!C|AFZK1A#`cnGmO3 zRyb`O6$!fj3Cb|^fo0VCx^a)0-2tP@*%OzQe7!%^%_|A5CL*ogXSsp?Phd3L{s|f9 zf&W_N?WYiN$_~c*Il|AK$OL>gXyRU|=mT|-cmz~CCqX=83v`E_q#eSr{msV6@Mt-F zSVP(vndxW^&v$ZU$Njk1Ttv))7OT`0cKvK@7VZpf%I4qrJ#ogqKqOVPhE23GNIi~! 
zqj0U=urLv;466n9L)3Vr>sBSbUBhz2caK(^y4DyCnyVq*RWJD$JBEmfQ+ePvfNnvi zi)}#4;Zb8WU<@^5f~fWH zqRIEkOVp=3#+*7U1~2G63H!5W8@v5gN<31CH@LrtQMSSv?hlCBS2@+dZvr62NBf=S-ZHzaKq59wuU_AGyLnF4RqHK}z zaes#dgg@ToJ9iSGz^z<5@hOvcY?QzseZDF@U?i~4QU$MnI4>NK=83*m#RMp9l^GZd z*&S=1{0Ew--hZx({;^4(aYxK=58v#aZa3VL?5+f3KB43b!{9W(#~|6FgRbx?>6+U2 zkAz*HmI^q8N=*o;zDrW9EDo^VSak2tW&Ev83xA=}Nr9FF>GJJv*sCLOX4k^Q1BmZqnUUA{pJmmw#e7jbx}dn z>mrHd9{`e(6nR(k*cTTVU2G*f{57&7)_a@kfq8eKT?2L7wWOgCXB01j@cZ|8MZl^$ z;Jlm+REAVIq#2qMqp3WvSXsCH{Kot&Oq;Xg#H?1rQ#0M9_y7)-9+n z(|%slFdHW>bfT?D_ks&Ycl+f|1TbJI)H~A(r2)wibFY+wRv#4Fa~oDEWQoIfyGtpb z&3WVZCRB^rmK_UZq5TK^AD|yN6GzrhW=zDI)3EmMA-J=v1hfPnOp&IVwSEu!9CMEt z+K~B)QNHwYT?qB$HwXH2@x`F6Q00_|x!*0=p%-&sD?|U1&c`F(SvM0|`4uC!`Q_nC zuL9-tsfXi}QOQJGYtmj)=Z_~ZdGQ5CjR9}RlOL$_SPTTY(S2JPC#Ng+qTt}_)ZOJk zQktwwj}cT`P5L82@<9@@`1GK=qnOOAr>axgWwc>WzaU{4dkWn*Gyh8O|7|yzXI_#0 zCNkmd$}~n^WRO}w*>B+6>NMqh{E>Qb`a8p?>PTAj{ zy1}lY?=D3SM3NxzPFu*5?AHFt<%b2C_W{c|j^>w*GIPr+SE0uDd=get-hK?M!LQ}Y zDI0aTxdK`eoq;bNQJWfllt1{$&1f~a15QwDvNrHFk+n~;C{bE!h`<)Oz_vnb!T)>W zn<~{F8Dhn<$)U0YAr`Oq44;l291Zuy7(rkYgOQZC4#%7k7ZrrV# zwZJFJH;s1<&8PhD0EhC`#?k(=)iW?RT=>oiw`?NW#RVlrn=G?B4@ll`>*j-4RmaDu zxz>f*IB&%ovF!u}`k!?HP1~qDNsJ^m&!f~7%(60?<_k+ckqOO=ICC6rTE%-DlJ``! zjO6T@PvxHrbu=k#QCl=5MX1=ned_j!qXxa)^LbK(+}y*Z-0`#0J4Lv~9F7C9GcD)# zU87*%{qpvDyz0kW6IR9vQY_82@B8DqU%)MCar_t%&vSc*-Fu6u6y8^*i zPj(i=qb%pxtrj%H7xVFg_H_;ZfaoDF-&=4FvVuk~KRUVClk9GFVPR%s^p2Nh+C__( zn8k1)w=V586Ud7_*FAXjS^S*7MLqE_wX95FSHvkyW7^2awU%zvdda=c6`HUqicyxR z>D~IUD>)Tl0xm#^)Va)N+h4qkI{QM0;@l1h27rNX$NdU2bL=w$f**bG62rgqL2qoy zErkVQR7oR#B`Nm%Q{twf4HI$xa)Xj#b+6NU*{D8D4Hfn)+?;n=hP5W%3;qRyrwsFr z8>7W9qNAi zP?PG?o?y`u&mU0ej;uM?+r>!KxI_!+1y%z{KiFX?kUW8oa_$Zlrs%+nDXJRN-GN#}WE2u9J

;)nHSEMCOR2qY<5vCck_q;_FvVeJ^-1MZ4|kL)BtQ z5B!h#% z$ohR+xBKl8C56x#Kihd)QnZt#3Nk#Q|Nok;)5fZ#!GZrkZ1Cu-BpNP%MzZwn#Omsb z--i58=Egtjue1`hQl)mX{aS0}?2S;m|?vvBpavj9~{VjHO zc8cYy8SZ}fKv_csz?VRip>+!wT7}`E&Mg`=z8w8b(y?cy8bj)$vfk%AhE>T9H}9Gh z1_p{8`S)oY{d+0cFiQF_P^G=4?hKp7&mHmged=pzh6nNUpDG-G`*21nd4)Ep((YS+ zU}5o)V&5)tkWon97JPdCYC-UE=gfJYpI@oz*N3m3ivnc~60d-35+@y|7sH4B<^R7* zfs}=4Imz-d&nJb?XO0Y7vXa(}H#A-2MSZp}={f!9^tEb-eQl=(R|Qp@5S?q9_x;kp*ztABDaX;Da@ z__K{x$tfU72YCf3D9e>s_%mBoZRj8*Q&<0U{yj6a6KK39=3<_tL^WZ5FCzOZRkGb5 zXI@|n+04apqDBg%{Kb|0w#Mj~ubEVJor_mno(+9d4gd2sPz8U~=|>SLcQvdxD%x6~ zQhvi-OtwwAjcym@LA=I8ct1ESMaQK{K-9OiR!KtYFF7Uq1#}d$S5*~)?f36FF4j%<(b9Sn`5!#(> z<9nvM@UlK7Mn}N8-DIl^DjoeHEAU59P=A1u`7@--SIn2ZiLFk?LxYy_OcxSAho)3L z@)vIp7+8~dE$bJ^f9*VgMuO{k0JUE`Y-$a*vc>e?A%`(YZX}xi$rtshcaj>9?=uM` z#QqA-cZ(xI@SMSA`mH=lFA({6d)OtlyAA_0aHaw@_SsUlx?xY}+CPneTn~TO$J-g7 z@MgEd{>Rtn_*HrEtrZ+C&A&wdJX`zSl(?*!tx8MA!0oVJ&N9%2qWem~N>oNBLH}D! zFr|`zhyN&0kT>7iG6Tm$ZEXhS!bM=e)eIOSZ{XkB;y>y0q)6fgW7js})8N4ro$joA z`u>-eK{Le%GEt;q$A2rcCSZPeS-Ku5kycYRXYg_DR9sj(j!bUYmR`UhObcN$c z`GC87S+C)=jPX9cTh(?a`TrL974SeO$Z2b`-dPrkbdWrMe*I~KQ?q^%Rqv>!yH zH>-cSHtVtrq2XMnXS=<7U5)H6Dw4C0wK@o>SV9?hsxg$nv zHQnXEtrJ#~_60z^T@8j_i~%qlHQXKTjw&2>t^J(Nk+W@Gq|`PmVDzqf`sbn5Vv7P|dec;luX(A>ip6{jjX&63@&Og$d6? 
zoXWrOqQxb9FTmlOPhe=!=qe}HD;gfBy2EVDs!a~?H`#u6J_VdnXdd(E(!o-o7L!Z& z&8(MvUk(dNv9)CCkCU3k)}M)0i?Pw1VxSI_xG&5=QADep7|Kjgjt=$W^e+`Xc&+yt zb}j6ed+38lJR|9G)z!UjZxa(B%3k-O-@U{QgDC<&35xul z+r$l{SVUACGUhTkCYp*!8}Y%N5p=qB3B!0K!(_Hd9t5-t648k#r`*AY>W{eCC!aS7 ztrlHc`-wD>WT#eE)(^D!Uz^Q&Pn&1R*pH&wYksdUU%*h!2_SSE>dNTgY1ppa>8v}z zo1c^ZW5;mq{f$PH%mN@F6TxZ+mrpyxF!aCH9Ar{P2;_zT|TsO`tUWniIB271;y`4b{tj<(0 z(*FF*MKy0fw;t1xVkzc=WU?-NJUv3z@b@^seslH@7Y)e2f->!uKY?!?U{9#tI-~B} zlPG`ZHgmo)c>}rf0uGe^Fsd#IBszS%PhwqgVU<5xEuZSQ@q^%d%1f68#AWe=nnk}J zZsgxE?EUb0$R!FH&gV|g0uj6U8SZ+Y0!&Lnyzm~w1qD1bt2^OdrGr4~a!&bkHplvE zve<^c?3&>+pO1{RF&1%(7=;I8uy|IR)qOPgE$2TKY<9vHBoz{Ey9lWRpK2h_uf9y_ z?3;Mad1iiRVs_*hP#qiir+K%(+SN-k9K2{VQbBte-#rl*P?y`c5ij%WnvJhBX+zZ~ z36NKol}*W<1z%QBC%0dw3hgu)A@E7%=ym7n&HVJ5=EX-=VwG{HqqS>h!!z3ajRrmd z_4muzr%y~o_1m!@Cwg%#)ww#emAgic<$?)A0yUDLr}6vKKygD3X}8s1T(xnr-Z4+j z-_zsf8wjkjK^oK?7pC}_HkBPDs&*Iz_3|Its8)$mkc3pQ zOh?01&xI#>Ew4Tt(2q#tj}I{t5K2Ai=*Ff6e4ya((WVCqTue>vZ>y}6MwF#8hSyk0 zEWDYEjdyOEbT9Z=?{|ZXsb41aB=p_;3Ho0@E(-|M1)>Zz9CLM!v?n!sQ}`w8KhO=` zEp}}pZ|jWk+jO6eX@Ky}l@wLj2ALcgGPBnOZP;a^)RAKOfO4CzcY^orZ@b*Gq?#ha z^dwjFC?Y<)ibEzwt`8;o+>u=%?o?~AQ%ZE2Qk?V4Y_UiC=D0MR-jBYy#O2M0n%3cR|t z0D#&Eakh?#$_N|}3cJ~samqv(u%<{`_TaVeSNu|sn|Np;{tSlGmZgKKn$%^N(!W{! 
zGNrWjC1bJ!B7uJc!|5p?BcA|D;Uu7JclaxVCYAauQ}Du=^J2(GA0-uTURU|VQ_OS( zv}#FGq=1swM!fCiSS@7<{1LmT!+_W*c;9a6^@jZIlNVZU+Vj|nH{sQ_$s`yO6i}HO`Aqz_3H7&M?Zu5+Gi1wD*`eZ+|`bUZ?qm4mygp>N?03AgqGtbJ}QJb)-+%pyNoaHN=WXIHlof4;-7Imw~Xr-Gp^ zNyLU{cA-4g+RaV>2$Q&PDKe$jO_j6fx1`zMd+v2)M;*hI&!i3}?9kRqd3K}5wJ^uK zWXwh^K0+xa{#HQn@@dTN_ODB|aT$Hjmmx>)hLgcw9$+5d(PsUW`TN<1aus-EMzUX< zk9gC&T{=vu8NRF!J`em{@|ZO78T3u!#J>KTt(3s3_upc8(~{w-M%tU899JD`QiR-F z`-tHmM5?F+QLj0EzXs9ftat(J08gUfJPVQDzDvJBlU~Eyh$wv532BHr!QAlWGr@K$ zw)*cb+>WCNX^_y7&j)du zsSJro;8)Jv%^?g;Jjb$PDOMe6qg>vhKlq1dU)mA*zW#+$zn(gJG*w%*{kF+d zp_xHEOmkb`$Zcj6G1~dv`Did3=B{4Wd@x@GlzgOD(?rPsx zHM}3pe+;Ivt@MUbDAio1Ud(`9~|V?-jMBPUg-Qs<}J3A?|tjHoRnX}vxriz?3lY|M{bvUO&hra_g*|He?^{pg&KRB!o#_D#Q&phX8V&e z=jf&d$%zB)p?4YW>Km=mq^9uzrga{f0!O38t?w`Un9LvB+uLg5;4jImm|7AJN zr964{ywjFjNQ^xpxczGTyXZ&JO}4yf+rQ==!?k|7+Y@KYg zz*#~_heB-Jc;T+KJPAkf2|oktp?=jfO`y;EX?M@h7Mmuxl06-ry{NrLi9!|H8b?tj{ScaK3+L1kXMIE3=IGT5u0l>kP?x1LivmNo z_dkWco!N&gFI>VH5yqUZVJdhj;d#U%dVgarH5UXv&U$2~pPd#k8gqOY{?c@QYVrkL z8T#$LS$!~vW+)O|fRj2|twib{pNJkY|Q2aJ_0zH;mZG6n|pW;v1lav=F zD~F}I)x?4NNhkrjbZaoeNsb$lE#b&%SWhy&yy__MNRiQq)&|HBxs@ryj!3}s{jl7D zi9jf&f*jhe4T6ZKxKL&-t0!~cT{j*bsZ*m3sgU{h;m_jDw2nBh5XZr3Pg@?Lo*KjS_V--Ga%fV}*ryK+8v z39zrS`Z@UK&|-fGrE1W}a@H+agiw(cGPJ1Pcv^Ele0oNxwdrjIx42WmjVQ=sbHsZZ zr$$QAHk*4R@53He0TaWBY@le>8%T>OCDHB~{hqiB44m!X8#m&X!_hTR@GTi_)d*eC zn;1jFPI|ffjn@>a{UFE9SXWp;pNw#{hpg}ewA+O&R_tAnkfW0YaGXf6HuB))EaGHG zD#{tsptLvld(UHV%+6%+7t=Fu>(;p<=VElbIsSescq0RTOG$D>5ugxIdwpsP1jruc zlQVr8Z5a#;N|?xdKdRJ9PrMfy)f^N8Us9Cx_|O}L-9gjcy}~GA<1381D2PPkmU!W=>MPw)q4SVl)JcCeGBo(ibS4 z7p^P}7{%D5XYTMY=1BS|L=D>05)%DaQ{DTK_X8h0!7A>kehNyNR9 zphdhuLI8ZLz09*U*yTv6@gIoZ9?-q)hMc+PW%L5%=a#fU`e0WLYRLxr3egw%I|0k? 
zI8wZ=4UqUoFpf&?QR6nPkK?&GySMS-_>T{LCX!LR7pm+JBlNqpXIs~5_%$S!}$aMXAuh_1IUGle|{@~62%x2$IlAWT?+tmd)#{yk`Ybw zaddJD_b>I%r+{FpH^Tc=4dskjn}j~HhgeP_5`Q&L3#gjKGp=(0v#Fqs*@HCsDQy} z4abkSO2Sm(JMRR`t6T$}RAp_F3HC&p#5Z>uwHGem9P@3Ba((*s^YpBrEAtZAVJ60r z(oPoHAEke;whsMOkimWjayYqc7>Bl*HtlUGtI0MhQNivGx1ZZMv1@}l>Bpw6RrsuU z<0@y`J@Fx(R5)_97ZnAJUL$UpVfLnxz`41}J$OQ(P@pd{%l6X=q3@U2^752Ag9evh z)ICv&gG`~XUo(&b$Uh=br4YIoRoVr>=Nc)zl{wu5K1cKIl|lYg1a87(}Db$|NjnY04y&`M_6_rvSk*sv5!g@R&j*7EW? z?dLP<-8xc$E}WhN0$DBWK>nAYB&5hLY(jbiv{Xzob3Y`K^z0)P|54(fbe%RFDrktk zh+zmwizHwx{2${cf#u91*DHJr2L93!P0;btC-A|ZCa<1dE&+nmKE8!W4f6CgXw!w* z9!vAJO(K;4&kNl{5E90QF{tU1RFaylH zkG|4}rJQTZY=Z0R`bCg|F~aNd1;d5EMOHAC_?W?Yiq9sUp~^+7Wn0|+ z&7_KAL}w5mt((~W++Nb0_ZBX41GR38ZJgU{A2@U{L(kf zaQNk!&B#Q|mF`*D~e_{Wr&U(W>@G^}N#3{p!xi|4?853)LU#w%w(B%_?(A3XKlBaL! zLA?74{x7)5*`wW$3)!^D>d*ma_Ll`AXP z-os=!#Yc=kozfXFUN*e8}edw30pPVNj$dzjWwZZ|q?qIxv!uwducb6E+EjqK2(h=vjb(tC)e%Km< zi%$6NM3i6X*%IPfPJxPds|{S|6>K_$(Fw@(58s)qHqP;j*lkTdX@KUTPFyDBXx3hl z$5WJq=e$<7iZt%C_{B~gWfmN~NEpF5bT5A;=FadRCvxS=AFYdQ?OsqIMve!4YH_@E z>@=QSZ9lZQWZX$_{d1p(bo6B$2xPbmILF-Yh#ebIyMEw@eN*`#cYo7|OXHN_d zzp)8)!=tUg^{ksj|8DlMP^ZiMA+Ly0>jKBw@1^~QI+$eV#QuJ5+t%3>HNVcTC@e7g zfcBvx<40mHQtTQw1M*>l7$aaROpj5y177nwQOW(x${QbNcCBSo#VsSh!|tglz91;6 zaM*3ikE&`Gr#(KFhVua)oFTS(&4g`tnYM2pQEOKaBAzfm(@005Z5U6Oa+FblPLc`- zi)c+1p%L4Um>I+J099}uDGxQHh_3^sE=Yx&!k zxFJiMx=isw_TAIpk z9f9B6?>~bZ-lzhHV;19}5Y>%13j1z(RhaGKpPM?WCqV&X0+S?lB zh*_5SkNoG5rOc~+R9bhC_2xNCSq>-Cx6YM)6KYrOB%49pb@`2GBIPlT`ayEm{vZPj zf+omyxB$mhmuwEamVHfeM3~3XT~qyqvF5QS@f*jcrVjj{PCsv{K8|oyMZAs3pXBYr zKQy;P86qTmCqkGW0{Rbi{inZ+JhpMm0*pdJzmx=Yco+6Q5{a%pO_F?&n48cc70~?x z#{dKOOll%~&Q|(c&hkO2BnxAvrg{mil`dYT&IImMBkVZAAHe~%xW{lk)CR!mRojjS zAJy8YjY|}}H}EFkB`PZzP5Sa;#{4yU+1L>7_x;}6dmP@JVP$=5{|a3G4`b+NAt>$h zxpVXy&=j?=hJbl%(C;|*UVp9<5>D^TUGPmn$1mcFVRlU>HSJ?Kta-7ks z$J5~#M4iQuh+F~%28D^AN&1r4@-R^7Gj#DYkLi$VM)0rgi$7K|h1nl}3&i)6gX~fV{{QIH^6T1qUbu70^ z{1C1+kIAETQ&%%Et!dH|Oz91zmlg~f``@~Ue8&pzy(df#C(PL}n*F{*zg&hAsSAEq 
zpzqjgIhN2ozAXW2a~fAE_8);%gF^w3n7I&Ycm)>HultIIHx-1mOWI5`l6*wI8fevZ zI7pmRcomP0QecC-Bx%ze+UKyMi~{?dPq!&3CSO}Ht-iWeBiHjOk?RU^QUy_smu|zL zcdnW*TKIvGB@oA=sC3R99$cF2MG;@UicY%fyb*7AcE}AefG~9M8KFl3V@^L)E{2Q3 zO=zc0Qn*FU+qt*Gi*^F(xDc8t_6!EeGRMb;0Q+eEHYb|1&*XInb9K#wp&|!KUhW#zEL$w&)YY1*FM$fP z_69HiRqrO`1cBVR@}CS#+dSqJ#2ozm4`k5V3;3dvl=r$w0+Z{h3Grula^2%a?0~x+Ra+muyS20kDAXlA?kkK*8wqL z5lTJo-XO<~Z3{mfm|j_36^OlZl+e3TlZwsHQTkl0h_Ji|{ylSJVEApjmg$({Yxwir zt)pA;Pee5u&dOu>KtUh-*&pzleljlDwwqpk*{GAR!9qBzK9z^}T~{_LSdojIe6B$$ zpnWXhsbMZthu|uD1NZ3lI{%z$U&*-c+dGZ2q6#&U-_!T{$K|T-;a!{#(%t^Md*Z+D z3fGyd+Xw}0tO8?&5xe#G9ZHohUlhN8!7SJDSv5*<$NJ@QOg>BHuvgd}`E_s1pkS|n z9=I;bFr{PSrV`S1lT{k~%xHOnN_=FgJ@-N$%NQ zUnq;*&4O-pWVYaatcn6u(r9AfKhKR~-iE|Dm18{UdvR=2Lgp*)G8lifG-YfHJZku= z#91Om@W}3&9<@Xg>|elg?$v*wg?jAE<1QewF3i;XN?*k6II&d4+r!wa&Ba;Y&r?Tr z9h8um!tROo+dLnfhy?bNx9i+Urvi5R&yc#$`_7_dJ8R8fOO|dXq!KI^02FC%%T`%u zWbw`%UrCgxos_!+SG)7?%qQ7x@!!U)eeg%~b(4;S*Ikz|ElXF8qqyicK_uB0_$&NT z4&Dr0%t9#X@w4H^w_KN6RNMtAv#kxj58b2&p7CD{Uc~?#8q6VwDb<-4Lt$M1<$2Bd zq?OA;!ab-*q9$j_5L0Z3B^;}@Cuay)*~lTzxxpR6NHTQzKuZ}L!@~1pXW4ht1w!M1 zWcMiC`Bn>akB$sz>_#L*(*pqBcrl6$Q9d>v`cnfE*2c(j6R_k`vx=`(UVhz_&))<-@w3y$ z8O>96i>*PoB6jQhL)*e6YbFtCBN;{$qb7&>(lKn$qPnXS77wA-GF0Jp5VFIB0~}jb z<4oD6lImtdtfe9?Aqy{~6v*llI`JRqWb4iieYaVci;PFtnZas%1$X)>+7_gms+2y< znzjX9NaS#QEKYc4$hB|??T=qM-<$}B-?!a{5UmexuR#sMqy_%A8ozdXSwmiH&StVP zf=?tWH@ySOb32|GkcLoiyA!InA3edYn_ghomn;QYj`-Q?V ziYFQ)wngIT1lKNJIjn}7bz3?uP|}YzH@mXDt8_C?r&5%b0%aV)ToYFfFS=3g2!Ki> z0jojuvx&~yCTjL-T7KlVbo`J-ph^>}Swdt4GOpR*`>4GHsa3(y68D?qWl)h|%>61K ze$t(weP^$u*>bKKPN=~;Y_k2#Mb&n-gmQJ0FQ&H#GZ04jk#9)dt2JK#;C5wqM~~PQZN>kh*LSOTof7M2A{C^==P;3X>3^{euMuXBahQHYQvCY+y5)CyLND?yJ zk2RP?vRB{Qr@37H&vuJi1t`;>nU0F8GF z=w$+uCRyRnkGDL(c-~JrT{VWUO`ANmhS|bJ@y<9QZQ5ZuVGAXDm-%+7+q{*Qo9K4t z!#s>V1;hsD?)P2ig(;R0R{dGaA~EZ2clf+m@vNHSdo%+z=0*YeGy_rw7Bc`LG)L$T zwb@D*{|t;`jn|Io><1}**wXtKV%N;+A5K*2&MHyXpj(W|u!eS#2U!nT@o!X)XokHgzDR)TjlzkxJ%T6(!} z=*bXZa(4q+_MZbY9i=6IGEW-M>aP}`&)zP7_HuD0(ybVHrddZZuuB 
zf?7N5t{(EuD{^R4|Gju*@XdH-_>QGH$vVm7upGdY;emGfvLooWm|~a)^JV9EW*hcA z@T9n<)Py)~<3+U8q+G=52_=}l?-OppPmRG$Fh*RE)K;(CHa~T%mNTjD?h?=^ZSS8E z_K@h{w_yo!hA{k4HlzXiyEr6Erm}qzYf_n$arOIs!Wa5CgN(kc9?V8F1o>!2ewqsF z0+PR{rE*z~B=(NeFO2M3^XkpCe?3vw;NSjbeJYIvC<@~`3*EwZLw-Q~y9E;-5~8SK z**`KG8at(8aUh`QwN~Vn+Qu&sBK-=Q0MRX?kj!%#f%Ez4)zjK7gf# z%bE}9LCfImAGuLIv>WlRRiGev+pM{eO0u%6i?E6yTbIIW*sLzgc4A(M6`wLJcWaKi21foWsUn+z?B8i`Vc2Og-XD1}e%i)+C zA}9HbFtnM#{57;6MK^cW&?z`3K+#$BM~y(>)iqu`;8`1h8)m_}s%7tZ z>R>ayBG@T0^UBGe;vS5%5KQAH^vajXHE@Lpo+ncqy|;B|<#1O%M>7l$8)l0uQzlt= z*LKI<0)}9iv>XQ|n0gChoC8dx*NUT7m zua$X!wIedLLT~~vuM`|z?Oh3{9E9pW6BE2I(5%w^{Oa?iWa55GgQ}J|!6JQKvXG9c znaK!*@>H2fqqxhJw|;*1L-f;MgN-rj9UP`A#cwMkRje(&(!C7zF3Y2yvm zoP5$gy@(_jOa@?9eU1%uRb~&;fAy?@NH-ar7r>Ml$}&mgBBuC_Y$VO<OoU)XQl%j0Gk|FF_hYUVu3e7`(v_^wICx%p{m|un#kV+(ic>=hay7I-u zZ^odYpmJImt{r_46#b#e^mQzQ0@ON4uJ|p@phZe|I4zKVkotdsfQ0A;=)%Ei7nABW zLvP!Rshs#bgf znHg`7b$KQ9!8ZztLd(GabtP4DjR#qgwKeTpV|!hkAfU%I=y-g*)@{I7iL-ix4+!Vs zz5r{chH<+Quu>2s11wz~tTgAtc>&DO*(D~`seDXSgiWh{Kh0NDdWCr z%({;^n<@Pw|ANLwcGK%xZ56zM0;;tmfx(>$fhBz>;=o?})VQ}zyhwN@W<5_{8dO&O#yg=}9*+U=p}3BJ()?PR;^G1{jmy!3q&Md~fglfxY~uvGuz;$7 zZDo6GiXNbN8N2uA+ZwF_oA`e!#M?S5792Ec_Uuw#L(=8tBX_(tq|bsE?<#aYD&aj- z#%TKjo-O+mUl&gP<(p8V0o-1O#pHE3VSr;Z3EnTXEhpLG4;D>T=5P{J+t#Lmq7-BO z584r~V+v7CeFjmrlc-8nu~n2t+zu*NKp74{(zF!d>dCYU)39Bb=ix+k7;ZB?w@jEO z(tONH4;lO(b4Lb`%s(EmA?Sb!r4EKfZ8F8>u&3}Yxzsz9z60cs>3{}UeWe*fUW`Cu z#E3Bp?DLn8dEbp0Hn)ZEqnoU@@~8dzLC{HidVYEOJfp`(skgQ4W>cKeqo zZ~&dWhl_iJa#2!S1-C89Q^1L361WiuNw$C0pA>|yosy^o71Pj~f{6|Um)?`tOSjik zW@F&o+24heBrURr{!1iNYijq@U>UOW$uJXcKyxbo(R{jpO(jrxdR(FQC-V^X-tCm{ zy8R?cOe^Z_yvp`-akK&_t0zmZ0ou9AIMf)-y~9L-z_e_FYeqDxB{3N^OsD%eub5#U zR}$*a+$mydx;s6PJQa4^$VP+dt|yg@=1#Z%g6stwzBy5z)Tnzz;omlx;eP*$r*a?8 z8kvJ+Mzq5$w{d!jNV}tuAtfU1xj#)8d!3{fhvB_^0Ov5$T){hPX7C{>CgJk*7F?6- z!dDX#w9v$TEXmQN}#0QVmxc)#*Srl7&d4v>|?p$#h6>JG6@{EBWue}!4K02ZADuTmh% z)rT#=fj2ELk_D+Rrrc$S-wLVz6;9()OahMd>T9dj3!n#vDy-L-IsURlb-Azkj7AiX zj?BVHSkzEy8)NBgkrr^3_ZSR 
zBED=>C=rbPs|?fO(Jy~)rMkb~*DM_g96A(!cvL}Wfc6Y6d{GU+epgU@$NR!a&%V7$ z)!%3hd zO8zoG2fo*-Xw)E_F=fwr&>RQkj6+49i26lkmo1p*dA-Nqp3vl9+D7o6yH|9*tH_kb z_H@!`u+=*3%?M}tGQxZfT!e1hnujyV)Y z22x^=NBUG~Rvy>AtD(!jjbKsyWb{vqtDIYkxE%02eZ9LxGrs$IC~ffPS?p9ekz$Mv zNg-S3^h0}v`<`(}b{65!ro_!lnc{AMtV7C@(*T;nRov_fHEepwin%d9P!~K@P4Hb{*Y5FBggq~gJla!cg#{&&sJ*YJBp+(WqY-s0koXf zKK{FjXWr;XGN7-(96>k46@tS^VAVL%zmHXxLs;9ZQ`#;MLnu|i)|ai-hWqt4di?G=jkQj*7O;s?cLp>w;S0|Z05sSIn{T?M4~Jn(+oBp!C$GZUYGg9SLwGF zPhA~0Tk1MQSV*guPPk%r;O%XbSR7Mc>B=N)fs7~iV-y#XpZ_wv^>w%VSxNHcGIwdS zb2g>IY7w0i03Fb_69+|LFr3^!3XNUyF%EHe8N1Q&qA$XQZh5d}G4nsJ>~;7$oOf3z zF-;XmJ>nn(=DRqX`}0hOq$L+boZ>|09?xV?+n2QaS@xBBUIAlh?!))HokB5DQ=xz- zFj9;$Yb!|qr+)s6!ucCNg8B^=>RxO7mN=DzaruUj07!vo@~WZ4Ha~^GQ{OjvuWc8T zah2$OD>w3HD9K9Uhn{cYBV+i)xBxG68Ys=DfDT|x>9aW-eDc%b6Sj-MUMkQC8n6LU$jxxh zRCq4q&d_bwB@k=|>+aRWGDBPpZa6s6`6|Ent!bDA)#a6T1efCOcztVSwr9me-i6M9 z8<_^eQu0|kB=b(&{GtgGZh7(66-{lEc}@e~%69%Z;F%oq*8)&K@Dm=|`*@^~#gsW3 z-sM)$h3L19PrzChs*TkMN~gG&dDH3~!cy-pPOJWhTd18QWuiw0G(~FX_}gUm%aKw= zyhMcDDY&z`1MsP>BiY?u0Z-=jHFa|?EH!gg+vu>0LOivV9KFYH0(4st<1IZv9MUtr z1in3j$MoJ8pxtT`ZXlH{654)JS5k|kEdc&56|=CFBIx(qqX-E|n+#gc+AA+Z?XTOP zTGE#=05@nYT^X_Bq)+X&-qRaq>&X=1%^QBvx~J^ycd(rp6ANF;DTz%0i}gW!vi6vy zPL92ICA!q3#yhK+Jb332KcB*?QffkZfLP05kP zHw?uDczRfV*=TS(3)q=X2^?JeN;>`!{6Xm&qtKH8yedqnuN+3)@dni+=Yiw{OMv z#bQ5&DKP*{5){DeWPr$!ZyH4Cx!?lNRW@!^_nQhXcn*w~=4@}b;wV!1DG*F2Sn-EE#_G{=HW z(QJZX%ZN!o&o5_HDiY7usm3Zeb|!!xr%Gu2c86~NBC<-d3kmMaOw;Vn%DX$Anqn9g zN)(kC0=E=)qNq^z-$NI=n>~@zEgc9qSU#ueKxqMAqiJlD>0%sN;17fMuL2|(I%=;7 zGN~bh)5K0Wka&^t3o=Cfxw*XlKX#{|m^Ms)=m&}L#fAT>!HHt+xh#fiCJmyhaKJz4svX2@%LVZSndCAxiDAy#GVHJ(po z+W?&mEgAA{x)YoI!LyYX@C+0Me0NIv#`q^DJ07;mAy+{LeYlk2^)yWW)&gM+iVL_6x|&I(Ze8G8@3`47MY#h(_rVm}jl1R!*W-OUl>TUec4U$zyD zS<}^ZnCIM#@&?W@iB;}mxt`U!{rE@_h|nazeE6|ZF-0i6GOKxGPKNfwf*z1q$t5wj zGmbi@gJED@*kX8p{A|OFNX6vqJM;S5mY*7P@1ClUfS;mY?qXfhzbwmzr*WV2FKOO# zFLVhW*CA4db(BgFY9CrNXZP`gN_RWjTfa8Akv0&bR0|;!6Y^qMGP>!Hky}hrOoSa2 zTk@L_LjNGmhQH<>(!ZJ&lQ(niiV28vhB4VLom|=AOgNbaM$W<^f7+G~+biyxSMAf> 
zU4aZlPWqQhCE3VAsSC@j(pO#cPd~a{H0h8yPL_C1eVWAnJmN{t=6&NOz*c^?dS{;1 z+#lI!*{|R8+$Z%%i_pG&`o9f%n%m$$rKZ1I4T-CkhiLdlXXd|A`=+#alwz*r?yB+%ge_|J#B^lzx5UiEI zv`b0SHK@k%CuCbZQ$&Hzz0g-objB!+d%}j!EBVQ3PKFBI4Za&bJyy0Ao-K3pqjl>Dx^*qdj`=@3z8wv4Nd#oTds(zEA2xj4TuNfvJBQOiZ{d z&ZqwpEz#XxFP!?&@kfJcKeaCm75cc@7dgTQ>dfBFqE3}Osff7FSK^-4N2quz0dK#3 z3|Er3B>o=_!4l29e3yf2rI%rWJC|ZwnkhnF=HrUj8_=YH?53wY<_WpaT83sQyK%OW z7KU-dW7UR)MxBphYkaX!K{0O@AE*@wf+75a!J}`mtPm*!v`SIJXhJfn(_B?`Uh&GP zkB`u^_a5)Xgg)BlOX01R--+Jd8C!u4`M((K>zC3W5YqEl)$v{)`CQg%P2cf+_ghl=@{l9 zf>8ZOwMWCyo(Ii%mR3Y0Oaq^nOfOlYWL~&1mgoGdM+7aQfjBFh#_1MylZqK>7!}_c zzwr1;p6I6Pv=VEwj}l4zwhcQ z*`ha&nsC@Y`=ORCt!oc*9fiOUNKD_T+=a_Kj@82|y-M&zo4Ya{`FL5Ld}c&aP;xW!N&LvN>VNrCa%o^QBgInzU#t)73L*;lwA`1P zMtUqh-|A`jOu=B)$QAQBG+|rDICqC7m~Oww#ft7qR?G#OwHXMaNE$!XPa8lVc=IZ| zs}DL+bN>NqHokTV#B~^Na-0QA@6fOXRI{<9&1pNx+~t?j*|vAN2^jLIY^3?|Z&Af{ zNR=a`1|(y?-XLZf)Kv4%Tz8TcGHh~WXygmH7NvV%bu0Y`aI{=G7zpK=uMTE_y+efn z`{6v_%#PuZjDR5Kk77L=q15gr4qLgR$_zOBJSnVo9dlCugPZPp8m}$xW&hZ%CmUtm z%~uZ$W!m%#eocZttE#Pq5m*j%S}M3??Rt8+0(l`O2#Sl4;-)a>Wv(DKhYP-fy85{N zmKL9Q>O!4ar|xgn^m{*Ov5MeDS5siCiv;1?k0+O;>IV)Cqe*Ce(iuzsI9HPem)CYk?w1 z9){Fgf}bzdz4X%B)UMoqi`Nm-5A5p%x9OU8U&`kGY1E zU+(_3()4icIeHL}e#4pKWif6NoKo-{`DNnyzTV}qjjq}IR)z08CnNiYVfcaKBQAkp z)e{y9Js4&JaX-rW^B|i0DWOeZ`yVUjash<>_FSK6Wl3 z_u6pJPG869;)W^AKXPE50al#v<3{F~8K{s}BkvI+X*$?)O}EP>7$RhnDV7#5!y=B&i+wG>Q+2hBacA8MbEh{}^427oRzeY`>vCN|J;bdhGC=T8s`+H#4xfxomxWNU=&Y< z8gj@-z0*PIMpUsq8>KPr;MaUFJAm|(+5#Dvi|M3vzPwn0BB#;7hfhoh4)yAC=MH@M zx?W@V;x`jSIDvv6=g0D9<#I|YE~N{c^Th~_i7yr&k2>h%!@FvvA|D6`_R9EF`10K< zB9VzN=qqqvP0x;(megZE4>CLgV5zvQ8p-hZ!1*P=uQ?@VNpso5%6$NvV#ydQhE|3J_iPIad=BV*DQ(nKeyArFvxx+HXl4tPdBKA%aon zFDI9{<1W82+SbC|;7N)~5+B-UChLI|p~tI>!^5)l)n_23>Z;+G4(d8g-VgMYnixFYE?*v+jK zXWgL@85{(KM2`Q0waA8|9aKFwCu{fo9j7A*C9;$~h*n7S#( z5^mH47fbL{;<6BS<~@xMS6wM&`f>66KCvLS2P0s5Rr!Z?;C`{icMr?(edB6&CS&NJ zW{&2=U*|5RQzL?IqR8K+TRV17u3|;{-Sa%&6)OBUG&@brQEDnYl@xkJI!;qEZ0L?i zSl<&br12z0@TetXh?I8QKw@|toFTrK!lXMG`Vpy)_`8jUPnaYoRjfW`sgRyG&b{*5 
zVFS9_pSq5lqac^+#gaXhX))Wa+NNOpv{lB*m(w|R@W z_$aeBJ!)&k9_*jv-rS1h`MsPtza#mRuUaxvODq_ao->>?d*vG9hlgkgvjCA3*Zn14vw0^f65hkzC_O~D??$4Lsgm4PXG6QM2 z_96N&t=Z=3w=QC3*CJlhe*JexSNrFyj}^47{TkyDk?e5pD(ZQGCnq%2wZF7 zkT+hFrMr&6@7*rcJUD*;cKcankcm$y?Z6q|Q=91dcJx6<++~xgO-VT1!|a=b`-?%$ zJ@dj}rD`c}Fw&=`KR>HeO&vLENc8U6xhhWEx4#BhS4mQV1Tbg3i(N39qNaj&wO%W{ z@HDE$xfC8{BKRu`cnI?S>^C}HeqP4)>$1=I^fv~yijJU1& z(k33MQ=B~R+_>$1IN+L`?DYe2$3eh=^90cc>tf)P)`}>}JZNVT#{e4yzkZ3U|E!V^ogtMLz zmKR7K<`Ejv@o@IN;q#hU$rVJQhkE!Iq`yP4yGiLZl&M17k2-M9-BER5USvh;nT_x2 z4niT+{)Gfn>E8mKiUyKOWxzReh`JYTN*qfw8&8=3u?+n1rRQDSC#G0ct0GTPf@KneGg~?$sd-li{S8br1l9vB9i5E9RpfzL)`JQGi{@ zLP1Saw4nX-Gz~vSGz|O3K)0AOqdQYJ6-k!Iy{~_PoJB5~)tDAN>e=Mum)>_3SDPPs zj4~d46SgHSJ66zY@6`5Xep5#CI4n^e%`L9VHW2e1q9~3jvLP`jttp}Fk~{Incp$iC z1wSBYq3yOcye0Ne1$f?Q(Fc9IDCuU<%EfpKlIu|jV*&v zq^w-5m^|RJlGO(H*Aq#LJJ9%GgUVgeZGmYDYk#%=22oE>&AHD_9;+P9`W*qdCVXc(@_;=7M*i%kU4{eX%6djCIl7Ap0uE_T1To zgn4VF*4SzJLEbR{nV*(|3b6_s7n36-*jBfiwb6F5()QaM#&Enf)v008*Sv!b#@2W~ z$6RGN@djgllOG~IcMWwGVe_s7&z*+@%M*2TYIOJ67YaVBLfD0S2T~O`C!_^b-;&VZ zlxKfqXyJZx_$FnhXbZ$Jf=GLE5KpKblz;gyv6h~w-z*o7^HxBAZ(~XMuHb!gIYzJU zRLDzGe`_LaG(o^)1JmfO@UxYjphXrjVJDBTTr#cJ<(W@3;H5N zuq6>|@5`iun{+R$U?P!R{N{w#mOZVPn3IHT%#nX$NSD#qKzH9cu@H5mss(mjyBkxFqh z+~1uTXc@Q8KxcM&z4qxtcvQESr;`7VJ8560A3@N42`Ii{FTWEGnlaz0xh8I(oeOc{ON@JM@1@o4O_nqu>lmL3)0t)5x*BvD2#^)W5Z@^#rd75k= zPdBdL!@~Eqij>~-d91RtT?Q_`r4MRyqoyPRMJk%TLqzu-+F|GNxR-6*)o%|Zq~!Sd zOrFVs!7H@|Ym=yZEK+6VSy#8f&cqJQ*JRQnHP*`IzoP;F0ZeXW!Z7<~<%LTX5;yTd2Vm-{D@oTqC+?%oKZO$^ErVm12tv?imqSR0K8G z^drMdFXH%{#Zy(dK0Sl72o*v4A=2S<<}pU_g~ENd9FVWCX#ZG!lakA19oxNMEv8p) zJ(iXVp1`%=X)ih(TECfVuucr`@M8qRf*r>RRP{EDGTG|!N5&YX*Q|Z<1Xb%U>E^HhShW$eGYNRYT^2&T{-G|^)scO@;k=k&yK0O-dq!Ou|XJcnw-|7e;q?7;H(mSx?&# zQbc2sv(6o6DLb<-`>A{7Vh~JFTo20{^lO14*<9=yGSH%ClCByO08lDU-h)`Xp+a;I z$1^=C=C;2lj2OuP=kNFJ;zQ?WDa(p1qA%?e1xfWXSke>>6`zMJXa((HccRAf)WE9k^k8tWj7uiNB<)c2zY;U*#*AMVq~m z9vP7}9n7cSf>)dv+ZBqnJqli*A|Fuf6Sg3`*xu5=LtURy+{o!;extwM5aJ_46VYae 
zmp4(@{BB#-W#%Yej1rb4EA!!w$K(7!Txv?#EZN3o+Q3(G%VbK|q^R*A8Gk47Es zFYSqf(R3LqgCX=|)Pc}liBE3X?kGX+=(L2p-?yO^HXIF4^5A7eade1g=B?kpiT-!lS6N_}&3-f9NcmpCj(SB1I!Ir5#e zuvi2-TX!D z?NtMFS$mz!5@ctm6cdz~IazAXbh%p~%r0!@N(J8A2;(0LaJdE(OPMY&9@}uclqug^ zu5-|R{Qj-;R!EG_-{eV>AX%I_9u+uQmuO%d>+kF=VTX_vegN0mE#)&YDH!yt9zC7R zBYU69!`xPw0ORFoAcGpgoPYk<*6F#AX@w52b8*DGbRLhX!fAKy@ALt|(FB_xQ4&nC z`J(AV<8Ydrgw$;oU;eK3syg{ME~`;}XVdyav*JVIkRzs@lGAZm_icD%0*Wk%U0q}9 zb^Ne?qZh-!x>^Fm$21nJJ6nSu-i9)3r~6C91&nF1yB;0cQHnz9^EQsH3D+5Ex(Pea zEf-qZrLkGh)IXCkxS07SWLf8rc087AWfGE~8AAdeTKSqn+94Cjxg=P2hGYVXPVju2 zsfHzYXs71vbT}BM!;AMHK+T10r%wi4YP`}hTDAPnHsZdY{@p?i%V)M_LjruoU4hp! z822l;N$51c7hG;1cF=dw27^SgH@z`)GlJ@N*gv9dV{I?BFrtidN%39T!@KitIDF_y zv7L@T!wA*oDd96QISC7#_x#6eM0tZAp>~FNHOa-xqRnAKE1@q@GaD2#&n(O}VJEt3 zPEL2xYUA8Lbb_4HaYxoG1^-0owNZqWTzC(k$$mxW)xnX07yS(K+mt2P6;`j){+g~W zh9vDe@D&@_D!=~F(|aqkb_oRhsiC*uWLiqNC4^fJ{9KWx3N~y4zlSCCWW6zsAz44B zyVoL*uuj3`nT(L53)~I;hw$3U3h!XwJVY@aN>U^>#}t04TG6~TP-aVhD<7xdA5e7Z zmo;jQ=0_cLF@G|XHEFQHGrLK@E22zm)?3TlP;a=HdDM8>YflcFGDFq`alS@5n48|n z=3Oic-L@c79^bB+If$h$ zl(l*8jB8e&fMcuhQ{v8KI)+~b*XS7Z!mSVTH6IHhv85c$viVX(_~gaf-nZRR(3MKk z&+7Z5Huu=Q3uN(<-meiO`q=-z_F@pf2knroN?jT%%)=}I%`qcVKk>XfUKD~zO%Fsq$lUfTylDNIN=I5*=WxAfMN5dByq3nnKT0jWIkZ|KEenBY$X*B>G>)%?chf^r)5u?5`p)6wUGW{My>J zF8I7%<+BZ*XQ0Fswgm8Y$yzX!w%1sMy87ps=8gTH@X;rlbFRsz^$T?25qM=f`d!if za${TvQ)aG#@+f_d{X`pX`LD6NFC``ug(*w$YQ{OMFM4%3k9&LqTXgq;wbuEnTdF^J zrP<3eg{hI~6h8ol`X5mEYnHZH)GPz^uag*Dp6&Zsz$;@>?tx>)zvIsc{d;9)zahY1 zhpnZf{DVYMR3S_d`@8&gx1Jxl=&|lQi@&@NB1fypIT|UgXKM5W1eZ=h<}knbU;&5X zV@8%)H{lZ(YfK+2-U)SDvp&MvznPfKjGaS@fr%(6YOK2k5{+V5IkF^1u@63#Y$G<# z1dIa`c#|S4(#bgAjSK&L*R}0iCXVM$+jWA7fz9=XQ3P}PpeMIiZ z+}TFKf9%SS)jZHg$Zbx7bo#d7j+u;v?wacv{{gNupqQO~6Nz$M>D}tXbuvF@HIiK50DVb=xw<$kyMq9UFW+3FXXSjSq1=ev|krwLieU zzcr2dSt)PYZsZlfcOD)_Y)f6)MwWy2jo<0asvX#mKiq!m;7zwD7nm!;_2dXP{3n}n z^F)-`?I?@jFsQOCHA7c5bEzbq7t+P{-RhMz*bI+yJ-vF8Eg?-K$lAGnbhIukwaA7pyR>Be+A!Y)2JCjEQ;cW%MVu}{{2qlbhS=X 
zs%=hwN0jwU-26X)HyN<_^t$H+d0!Dje+QZ6U(&sW!P&!I8_*j~q)E$9eOMiHV||BF z{*HE|7pe^RdBY4^-D6&?Fw)EQrYj}ZIMgG)Pkis{X|40HiuH-fQghm7auaD`Do=)5 zV+ul3(6f$1kCV!~Rqs=7Bc3X{s8Q1kmY<-X(skeju_xTEX6 zKQL>J5)DVX8)MFHyteQ&5q_}1^`HpM)HrCg=mL%$)Xy!!$Sz#HBWiMdC`Cg+yt0k z`tMMjg3+7ndbB&TaVmAo+#|VrGvVz=7l}*(9~ZK;!Lwo(qq^FA8()t8T{>KXZhn8@ z)?I=OtA8{RzQ2(APrdo-0ObO+O~RtZE$xuSFU#HNcA8!HqI@5i0QeUf2A5oQTo?Ib zurHjVV`JTQg$n-1jICtNeg4lJ{#@PW%#l>8ho6<7&i6Jo#s&vMCV3&lMe zYH4mI{aNoxFAenQEkW7M`Fz{ozPUvqjT4xvT31)Y9IYdVQ(NJ|H>)9eQ&w@`+ejn! zQpz!OweZ|?+Tvh`|BM%XG&ydc-D!uH^yoXBery(8_4irYYa@vAbdGyI{(W^_?afC+ zk}RIu5GHRSMGr*KB?v@(Og~bRL zrRYU5Inl{BYH;2vQr*5-;ohVwzJQ3TJ zVFBvwh0wC8>`HswKNf+F|61C)iXcW~C5lF)It6a97c@iiL56u`UD`9FE%Pjyh$FaLvS^}MUeX`1W2eo2!-ZApsuFkIc@)JnFn6&j zL%@YrrSBa!hi$g3uUEsR#=53z4ZE!KxtzD1kl!kXbzYn z1&l2n24PxQc)3*^t)BNT@ApYAn_-9A^wN1v1<`h~C2!h74m2V@^D*IdV(djr{MIH2 zJE3z8Bo?xiqRf$YRlX5u#4Yp*^2Ooqxqj7AF(5V9+CRBmTw)Gc{3rnY8hBC-H?YmgFa!H|03`!W%YX} zwO1SOIvhtLY2~i6U&(dOjB0!gcCmpO`n`(&{ey&9D9-R6>Jd;XXHV^5n(KXG=31HP zBBz^FO{i2_?HiQM_|80mNyp1F5%KMNt==rka{lR}FSkhqmcga@r~AG0#tGDmjGnc8 z_xi->lW5EMh{n!*{tXYE1fQS+P?J2RNqWF<$L$I-b+4m3eD3ee!|6aSTv=QA3~5rFiY#x>Ar-Fpzx*& zf%3c6#7?xjuI7#s1t{m2!G=1Fpu zR`R``%$HY+zV8JxX~!RZ4Jh(;S1@)|Au}CHdY)(<`z2ynLUxVf2yLipHrG&gF_Hw? 
z?s`A>g@|^ykoHj;k?n#q7}NvB`c9P70@(3J28~LAnCWIqg=#NZ{p@5fIBu6Oi5TA7 z&J*YVm<&X{3`mJTd)@gD7_mfah~C!~rO5Pk+2D!~8K%pm2%24oWHf_j0|+=9{n-;2 zx?@1HRVPq&!aT<-EPxPGh8|eQX$d3-`*I*);WTF@k4^k4gH1`?E8pwP;;O_!(l^z) zDaWP57tR-IG|w?w{h>e3N+k#+Q+0F7Qxb`eD>e)sQPiCWut$ZyJQ5{UDmy9XtG>OW z=}l5Oo68M?7COn0!cdKNxA-d@=78lNT6jA%w^h8Brn3!nVjcfnw;SW+q>9{s?bJ-` zTE0f0ZBvQg19JQ#W{BlSM_-vWV~9}o7?tfg+ai=0rYE>#pt)zWRG_A@Sd>=Bc#%)xjAb&Z?#!y zyg6uOB@QTi5Pw*mT7V}+V*}lj7>0^Ggb+vvQWnK!31{iTD~F2klJ(#G-Ah;^SJu7U zDGAhfNi18O6%XE1MkKS?dxkYFs~s>x&i3-9yyCwL$7W~IU4gY9bX#sX zlyH^|#{{V)rZg=aIYHS6%B)Fon(M5ENr{rgOIW5e@%ieL>_2wNW66*%BFJ&C+_%Tx@mk;@b1x^OcQfW)m2IW#_A~JwokojV2Yekx9(L@7M z;^ZsfXky-HwlX1@rszr3T&?8({{UV=p}u*q%?ohSLPMTwqwuGTyhGr9OYPntocT`5 zZY-o@O3pr9a?YLW?I8!*6Y;$9?oeG zO7kxOd{pq?k6>u*WH#E2ITKjUK$&sDWgo{MqYe+&y)^x6^Gf)7l&0y_R%rdF4T`0R zrmE4BY4!|p#eDhuNql_K{{Ux?hm!ciQ?;J-;qMw~w;mkvR-0?9X?I`=?Jpu(p_6G0xkdzJFa#WMMr%wI=)$}u zh>tWA)opYvT{_dFDA1QI?W20?e=*+}{yqFi_~WnLYhE(dCHpMpJe?{DWEy?4$DM*W zOBE$@eqsv}NhZ8W%y6N38C6tdvk(9tq@FAE{{Z%f{{Vt@_@~1?Pq}g*KI)xq0c6c&!_y6NlJrO(CV!dbQ}8!C98oSVA0 zm-%!*4m?-jKOJ}$JvMI{XcyWvTErh|v3tmjYdlAIEnm3?oj z>30&^URp}CL2(0^<%&c(Rz_2mV0i>|uj<>!zYV+-@jt`zcxT62MYgf0xn@}I5HX5E zr38^T-ik#GcDfMDo(QkT&xk+pO^+XGUl26SBjNU^4gUaz^;um8p?$mU8h@7?$lq#M zqu9#9Hpc6;G>j01$R`|!#GD2;5~rBjoWA(A{(fD51E)6Nd}bC(rVHO*S~Pm!?0fri zYyKeDbl-&E71HZeyS*YEH&=owE^efei!*((RFknr0FhVa{w_Hmu1ZN9@n1B20{xi& z9ee`O;EO`FivIv!F_pa6Cxu1a<+)&|lN@>bs0qO8O?@l7Z&Uf#<2*@bZbg8UG$_Ptd1CFmw{h}-?GQ;=i@&HU)X#; z_@5pB0E+Z`tB)$$_C5Dny|YV&NYz=haS(os{;@+HrGQkgZT+#oZcTr|_qwmbFNU^S zhKa01!Yi$A@*{03#70T8ibW1X@~dz_1ZAS!Hb{Kh{0*7Y%vN2?cJ_TO-@nS&I_dF- z0$3-ElWN^9t4{@hQa~oa6qyv_OF>o+JAog^9BJD2p>5(R9LV~lS28W*xxgi+1Ubk69Fvj3&3@s2#~-$4 zrSZCT@DGi&Yl!?qX>QVgYFJymntbu6LpuWBW|e?Ex9;7P;gy+VK9>`Cd5p?vtVATI z1@Uxi<=gQ&GEOqX<~3a!k>qb~xBmcwbl`CB%jQ00_Bk-z2;olb#G~(jkHApoJ zhYOFk0Hl$X`|F0}lboFVh5J5#!8!a#KZs7V@#$l;(rjic2A69vl1ulA$lWZ8K1+50 zGKC@7ADAy9ziMfODLFl>#ldloYlgwR4LYxt?S0py=z3N7Q!TBESgF(Zmrjq>9RC1@ 
zd?VqH1?ceT9ud;5G<`bXK3%n(D-1xMrwRs1?oTJZL*ow__+P*t9=h>QjCCu0CrYvk zXS%s(b%F@Mau*a$11tcyORoe_wU@&s7Sw83)3_o{; zETe`n7nPwOKZ;NDZ2X#aGc5BL-cRmG`G4UDz)y+)030<@ruY|0d0xX*tu>;036rvm+blbD*QS41E!fYhz6;zOjymV-a{YSg18xqG6(MB2RH+g z42}&anF%bm){(4eU9q{yQOh<4N#xevwXW#i9Mfmkd_j9} zr&wC9`R=ZzSYe4rcXuIFeNBEN#awfY&R?*kxha1t-d?}uvHGS#!gzeTKH971so%G! z`JXC&(Lb@b!+(!9jj6{Jejo8neq+b0S;BzaGbzGdpaafL_+r@-4tB7~ufh+D-vquR zd=AxN(f%Q6O=j0mB3j)>M7DQEAf>#SMhjpt;gwHbI5qu7Xxi41;w>`UTF|c`zp%WG zWrF6~MV2XJag}9sC0G!02sO`m-{IGR{v>IC*q%Gk?6unL5y z_z$LA>Ux}Zz9I34nA1M9CAeR+7gC;GqOoHV1Y)JLx$;Ti?H{3WKN#@tFD|u6dr3)X zwrTmj<8Q~%_^gYDGn_U%_fq6bchRp-r_EosH|?$CF9qM+{4ekZmw%w$-Pp#{>eu$~ zZF3i%izaM{W=8@x+RxP_1a(^cZ`Ji}TVK>)Ro3tBbt~C$9^&q2o>s>LF66Pt*1xFV zjK2c@9{h9g-2NQ##+@9@(8&$1qa11W`x7BjUfNey*x6Mi5LK~;1aV)Ici-?#--$8& zMsE-4n$>~utCpTEUS@M~9k_4=Zvc!dTwwjxV1y0Zmpez9z+6v|;c(8CX!|OOYTmrx zJ^n|j#d#KIUlHvf+~D?0r_1#}KKvQre-->f@hz`}d>y9GscN^J#{?)_Y61gD=NL%W zBXgbxKqej#(cAS=;WmP9F$WetI0OGyHaz7gR&Mfg8 z6OXz`1AWrd^7!@^m#6P zS$wnW656%SpQAu?b$Q8E!NFMLEC?>#01g8YgYysVm-}ODUk9%4ei!&FNx6$%zOfR2 zZQUzJZ*G(2b7}?*vZ%@5xGZgs80Q~{-YD@Oh`eE}&*BdiYIl0RrE!lo>hflqSK2T# z%t_!3R-7Z@-Au|c#p7kmN$Qf`U*x|Z%&sKkIAEsmxK(mBrFRSY>;t`U{``RzzZ<|4oN5Zn!DlO4{M$&)vdG- z4cJ_2dX3}kg4WVDmRT6|3aSbGdJgy%`%U{W{{X>1G;ao!@ps1rwb%7mi4DGpRcF)e zi>_PF+zQ-Gr{&1~+TiWqkk$IeX`SVKJA+V)O5Y3?@A~T5`Q8UF$LDybH>-Qe?2n7T zX8!=#YsWts=3RTnt9Ri~3V|YtWQ@hAT*|DWL{Q9}L`LFdVc2a19FN@}hu;gl3-G%@ zgTsCW(e1RWNKsMD5rvXsx%uUAH%P>iNn&{>yZB*>-Xw+;SmTXCs;I)Mfq($)0IpwJ z@#lj4H>&Bn7Qe1(`gWtNOXO+xmp3uRHPoNoi5-JEpkDnSJ2mxi&}93DOuvkF}BT{}MZ>*#goSX^a%E2V*-EWfV359RjvQ`EJ4 z`;Ajqx|;U#>HNv(xRIrnM?7Vn$!0hJa(WC`Ti`#3-ZA)};yZr|co$2TRn%uIxgz0V zLcx5=U!q2&^2e~?6UBY?WB&jI%J`=zkIspx>pEm!7mg&C`)V-lB!J`?lGA?iDzM2d zAdhJb2|0{@w)`RZ8Sv-ej)=N1fb^Jc8KXqHyeb;*?UO$|(w~{mNyj)H2NnBfb>dC~ z2SF^l)00{yqF%q`epQxmMmrT<%NV(0o~`cv51Rh~W&Z%!f5CqTwCFYe03Tb~_?N}* z(a-105SH6ckra)Vp~pP2CPo<~0x^M)y*#x8Xy8+jQ;!^1;8~_+h|DX>zA9If`jtDjQ)B9t7!}H!<)p}c}sqx%vmgDNs=fg|) 
zYk2JaeAkiUzq2px$>V>7GTQi4$CA(DF9>UHlO0HX7(hxu!aT~T3w`WH%91jw&GGScm2iw4C8tGW8#m&KZhR}^aJ7l z037JHw}Loc8%wV(e$`{VV|}!Q6B{oCby7gw2;!qZ%=mVuZyM2{Uy>F{;Yp7J~8pPj68ed7V%e%b&JhkR$u_Tx|NjbEYL13HP@N#Z!T^Hwd8iz;a)>6lM?RuED%D@9H}kkjN|oJ z_ILf2J{|l7(Cr1at+uh^bXGSy^w!9?EY2{?5j%`g0l-ynQgTVJtm3X9;rZo|ry6Q^ zUhGxVj_UX6s{a6heC|P$=Mcxbu5LQ}&AX@NYwCSP9D+p$A9!SaE0*zx!|w!qLDN66 z{BNPyYnEen*sq{Tf^Np`rGY9=21=2fS6Xnm#c_H+#IJ|`A@LpWgfvePX_wmOoEY8d z*7tEqbpTQoi^Qy1c9H?e;G7!#I-j&s=c1mP-E{f%KUk^k`BgfW*G+n#jNczW;GLc| z@W+WX`Thx`>)Ix*bZyPG%z-D<8b)STl0rUvbxbLKTo5uFc1PWRvrp_v;2(n;Y&zeI zZLNGu;_1>kCb_wZWwx@BdD&==4lr?<2;4}>$3gckQq*-D9VWu#RMh8=*3!}^o_VEU zk;fchvokT^1p_44(_;9`HpO8U3Z+MGw%%T^`oB}>@VqCM;js@D2y5rE^gJWur|jAA z)8p2PYw)&i5MXs@F8ZG_0hEmG1Fguli+Eua!ayhTbtsDLcxABha;zXVv)~_t> zd?^&7R@UL0Gf8IhI-rUjoLuZcT^r<0!{%;9e&hIK#hw}Qu8{|bd?Tva>pEr5OgDBj zCBzbKI6GYl1cER}V_g9q>N0*U&2p&9jS1O3EcLVQliB|ORyE9ccQ4EwYSoR}>bm-# zFYssJKf}*~`T}U)4AY{D;p0TS)Lu0gccwhyrgZ~2$-o!`(!HE0pwg~u=W!T{xQed5 zc{xe$-siPL4^D+OPLi@iv5F|9Cl!gXw9?8t)E75)_Ez#;UC6OT5|?*!6;)JZl1~5u z-o6z0)B9J&YWIE__+aZam@!GL{pgUNz4md@$o}~|bDRU#y&ShI#AB+eIC$NjZfBce za1>p9Osu~X-u^HAd+@Kq+jg-MCZn&&fuPf(Z<#}Md5HUo`kuqM{K)u=@iWCA5+rmosF%IGx^FV#=2tHxx$0Q9N&M>; z{wd^&NRaGP&reGIr-%4E50|AW7r$_TAvYs25M3H1JF@iDD zu16XVl;8@&(e5RZ*A7{BF-@nZ2e0GSwb@aelPLonr)v8}1r?$35`50=#MG^=Zf0b| zz*PM6#sMs&wtH5tlMS2OW+UbWBq@Ga52>zXP3B)W*~`SChep~6?~r+}y^2HtjsYMY zNaGY(hOR+p4Xjgn5E9S0kYADs^d`H#BK}*gL+pwH@`elKmgDrUCu>|4F`$Tp_kqc; zPqBi|7zMYR95T&=@{BtibYq@#?OYh$Qt~|f98)`szm%0q<TE=8 z#4cS69DW@IUK-)IvALCyP2>UriAmhUpuo*^(tQqTJ0b;Cf<|la-YmFOTxaI)bJo7G z@GF(@wx1FyAyo9|iunV~yq+6rk~cqWU!6h7PLyJ)VXF-(Eb_9HcPbf_dQIvy)n$)|KgprowNC-D!2Ueq<_oC`?YT-%| z!Q<;+N2xi(epSZn{uH*pSGlnsT;n9D`>p;J>fy2W(xW&k;rkvs*#?{St9O={7IJx4 zF_dpYDw!D->cVPD=;3P4%!%7AfO=Bzk)%*C5CflF)in-61VXS*`aKMg{SLJBp`p?j#vv z7p+evviYkrg$xD{9Fb5=A>PahE6hqHn%CI3r zb}Rd*^sj@_{{V!lkG!AcO)A2Q ztbX%%^);5n#z{EOZgat@uC6}NK14eQAor>zmQjQ!Bd%-UIv=vZX{jtCX=CzsuOQ%O zo+(N`?Tx`B`Bg@Fp_E9h6aj_={{UKOmp2LXU=U7o+Ltk>DXhh1D{Ub-#&8ZY1v6)m 
zBX!A)w<-^%Q+2tLOu*>8V?FCdW{%nfOO^yLQax(ouG2G=ENBa{IV%dNk#WNu3dV02 zY0>J@XfjHTs5+Mjhbk0x$sOy$z99TkyRp8TMeyqza-$N=cyp1^dVOn#_+>5C*NtuE z4$V8qCLHoX$6EfW%ToHCEB^p2e!6US(++ zwm>9p6tel!clI^vzYebEp89)#D{7=;2R}Dn!o2dpSkJotIT}cibCto*^{$WM-;Cq% z-nDgmCZFcVa6l@<=1{ribJIEIxN}uGVWsSq+B&3jW5HreSC{Qq z>V&Kje8YEO&Dy^!sSI{1zpzn!-H+H|TQ0)kmL<({chma0>NCp?x-^n7MM08DzyiK^ z_<{RD=$;=*bpHSkFWc|AUEqNnXW!I{`HRON75rP{Mrf|Cn%>VmTflZM0m$4(E%?`! zcW{}WMUFVbe8q{zF=RrK^HHNaSDvRvxcx7wVq zad2min6b|}u6nt3TD4t8v5ic9D8<4|`m-&Z8kKq)l1--tMg>LExLxg`#E{dwJOPSA zsxVmKlYle*D0vw|h3dx@9I@_nIQf?*oJT7qVbT^D+A=FRTSpwJ%~_ip94xUB77Lu5vBz3iOE2!`V$Iz4s|=n{1QOrQ ztVUKv=*#TG6cR-7C*1%5d8o?uUezS%_$4{O;Aa&TxOZRP4Xutwb3hbsoW~O}A9RdT zpjJRndWuVVo_2XdDxNXBEA_`}S*KfUAq9(l-M`khi<3tr^m>j`QLw!#CWcvZB9O+% z_^5%I#^wZPps8S#I}zzktL{bJ^egNdRm2yz_bqLA0|$y|+iRnag-`PbgV2mu)SnLi zJwZ9Q(R^~t6!O294xob&Nt|QOAM(kG<3Duq(>Sk|-sLVXji;1ZqsCd+0|z|`>(Z;* zFWTBzfKmY}LBSQ{=GhJt9ZmaHY4mGz>iBnw@p)B6>|NmZi`e~3j##FRWr@|%fFYFR z5Pb;iPsp!?ejRxC#9t3v?Dx_e%@x>Y+21V)>6nT8ynhcvj@9;8hWu5b__I%4OGCL? zqAjt3`CT9Ttf!EFL)$g^7HN~#z|`fQtGd|!g2?mw8J!hUmXhlpsOa5G3O-#7^l%f5n5+hVR?45gi^&&ZexL*068S~1Fe1_{@0(j-^7he#agGs9|A?Fcsob5 z2HNUth-9+W?qx?IofwsSc$feb038#64op-@;Z-N?h1GHQu0s#?@HZ0S4sBVoQ}g4E^Ey^YH88r-A+z_;TCB z9tF~%(`>Dy5sivG*)U1nESTLRWS&@)U#;;ci8x#>K3uX&bst-475cv~y!_iE;B00h zT=?b51paQ{q4V$T+4~^;Jopo)TWYi0L#%kk?bR;zX@{Dq#fvi9EQ9w++jcjw9G=zb z9~%DvXHSKH9W=ML@xG4~nu^3@w$-2lEBkT-gxfF$nGu|@jrT{m?%SSg*>!ziOVBjS zooiaUips{$C!XTsW(gcoMx~wEj{quyNvz)sc*nxtInZr6f`$hvI!96 z1A~rL#&F2MGM*R%kzb0{&9e&FmzdP4CsO|a7iaoOqb9Er=B=Vn`*phGw+M|U4abBZ=oC{BQlWzhdtT=$4mX z6@DCQI$wru;fh%`4J%N%7S{7j(t@pKTlaAQ9D^9zfCUN%O8cAkko~Ou3HvkjJz6Myo#)gl>w)IOhJQXeo|N|>A|nS&y0T@Jah5S#Ye?HH@e>>ahNS_Vai)u zLFPNEB4-4R`%9mlg0A9lEA<}^@*L(^g_vQb?CE!U^LbmZuS4_xGR^YkN?2-eCG zyzlxSuHUjp>`mY=25DMf#D5!0r}(p9nPI)Mdue?83ldN=>?(Fn<8V8e6;c6QoHc%w zVR3*livDE20DjZo5x-}>aL=I5bu9|%9MQ@p*A{{Zmk;4g-x@ehr!u_^g&bnARcd37lt zD#spp2fMZvM)-tTDoU-4}D9P^5? 
zIh>vzish%ah>>y&~v@ytg0%Nlf9>o*fBNK^frJ^EIt$8mEUf zd(8u0wU)zC)8~@f=Gt+#NT!L3y0G>m85sw$rO|vtuXtBgxA6ytH4B|DQM8aeS2nV= zGOjv}$U(sz5zvkg6~7bkHWN69*C{m#^lNE(yKm}ak?}?|E1%WsUQz0v>(jCQb20ev@BT1q``;w;$TxCv1LV!(v ztR#sdtDy|60R$X^4{&Sw?>EV@nRRbri;cTJkI|p9WqGa}A4}Td-TSVq)RP#dmCI(7 zkOogm_}liI{jod^@FPo;Nz)~RT=8|nvfJs=NUoOb0Qs*96478VV22FDrySQd2QROV zuN*cSJkos*jBYlD5{s{hm7# zh^~st^2<`Sj`qT6?rr5=(aRHuM;@xn!5>QgRK6(uUh((FKNU~nza45)UEE69js<30 zTW}OEA%`U+`BVZx1+kNpUfKIQ{@0%oz5w1^>5$JRvEc@tUhhVo^RY-F&8&@+103$a z`51wb*1t@|cvXy!586|xrB|$adMDr1_-sFjSZU(DoZ-t)q`y{=-`|R#2tFVDWbo_w zYsdOrSN7mJf@}r3wUNdQkWGTow@l-PzzhiDzYhK+e#9Ol@W;m;8{uDvyh)?aZ{mNn zTxpWuU8FY=K?97(JZB8k`D__PYJ9*PpgR~3r@l1*0KrQ%r|{Hv9|P=X8pYf}rN7l7 zUoOeG$|N~qD`P(~Ax1hL-;#Qti@Z~>Ytne9S=DcEHHjw|Ci+{4Yk?}9t1Ow^r7$*a zBWME{#d`b;!@2ev6PG`;gp;yr`t)|Xw!eAj@n&0ii2NB#xY-tAF|Kw_pJD8 z>es-Z8!n{OG*Yo!S!)G%+*v9qRTJQEomlM!*zQruBxG0Z7LlvzTJE0?t)kxBYBpC3 zz8jlKQfOsdk(6{StTG75Jw<+P#W`MMg2lI)UCHasx_>{@q4wN8nP4$kb@dx0w_R8D z-1x8dw*9as{hwoh3+TE9-R8f4VfKqHLgEs-R4nXJZ+wF*E(zGZ@Cg~O!M_)H>&5>7 z5cS9LH;uK6&0|z&F1HsmGZ=R^{jjb=!hyL}j4F=8zp9@Tcvr)oG12ZcFBEB(x~`ve zjFQ{k#Ir_2sZ+xPgM-v^I~x2${i1(i{VU; zyAkjV@hofx=Gs93Gm`2n?mRxua*RA(3ee@M{?pd$ucniFD5pTKqWpwQ+yq8+~4DTP-J0NW}M6(y?U4!C&7_j9EtWy_b^5Xe55s zp)4{*e5Lz1{>nZA_ywgc-Kr+D;%k&owd(W6x!$?r-A+s?^AJLup5)in#jrDq_>L^b z<+!YTs^yU7hhEM1OQH4LIfTaH@R6sEpESC8FJ7QPt{cZc68r)1&c+Q-$J+IlgKKjr zf+?@2Sdru)oC1A^C!nvAKWZP_gTg-pWxMcahM--0#!Ct}k{o@G%*-ESPU=e*U8=?) 
z6qR6dEAYGGN5x+r{9N%Az9{&Cs>`b1Ny2VBVS#V~NMau(N^z1D44zGUd@bSrFEmQN zB3{mu+1pp>yLp~wdB#|56x|9Q{ZGMqv--BOzO>YBA-=GgC%3pok||?g$mqi=%0~nd z$Re#|0nL9npR;f6jpN^dHz!KD^E^Z0dCGa3ZPe~w4WU#utc*yD2r;t-KiSR*`qTD= z{{VuICH9uS9{33q(8+fJ`(CrCL*>hESdp^YEyQoY<=Z&j@_`Qgpzjs#w2gB^Hl+cON9HtQS%Gyb%e?>@Ach(1KV;9?x4|C_wB0|){wkWs#a<`6hC8ckxG_GT zXp*R9Ho%ri9C8Hz0L9gIgNpkz;t#_g1N>3&?A{mg#*=TU*vgF>i4b{^?#4_qV|-1( z`H+LfepAF5?n75Nje2-~XvMx~w>QhJyzO)D@@$_srT_2ZeMU>%a8FZ{t2JsyMG+VAHwZM&VLHuTmsL3 zD=p5O3qA^!!TU#+pD;U=x7|Ga$MkdH2kgJ_=imm76goYWI)=R9HRiW-61CKlZNU+- z1j87~!XX`b$t2g(aaR)X<|{6ZIK$dWuXRV=aDKMZ-L<*k@b*KVVR6n^agF`!>YwAU z;C^;~*Z%;wSBf<~QvU$qZ^EIdcy89l;(JX`Q;{Ch`ryRDO52>Qupo6{Ux#dbqz+H! zr-!^%;vWlmj@!ll9n`LDHLFR@%s0_W#&Iz@W-P?+-I5(!D(XtGDo8)kr|lK{DEvD8 zp>&09N+r@fLuirBZpzNxMFUroEzF11}Q)6TcOirsw0 zLXq(e%K}g*++qO-FSLGke$zj-&%_Up8r9a74A*)OgcP8IL<}QlzyzDJhcdA^I~bH) z0tRwx`V{adfP5qH-@|sE6!2b~2AQVXBBDnM0M1ke-y>smjzHy@mOagUt@}^@#Xbl4 z@ofGf(0rc__`(R7M|}hdRq7{sYuPK|b%a>DOjSbot8# z8{P$x7tH||Y=DOV0iC%A`X>FKzB%as0JH~yA@R16r(Wq~NY?jO_mMKHPP~y6xY;rl zD&dfkh5+HR2f#nGxBL@+4J!Bk5>JnB73`MR0j)GkIgFOBu|7lHNwrLY@IQ6{U8IK2 zc>QIt)9iH1NNhCgc&#mM(PN4j5o3-tQGlw70;mIk0OG5U^PHm_QAY&})K6Kh6}wxn zujWSvWqFnkoBGSN9?^QgTl6jfDb!LEftvi$A9yWsUO4ghhJGIClIy-Exe!2{Bx*NH zEB08>a>w)?O?jWi{{W5Je}HW#(zH)6Q1J;*-P@9Us0)49l>2qqIX>0-m*X!Hc(cY@ z)VkM^lDdxDMZ3kPUbXv{s;Ri_8Oz3VU5c3?%^| zuTzYSQ{lMD+(5wqsrAi%*_CBfGWuS^gl*^5pMhri#w#$Z?W@VZE{o6+nb&j&%kQ+F zYFI8IodA3@at?UrtIIS3UHot1bM>sdyHrNp&9IY^&sz0QF*}ytc(!y<#GmzNAY;~$ zM=F6SWs!)_CmeoNg$p&Lb2*GaX8YXl9C!R{R>DMUb22}gM>!)H{3`FD6))`#)*b+e z;DSn=jy-DH$1mB+VTp|HTphVT;ZOQsJSV3aCd{5*`rhlK!k&6^;`kawoPP4!+V6~5QUR1dt9tih8(!JUN z6qbT}ISi;HJJnk%SPp~UxN+Mf(98bU$huNei7xU4e>OWX#c}q!-K3M;SkE&}cN=A$ zRaG`IJ4YUaf@@<%y|+y&IV6fy^CT!^=0;LZIUR*mmr2nhyO#RO1YmMqp}{4GVg-57 zTArM?3FWyMF%T5?>}xhV75f~LY0|8C4APKtH~LnfSanNRMON~NDsrW9T<`XZ-Wjd! 
z-dlBryp$3TEq5sO{vq#OYSz%?Y}(PDMAM0yK`qV+01N^2_4?P=J^|d`6w~IE;IbV4 zb@67Rn$DcMT+oNPW_L0;JAlqO$6;S*_zGDi@Lrm3kVhj%OD1whC-ScoJfH3~?s_;C z?`U;58Ip2$7F-N|6fj6)EXV1)6y>&bB4pgk2Oyv2Rpo%(5G723)0+9n$4j2fmqt_6 z8YQy_a`}uh6miGUS0~%0bXvA!0Z8Zh3d6^>chJ!sHFvNMl#B;@QAoooMQeL%V=LIK z>-)gK^cd?-f+q8%X8WWJRAp5dZex?$q%w%myD(K9Njy|0?{Z@FGPSJ^%T|mf!T`(n zTy?J**0dY#B6bdep&W-&Pagi2?~KvN7v}(uah|o0cr9)rc%&j#ZmKyr`d6ig#!3%E znxzEwK71<{3xG0xI@D~e>-YYE^sc{L@T{?1eXmOep=IS5V5zPaQp(#i&Pc#Moom{q zUNcuXCrgxUQJZOGmPhj%PtXc*n7jFEhXiLMu&CpTM_}09f(=-e?*xO&QL%^4bAwxn zS*sO)d-kw{{a!ga=xJG_-I#V9edE;U@u;t0X#+TRQ^3NUQ{s{B;y}@69CG0D57WH^ zCtSkcK0bb*g;SC>`%4BmB=c4-OU`51rAKD=sn=kCz#IGq*c@9jpRop z3~l4|uRh1QuIF5~wp5soHm*n3xNYg{Tva|JWPktB{Ry^+pxv}CdSe{bhIZZ*DaWlg zIN2fEI}8ev*fPczPFuZx&ik|Qgl6i1i*tdR;Q>3&?BgcA4^v#)xL>`FKDFmJQall| zD-j{?EAej-Zn5!S-CwEwKj6eIlhpqJ8~l-AN)fj9runfmm6dV`#~d2X^T|a8cNXps zJXBFG*55M~FUqO(uY$WjVNukzC990IELn5FZgEkn%Oo4-E09X<3yz-RoYP4Xoz6C% zz+l#7mn`F8JZChm)Th}N?XJbCWefBalif*ndSH507{Jas`kIlxQI)MJy=-GsW8~i* z5BRF@f8G_<{286#@#VaVrT+kNGT?U%Fl&_Y!Ox1XbLuOt_%UqPemu9gR@hqJW4qI6 z2EVED`7f$cU)_)7e+Fl8<;T}g`Q`bu?C-NkB=aOjjDj|Y?VkN{Rp!v_q7qJIb!mQP zZZccg5sK5eg5DUjJhDj2xZ|FK>sS`{@#MON3vE*4AdVI0NY~eTy zfyl*6bWkuckaO1+R!tTeBfz>UpmMSgrc)TDWCPq?iv z?$r@FqZD(FE75E2Gv`ZCJaJA39cwh-&prC++8vylzM~+>wY3H09;AWL4|@7@;cx8Y zuU_3IweccxW{fi)l@SH{f~?LM_Q)d@$C~CCe7=jTikgpQ)%>?QvfQ^7kF2R-VC8+^ zMe5I!^`_0p3&<50I)k)INwI2pjrdhd$78EJLn?PAUYkXof6vK^&}8za`Y? 
z9@PMyd$OFAncwr8+T(Jze z$3Q z9G_ab3jY9S!t&fN?m+a;N$HBQ_S;q2v9|-y1EqP@Yc`#aZk7kx`_c)mZtzuPGK1Nb zaZ>neR`DK>sxGOg$Sfqczzm;uP!HUZGlo(){zM*W?Wn}5D-~?Qp!ECOvw%HM70p9& z1g|B?l2&;>@;h+9LE5~`RT_2ax^y!<)%;d$Zc;?lfX zG`g0W&xF5Yx_f0k1b}lQXZXJG?~&8sb8Z}+8`i_?9>?qX4;RA`YB;?m-=X$>6`!YS zmKwFJciL>pac?NfOBvj(J&3FM74pwVW_`MPTv{lifrw59Yrwube$ihJ{s3r6s`%34 zSGl_mwmN;Hi7sxP04x*|6#&S~NXov1o-4%v0BmpC(^LJPY^`(;4(Jy*TIRWI?_;Um zNB;m4t9DbhgB*sIR3A5%xZ@dM4SogqtHk~!@xHrt;(r%vGwQmv>u#Rv;fp&Ak`xY# zi=UNL1zy6xN#M=^shZmtikB@PE#LVc4aB@(3>%gg3$0GRU*>yv?Fsu%d`11BHJKY% zx|2`v;z%0LLWV+Zp#T*RCr>s~80FhI+(Ll31DW_M@$=$OzKi5Dkp44nBskECjP?x$^`=$4Y->Nk?CHrBT2WR_W6;Z;;*l1+48 zI{1C#Z-$!Z{3D(_(XO<6@}6SFa~WSTGmY`L-F46LDI&je{>jTJ&{<^T9$RS+{&wHy zeo*mQhqe0bXDhF@`hUPbqF>qj_O$fgd#xV%L{lB02_&~|%(KE6 zkO(`#!F61W{YXi#Vy|__b;GOGj_K{XW#SO+Q7`?lo;%<|{ip zNZMKFox(*FYBJ2sZ~z06PfGlj{ilE6q0mOM_&4@`xml!Tc<($y@#SMAjiSUc!yInR zuDBT+27Y1jkL@x0Tl`1(^{LCGNM-Q%h3zMqZS;s4E-n~NsOnPz_M7R(aPCI|P#v+L;TP45Gkq5n@#YgJg2UkwonFgj>u$bV zo*qTQxV+V>mHpU%H&%ZzSGKoWb>!N9qXd?>cG5IcK{^=aibObMbYK}-PXrN=YAgfD zujsS(mi>*q5Boq_uBm8Y@b8M?W|4JEdt@f>DYQp-w+$ljxP0zVIxynD3O+G>6!G`K zzZ1>j&l=cC6k_4wv(yn7S>G&R67V|#xNTraATZA(t#)v~9APtbsr)s1#r}`4=3&Gf zEs4upe-%IG{{Soc6{onA4#X;jjdDUL90SfzHT=dB9A!=qt$qIh z`z`+6o+S7?9lwHpBQ4K}d?v1m1i{){?Kut?3B8Y*0xmHSIZ!Z}&f4=lLdf%6!h-sY zuLZmojrrfNr|Na^hHHSq*S0D;y`|gw?tZ)cbN!mW8vK3Gj+x@CF>9#ARjoCPL`A-~ zAd*x$2Xt;3gCk{0JFpp5m+(vD=j_YzBldmPB)9Q3t6pexPGZyb%XrV(E|&$AHVYVL z+74oKoU3dui@g5;NZtzZw}k#7Xcqn`@NTJXt7(@x^6o8U5-vwoIQy(SgUIx)t!GBk zb^R{uMb~X&y|BKC;JCJ(cf~An0<5g3f(Sh;@=hP)9DYqdtI2ayPguU5ouA--iJb7x z6EvgkD85V49apr`(t2n;Q* zJD6CTsNA~kvm6N+6y%Z>M||KMaJU?!hOmxJsK>0Eyr1g*68RBd7nx^xl^J`_ve7T= z%=x3jo(u6`jvzi(lN);rces=!YFUX?1oR(?;AG$&38HA2$0#VuNGijS zN3DNS-v@pOd_VXl;g#^$f;7mWEQq(4?uu`3oim5HlmO0h%s~6Ale7x(AKHiZF7S89 zzdOY`G^?Qa(mQsJJHS8DFG_*st>OS%E3_z;K4nva90d#V7(W)Uv&Xs>Y4Xr+%3Xb0 z{{V-v^%$QIaWKOwv}C1Ita^Nz{P1l9kwyh~zY+cz_~YR>h#SK`JkqDq?#YnbnUsXN ziZ?0d!r#P&5>75cR2iN8^oJ>sr+2Ictl#42%Xz3V`s4x%p8*RU7~Su2_z= 
z+VS!Z63eor-V#3Z-MwF@lJrNdn(-DlGKyGgdDfmZoY2hD;^IYq` z4rUR??8aNoG#miY6Z1(5_lTYcDTDI2W&2P5-<~1y(_VNd;MCISz8qqUF0zJ5bsOaX zmsHNvZwoO40C|q9$yIhr{L8>2nwaBm4mhrQoDG#!$NvCm%9`h0KFjmBa_Mm-L(|5L}5ruq<69rNcQj||ZP(*U+|2bZ4bxO}G#^{iFx z(JfnNUR9P>$KnHstKvST;7i|$b{-zSpTjzI$qmk=u^%NGPK0EJbW?!t ztF;MG#1&Vhnd9E*2SN8&{RsPt3$*n*8f2 z@m7u_5h_^tdpJLZuAWl=05$&r1p7?S!|Jr@drUOBq_Oz-XDXX9)28ra?nn>;9W^3XtDOI3CgAQ zp*;lff=cI{Ff5Iam5vTSm@RWa*E}b!U3hcEwo~bvrIQ7@v${hZ(K9G;8BZfB!H2O1 zw>%N!zZHBo@jZu(d>eZ$mZzu49yUfHBus+QN~7+vrEVKaDrYYNmV z%T(9HFJE5k@jRS^i!oVb{;!BmRMWFhc=~^l{bW}y;{N~{_)oxECC7<8XR6w0+Fh>S zHtOEcg_Lhr)97eO1wuL)b-KuDp-kQ{AT!k%O~=YXw)kJ=-L$+&j_;Zi9> zXMJ3~S5xxmh|IH$Z#)}v>20I_UZ?1n>~;HE-28ab*TViM(r=T*`dlOJnw7n=Z8GTq z`DJM%8*S!t4(P}TSY;2*v3~If0=Qp?{{RoXA@Iw?mfjNZK9>fQrdt3+P_7J;Z3@yz z-+DO*!{~bp@^f0jXYHOq`HU)Iim#A zfgLN%{v-TZ@Q1@4HLo@MwKq(p+t|jy+)mwssXPx>I0C%*!cumMyE`X1!Y)#FM{}#{ z`gVn6M20eJMo<1e~Tk%)LtHUm{y3$)Sfh~|2x49;=HKS>9WPV_d zP7vccC$|}_3r$XUf;i=sq2v;C^JBW2=^3peNbRj~BW!%Cx%rQCU!viyLfneUADCj4 z>-*Im+2>lGn=Py>D8eZrUw9=>YNW;!h@I2{@~Grz>5BC^jlIU7D1EmGxebBbcKX+z z$!1nVG#0l25DK9Ls5$G5SGPkO34BL~iNQ@P#TnmZj#*+~o6Jw29hfhzP-~q+Nq>JF zaplUQxXEu!R@Lm+5vVaRS5i0prH?g?Ce+_kM7WJ7w+cyMHvWg4iu9G5Jk+hDA2Xzu zj+ZeV(jA~DVmtS!bjVCkOg4)(>$;x0#CP6uy#^WnMbJ&Lo*bE3eO+FiQ5Y;%WpILY-Ur@quyD4>r~wFVc# zmOwaPrrv74rM_KVZ5fZ2v2IiY?=>-Q-rhyDpHJQ#a#-Z!-nyZ&$oDj))c)CJB$|7} z_GrOeo!K}YF^c+^;i{y54%0R%8>7o|TPHrIzE0Gx5#koQe#=KWGA1$z`d8K84jx#1 zDW|-JWJZhSz`(B$IBVFuo`wabBhX#Pm*xfCgN$OcB)Rfs`#C2rODkg_dm6BlGr$*zD@e1k1<%U4R{?gBf?&C41=()aC8;aYLF$oM{8~* zWE0l8dhMk2IB`1w7^O6v_US?CS{O<#D;Y9Ly@}il0zgsJRN#4b>}EqJA9Y4^Q3;>S zlHNFWkU!QD$3BO>7N^N?Do5N;ZuM#xS0hw5NsuVUcOHrdHDca1WduT39mpL1R8P6k zsxVRRGgbb_D<~pWUt}2k>XyN&BF1Kwhv{B>;%^A9sLgq&ffdYo1~33wKE9uYcMWNH zv5_2~Ic^z2;Cu7USB}##i-pSP?v4rkYdRUl)zdPDQ5*fnISN4N zC?-UnL^jDinnJ-y!Ek+h^U|u=tQO4D$k+hjG31Y>OK_7|YUvr-?XGbi)e6{S)SBk9{=cPg)qe5NkN?vB z6^X==i4f&mk&s98=ANvi@Im@iBh7UolrPZcrh*Wwl7SmM;O8R0Ws>TC6&PKVwL;5h 
zBdOXz>*<>FbEiy&r+IwfNhdYvx~rA2n;@KE;8&Wgv%r%_ytHlebNn^<$B1_R(O=`2rYoj-qzk_l8C*6hd6{-gLh)cRj`bsqCmm{}LmleF6O4-9oVGBiv}eiQ zH~#>UruY5fU6;XEjeK zpL(tf4s+I*Gu!;O#jrRS9@NyDR*ZeCiFwU$&TwPZsMw1C0RB9r{{VKpYEhPAJxzNp z{{Z}X$?hw~?p(%LjP*6^Woqz}UC$M_eReGP4@7t~Di&;_y3pD;#c_B|#jJPfQHg z&F6XMCZ1)NZ0qXd)=S0O>iwRu$jdf039sDz5A29|f) z03A(IdoeVtGdqH~!iFJu^&KkWV9N{$Gkr$PAS-oVG9p>=w{HE5&$q9=X64bH4)QeRxMA{+m73Gr zM8S^T(wqXLIX?L5_|y^FEC5Jdm<~=`bxU%vM;iH1z9r7$owM#b8q1Ylk)=1uNL`%T~JT2k9t$f0Ke zy$^Bt*2+Fc6Ri3jCxiT9uXrlrDYPqswyh{@n@2f21L(wBLUOgejr!E%wdjb?Z)rkA!Pw_pbt~n)2$@9vAJ8@X&&BtX#>S9 zWUH<@1QE@A_HDzIaVgZItuyR+e~c^Oq^Q?!RNvv~ewYKATKSLR-^ZJ4*%L$YlX+J= zl_S3^vMEawfgl42M;!2{+rQGV&XXv3*fO4kSLZmq4Qw>sI?Bx-vtaSmaa7e>61rzI z;~xZgN5}d*=pHoDthK#8+i|zPg=Rq5BXaHsYhaPMVBnF^n*50PoBsd=*6}m^MThK&`_wm=d(f#MlU$f`zwea)cCV^!hoo8>Uc$p(2^In=2`&806A^fs(Vvh_| zDxbT-CcQJ_f5GpE{{R>KKK>o??voCoZc)}Lh}&~+WX9%#Ve`bt10_#58O|%zX}tPZ z1#Hh3R~7Zzl^dzv?*9NT-y^D?U514Zt40fxw?py6O8)?YUg~jtL{`~21Wu^SiT(=~_;3X+xVQc>YR`fn0{iOc@Vy^@I za-&0=u%L4XDPlUfz;^?^V<1(8{#wTiMlXT`c;aj6byR>>1+kg_jo|E%;v) z-XZX;v$K7YHEDGXHa6u|$0m4`85?MeFrX+AoZ=f@ugVz<(KK{L&Bd!`@V+3FHF2QiYtEu}08BreA-^KChL{fC5n zN0e=Z%_Zcg+x%bW_xC?A;ywuEZxxp>SpNW*`5%)X34S>INBAkN+v^?%)vhhz7g5~Z z>83^&SHeCDPZ&5Mm*p%lz+(r2kJ5h^f5A_*`@aRr7leE#b!p;Dl$te?>U5IY&9)CJ zP(p(-+)vAe#?W$ef%!S&j|=#N!J5_AhkR9GK9{LkOz=%@aJyVQWUFBGUYo0#-}>l$b{jp+viQo4DY&PlyLxsypBH%7#r{6j{{XhU zYpvgFm*q-}i}~7T3QlmjI6>^74{FBJG)-T`8jKn~gJW^2U)++g+SRP)yZB!5Ofvv=&h@U!5a zkp{WpxM0=1L3wDA+-uAUl4#flA|+>#LXr>*Di9lL=QtSU;*KiA@b(rEr%zY#q^z|3 zE%=*cIW8wTuKA^+_@woIhshtaC;Su2&q$Wf#{U2w65mpuNETQ;I5N>$%vo7Y%!=D4 zWk6M<89>DH*ap8=thDc%Rd4_Uu&K6e85Q#X0PR`v z^IZ4~;3U@kJ>jeUZue3%TETY$gKMYFv*eQG?U403RN09zM&+;PtY&4HXSmp@zG=zm zwoB8ZKW4&UaCtrz%9`bh-SvLGPigVj#V?273+y7)z94JXdL6yPh~aCe!%D@`Fo*$m z+Ff@Q1SmV3025tak*#R{BGWDPjSlwTP_VmCGT!FeMV1**@wr_IRvF`i)MCGs{{V>} z5`1a#Ys8ayyT@9LnvSD53zqpKi3dcAG4jTv(Lnd+y|eaf{ir@Bdu}M5&&u_e7v@+v-P2VaEcUQ4n@)`e^*2koiA}lv>W!mm-HeR?06-oS*JklQhV<_c 
zTxr)@OqzwXcMWBF5L`j#4Z&mx%%OnooQ{Mk$Q=vBlEzX+46FztxdeI|_}qhvvDr0u z>|Nv5FK_q{%^ypf@a_jRqNdfQ(o27zbNK^C$j(9ZBau(9*uUF*_D1+`rRX}h#$OET zcAgm25?L=b3p=?;x74Ja*KmMkjC1ZR%8|z2Yq10ZcMr!*spWba{*d9!`i5CQ0HnZc1Pi#40w;mdd;_n{3WK`>e{8*A{$#oGDz6! za5z;R*pQ>S75g#!F@M27G*1aM9y9pe4W_xPJWW4PBXuxiJ!dc7{n7BqfR3gpYGyOF2l8$j1t!B>M{f zed6yC@mRS_3zoi0TVH8E@?VeSe#ygp8A}Zq;&FDA`*dID`571f7x3n*qPK?iEiTUM zNhqvk+vP2klM*=yW78#ptib}@p zM>zur9>NG=kzW~<=XsV*T~?td7WHZEB>Shm`=3dV%P={e7fz+6Z7#Q8KhXX`@2s!% zoBNF~R=1YU^3}{!+s=&~u}Hm9(ST)P+!31ePlewXejxlD*MH$6_oDq{>zQ45R?2+LQ+5=9TRfMjW;~Q&1ZKrB%WH73%zHgT_D#2Y? zmR;mCF74QDp-;;H0JEq36P6zh>Tlvdj}2w3&21?A4wqvruWSkwVKlPlKQJ-erV}19 zhTM7&*rDPMO^nN>EPQ?bu(j{$<@q0&z6{JT*x0HsOX)7&zGvEhvj@lNzi4j)w~H=3 zJ9VH*1c?h7etTQCDtAk3F9p+bkU<$v0VLPbn5wq6&{@Q=TEvjVg9?$TRZ-{w;8A~d zC6$H6f2}eh-bQPOWK9ggIi~iubpb+4}Yt z6)DkCtv+cj-*e_4+C%ni_*?Ol!n(cZh^@5!7h6kXEM6utB28)JfJ4nDQ+z>J4jHkq zV;})voIkS%?2quT#{U2kwIA(|h#EhOth_IH-|;8YmPpO|CN@nWmm96gBr?UgWDLF- z1LEJcSM7E2JL3+$sCY(OdHgfsIA)Dw(PfWi%#8t7^Df zt@u}0v+);(?`G5W3ki~VCxN!QJZB^7GC2o>(2Q5^yf?ym#B!-nmMPPem%AP9r+wA- z(doZ)@|^EI%PM9nOAe%>^>=Fj04~2X`o_{cHK6DkJYF8rH0w(pG8Aai&fXM?0F9l) z%TO1bobk?iuYmsmX&=}R!e0|EA@L`|!DHbqWm;CewMaJDUwMSS?=CW;DT5R9uM7?t zaJcI~VvpKG;xETf4&GY$>%+GTE{upD#Au5Jo%FI6A|V#l3o<6s!ZRQ_+`=HpAMVWV z=D#C&jK?C(r4>6>i`LHR`t*16{Lj&Sh6^>otH$)?wCLBTWAlIZW&MFXG4SL5DvyiN zSa_peaU0uOG-gXH9n15fNeM~cIK#3GVBm9KqsbaZijjb-qY9&d0O$v`I1JLByw{zY zXE@BtyRVF!ll&K==yc_|ZW{qh*Q;xePdg? zj?&r%EYBgw&;iwn=m6x`!5<56;-nP674l zUjl2B-*}HypIg_blJ@%GhLTyBxi}<%Gr$8U0B|YqY($MHC0u|r)YD;0lqbyj$M>^? z`B&>XFAQPuDO7&+&&Tu5Da7OIa?^aY*H7~~`><`}X#*7t)QZlE;botClbxfIdskU? zXl^1gg$x^mlZ;n0eWFVO?@~qvPTp~j)$U54RCp?{HsZWUV=3x$`P6NCU>iiw76uCr z0LQ5!gniL(-ndcCM|j9>%xpJq#}$>Cvg~f?nv^n@g=ENVoMVDVf30emjl`?uOz24< z4nR2eHRo%oT4_$PPQoW6A)A3#Zgsep>N}}qx_68fi^)>R3wnTXMId_}B$`{!WL9;C z;4#U^1a_`M=ECyU>RXAIV-QtX6UhEm*IG$;1P!d}O}x#83I+fJryi!Zq_$m3*evU? 
z9&p?ceQVIeV<#4ro`;o)!YL-wJfUUOV~xwZ6~=cS-oR8*{kBlxNbe+Im2QWc-qh@6 zvW?}FGDeTJwsIHKHASuFxa#kY#+`Lo>Q%e&=1V9q_p zaaZjA&v>CO6tgj4Q#t#-ovJ^x!tb<^mgIZXvcn~&>0!$48CFra`;2$3E@asm8&RQz z$pe5&p=_KA^<5&(?3UM2xGZEY2dOQJSGUMSdeBERMyw9bKqImD z`d2j^%cxyXtIc^Oy~VKskdx4ijywJpvlzS5?8Dxa**_>lZQR|4F~K!<@(U|FL34Ww zd5odKOdMy_*6vLslc`GjJKa(UE@1u9g*VRXK@tPdH*sHOd^kaK;9V|qy9-2EuHeCN zdlAKag&mcoX5Rk*O_MQPsau5&m10M6Ut;_|5nA{!O|*(Vh_Mi%PXJ`si=0LK3%=*4 zmn+oN>86f03v$F^a5`kwi(pSkKaml7^wvH87a>*wEj>q1tF=$CL$h$v;XPWRX zne5AB4_4eG0G>TDT-GbC)!>p?mTiSY)LaN#4Amj?TWP4;DW@IdJpG?)8iDR{ge4B_JLiOjRQ8Fxrcb$>2 zC4)9e^b`QMA1U3tP6yogeiE8_vL9O^V>~}L=*jzl(ag`&5`g>OgaXfGmts6c%yk5qX4g>K{Cr_0;M81<^Q(8In+ z=ah!PA(xC+rlD&3bn;o=MdYfUp|=z7SW_%wQqeJGZ(8+ZCAv9Yj>YBsIDLhbz6n-- z_^ioZS!OZB*>2@JVxD4UfhTEH%FIG1P6aU~M{hDHp50%DW;h2P^cZT z)B0t9oq1b_k``ExC!rp-_{WG%Jjq-6SLV;@?*`*+j#vJ7`6D7&*=1$y38w8)(xzo~ zC1v4-RXN?pabE={&)1W^ixK0BiHP4$Qb5i#ie@*Qfl^hK^(;m+`Bk{%`EX8ZqJ{&C zu!Zqk!Wyfb)tX74CwS-nM$6Iu(5}PamA$ps#0zMymva4@;EZrX1>^OuKgS>PHQ#6Z z!}(Wf@HC?QLbrJaRV^-D{{X;KHT_YRul6cm;a`#b8=R}=75@OoFU_B4+{GQevzV2g zP-7iiKGfB*n&Q?7!!lzG*y~o(;yb=#Q-R+VE6H&f9$Z_q*ql@Ln%MKKh~u@E-ZhOU zB1Op=$Itcu0P9v%vs_OmXI;3!z!)4=$yVMeR(2VD=LeDxwKn9C<2fOKBR=)CS?A^| z)ES+9pRq}wO7Ux;#5OwQ*R)^rWhc<~uN#?_ceC)SNhi|1+_FwEZFfF)Hr_|0_zV6O zr-pnzW3Bv6x|_s$!`zk9^xU5<9OD2W!sM|d=ExQHzl^kx5O|-#QQ3G}f9#9d?SuIl zP3-}A*8NMNWMgIVVpHou027pYv*nE;opzQ-bhm&5RUN?O7#?G+uDnfKWSVNVN` zxnn8v!hY;|bd-|bZSMTKb~{hm@50x%-weD#;13+iV!pf7F5$UGmvp8l067Gc$rxZl?pr)ZuI@fV16Ti+7e+p}5PWu%$^0F39Iq@GSsHMMM-KHgZ2!l%1Yr^ua` zH>XwD=7xJEhK?TrPv54k&eh){zg~>%JQb)3rTH??a_c0Fd$}Z&UCsQ4TX_Ur ze5ac8--j`I&3uc<0yER=UW2GcEv@7&!tFJ_F3S?*9hHwMRJ{&nak1^-Rg4O$4jDt| ztxP4s+luU4A31xwfsIQRBA5v$K9w^z2TbOoW)a3rY60oet|l$$K)c+LxTyA}iru2i zJgq6tM&LLel?o*8zb_}fNovv%!azX>1RA3xhsafi-~c_nGw)QOx>av30)1dD=!XP)lj%&jgAAD%|nfq0TFFXws8qiCC<8k8bbwx>OO_>UUAGVIB`pO0)f& zVX&l*%t^?{v8_)mGlOtdwXk0mKgr>j(FqWlT?kx>7V5C zS`6bVa{2eANa8YLE&l*UInUuxUffw}dvhA5@_7Rs)(cHp()nZXnvFDQo8(+)zB#UP 
zOY20sc%s}HhECvlA6k}6^|(+}@OlreWnEiIE5<{V2-zbad)l*7>}$zwM%}|D(z+=C zHa79o6bL-!5v#K7;kM(NyBsdSEPR@l#@b74nUs=PmhLgynvXNF9MOlo%FP-=(Lk!l z%{zJIel@uSf-qoQF;UJdEr3g-BgpEHz(L3~`0Zt~l1X5F{$b7};{(#Ra7xE4>Nlz- zihnueRouIH&OeoAZ+I@^vvx9hnSU{{kOyx}RsHuS>0AXU%ZE|e`ijfdG}X9~B$n&T zWj`+%{5YC+ofg zXf0({ztI5!?XW!9Gsftck@Ao39z6)@UU_8#La#J(F&q*5mfuaZD6?Hk2g@3?gevYmSo9p=;}vj&oRnj`CUa1WoVG@` zud8X=C4}0Zt#b{XqzYPD=aFNL6nAD~0IxRqTk+4~f5tmo`@e_U5^Iw>s zvR~}6@OR<|kM%h<{{S25zAcZ!_wspKC7g0w{{U!3Zsk_yQ7V(;5b)oI0yFb z8RG1oj!`Ik9!SA^(Mi5${Px$+@;@ohczZIdnMaD1Xdn{plH4s(%{j&Jz;OA z*js`kwzP?1g*Ot%YA66@i6_*PTLj=|(!UkWaa1uFx!1$fRN&r`T|V>mEDaj8C`OfO zb4e{(;GZ6UXFrD@A9N#Q;|oz1rD+7Irs}a|Ph!#)Vs?gAl`^9!`54a9q?2FHhsF?Uzza|R{(v{`d9CM9b}n~4+yK*ti6=g z)Hk*I->0Jd&&RmiF~Q<-9JFcLYk6&dm+HV|8v391b^WfsB!15?Hj$>m7QNxSxjfl4 zo4Fb>9J@;Ix0nYpWNqBLSq|P55=rv!0(=qpx$!$nzSn#M;cM+@PSb(&?qE={mtIOn zT(6qoa#b=%exp84+m+cQLX3=2gK$GB!3VD(j2in~b{j6tV=NXS#&+H})jyW0_Ue4j z8n!b6<6f2G-p`}|05kfW_(Acr;qS)J4#(mT0_u@nA1zwy&6q9Ko1UpOS;~-m^dkfu z5ni%YB~p$9DHtGtLHsNEFz}C!{A2L{09#)W_#auk(KQuNpJ=m{qh^nAZH%RWw`YX@DkYbLej~J62a6dai7q5ymO7Ng)BIz3b>h#1cQENXrS0gN z=SiF_dU;uh5;P?g7LlY28 za7DR-1A)1TEZ)G7e#5{&GWV=ad&E8;@TQwKx8iM1(@E3q_W9RWvn9MKyzNlVPy0 zm387ZXNuxo$Ca}jG--i@BYe%C0)dnHNq2L5s9jBceKeOhHx6Z*X_(0*j@?!nlIrauXL5YsCJr{^uUhWDCVVdNC&gb3-FRciI$f>( zw9rc&a?2W-ty)o!HU}dy$QT9;2<1j=`PUc4Syc1y?GLMSMSPomC2gu8PP=ALW zKhtjY$t>V9MdCPP7*MB(?{=!7Y<<^WTOTmbC*-{s_FMP|b@BS^z&9Tkw9B6i#cq?u zwotb-Nibq9mhgX~7mws(>e*o4SY6L?zR?2eWB5Bquk)BgY$yd$eW!v6pZX?m}N^yhUi;vjaNQalD!w3!AXT<#JL z-eCX&6;D5wMJQW9CqHT6Z=)K3spM+N4Kab&z{wmXB z@f@S&vT49#<0s4A_k7F(b`zWqmFwz5FSUOl(p=ojboa7M?Qruo!^*^&8fR_oA#CKi zZ9afUu;-YdId zQ}>)l;jLT-4dUoj=22eqdVJUUGvjhjI)*P0@b&5Px3#tV`+Dqs=le%~$$t+&XnP}h zsUgyQO$j|>J>~fN_cFBIA5GD$E%dD>+AB*-NY*GIg#t$!D8N+%u&p2nsVX_8ju@Uo$=48UB5ae=y+U?uczBHQVd;6EXy?37CQ@Q1omywC? 
zS-Ca!`6mij#mCxK_hb2=A)ax492{Lb9`w$S$A1w#OYs9!I+nd~-dH$>);1ytLF}vC z`je7tl333Hg+|Hd$jZtI`68zzNoK`Z0 zjp0>nbJIa_KA&?1=}brt8yVUv9R}Le+8CNP44*DvKQQcS^~4vll0Yj71$Op-L!P3j zJne53VZU%8``nYe9lo{AV`S8ph4^QV#_6qQh%Kap0OgO(?VRGf2(9hqvBjJ3by5_& z9B%ELR|^%@uDGaBGJ($0Fl$)|u|X6BfZf5xAmv0h`i7#nO>2D&r4Pzjs49Oh-&%dm zm)k96dwVx88)I$&+&xBXXGPa+t`XBaWsSD5Zn*W$Y+Tw%>}9u*lQ17EISMi9k6P)W zRnvDz9wL&dqdcC|Mv~g$uWr=FUxkmH56-q;1a&@R9li0^yNm5I-|XIH>oD3lJ#cG+ zju}jBh{i<>aNeT5tUe)3@o^YAQ`F6ZSn^@CLJ`k06V69!^v^2!luZWQ2!J{6JxzI2 ziL6+8Yq?#9c<-O*Ubq)fZ7of#ZM>)fTwu4Ttnu#ZEgEJ20Bht?f+2Bm(D_PajRetT zkV|`Wim!8F{h;UjLECwiSfgWpN2wfR>0KNamP>gD+S?O^#8LFet!L^Mwz@^v+fqH| zXTaXXfKRtTE6;}Kr!9&%b~jIV6n8J1QaHB@l{o0x<$)gDR$iqG>azWU3ysegRg7`~ zKI0W~au2T4j@0|D(Ojcip#^0-%5Yeo%tdU~&PG(zwEN2&sXo;^ zO>8!mLxF?b3|HDd1%bce71Ohvd42uOe1ob^(MGXFZ1T**e%Qb+N47}M=U-L)D|^K7 z9+HI8j`TyQ|g*c^;VesW#PkjFLH75C!Nv*5?~S79w>E$Zn_7i@0Es8D+)@{AxLs zo$&z(st!ot15rs33bHXQ_`yA@l&nfB$>dwj=F1^M%Y{*ZtTEBEO&)VPLkwyVsb}B} zfBNd3{Fk?~iyK(cjAdDxL68UIQrs-Iq)f=tI2%DupnW=Gtvv+dw9 zJW#hKoL`9ay*F5Nw$yF0xZqqV!vXZ}E6X+C4UG~+GnaT-w#LXt)?UQdxn5tH3~*f8 z4ZFBJ^!3e4sOg%1p%ROTm&lLijYk}T(>3T(#JFg4%93ZzQd`}mVb(a(EyQ;$KVJ2f zBC(L*VN_!sje8Eg;Ym~eCR$i|VUc-+uif>P7H5>-VIj?4wMJV3qG~;KX z%jy7&86*G0b6?cicl(tu@UO`q!kNeX zj4S^Dj$fJgY&OjkIy>(NAUAqg?Z80~n8Yta+?-WwaukTAk)B*ZFbVBb=JWyhlluUk3}5k9ytvI=nthACzi+;4G|vcYdQXD<2c?}oF7J}od89zBKLMm8Nsp)FUXOCQvm#Po*LO9*ct2FrEq}FdBt^5c7=6zCAOzrYa!x%fjqw-8 zXz%q4eFwod(%n3hqDwhkfP~sk20@+Nbf~FSMAnua$%rkabDV+i+L;>59ne`Fe6TVZbAo^R)u58b@eP}~>$qU& z=}{YVBY@li&(^qVS8_dCcxc645ty3LMFp5eh;z`B_!^dJqYH@{e9qgB4L#gOSangr z=CLH3&AB7bRv6(5u1DjF%~a`QSD{8{lTXdvvy&lij6*IDrYb9<_ffo;U`Q7QK;r}4 zR2L|Bc`=Yvspp}nJkL7k$b{}HaKj{HiqYzhd1%WJtYoN-NX89Wca7gv$BXCrx146Zhlh83Uwlp%raWeo@M zDf!#;Jk=$4a#CwT0IQz|YdH`O-0EKfF)wo|Kx$#wW5A-o5@l5eJQpY zQ$lVd7lByef)2d(tfvJf%O#;uoS`mgyEEDT7W{kEY?d7(#)~z@u-c1xwT#Kz_ki;1 zJx4wHud=l5Lc>+Fi%`>UBf7SkcSkHt9T<9oIIqpw z>y?{HEgiG4Yl}I*Sc%3tPw zh|_kC{MQHIpNLu~jcpoBGS=5o9kW`@K#>oq!6BF2ab4&+uY;T=2g@a9eyL8RD#cC? 
z>7N~b(Vwx$!+(zpKD*_%hvLB*^7UIV7wrtFRy4ap2%0rJdE!L_3g8kC&7ZSx{1d-V z@b&yYGWg=q>UY+&$@Xmq77|#mmgQ1u+;cHt1EPGY0r}Lo%D<~5W|{yT*SkZCvsySe zfTvaaH}|F4K8W%vaP|gx#f_XVCHH#UxeOMz_K-(yYZ^lgN)kBG1yxlT00F=NcCRJ) zyYbWEx4|6@>mEJT;IooQJe4g(d)HT>Fm{{YASG5DY2yT2It>rk6f)vnIh zdAJeAx!V+JfFp@AcA|g*B%HBkKKBOjT(gOLdh_L^p0;VpzSimL&m)PrqB&ajG@*Kr zzKhq)`WyC+{jfeIe$ZDbVCVi3uMI#XMba#1lNT!{!~U500(L^bE)`I9+n#IYJp;sA z?}fFCZxDED`r}K~tny&Hw2`5jJ;Q5lbs&s_LG=}oze@Yd_8<5s@N43?jU=1mPl#>q zF0CSVw(z0zE#$g(Vyh!gb_r2epDq?Bb1w1#+TXh3p_gP?T%`(syNcN>x9jWGv+{aR z9h_sM&r|o0X5FpmevtmgzqDt?4~^OsQ25`%b1l7;4Ilg^7QjVgc{~9WqeR4h`SMxJ zM4)a0Ag{W>+&W|0vh?0sr@+-#?VqspZ8io5>+JzE*oBb96b z;g=x(cla;<35}&$d`i8#@t(HYkAiIGj%_;D7ct)6sU(3YA@MhXHH&Rt!caeX1A@`FIY|@mM&N=9umllZ!1b>D7ZGN;Z7JbsRFo6Z zuHEg~oLIb<0gtI1O<1Wlt^4}-M>pY52zY0~8U?q7yeFpH>3U6)?1IwS0z`oFNCT-S zl20V^YvKO@+TZp^_%q{uB7cb&nx&t^{Yu2|ywwUKSlyZARR-=cu(XBSYC1BxFNWMU zf%hki^?g@B@Xn>JXu3@L?e&$r%?6un=^WQiqarkE3aDe-C!T{OfPb3*0NOL-_r*Vr zUNX5pJ@d4y8w-c~4~PC_hQl8&8CU}IWA8}ACnsoCIInYn^1OaOG4|MM+>ScRTfe)l z{PaA|Eyv)nxJ5$|Pn(-at649fPnM_8H&I6g)b{bplcZ9-Vpz#2;TW#v+!W!lk&q5L ziej?^!NxFuDr=HmKvrgRfD{r*{43e-zh{4qdRKu{;!XbmgmoCS1~0nc$MeS$1wM3< zee{L54YComocbF6jjxKs(s5XtiY@4_(!26M6Qzf%SDV>Iw6)Ui$oix9eg6Q0fq3)b zW{`X_FSotfgSL8pr{nDgYYQz( z1e!V1_pwCn)+wEGBb7!sD;_h7@qDXrd-WCS!nHhZ7Ec9NbkdY;?ECgOXG(M`rCu97 zoz?#UQ~H+pH}E^*zrgpMp7HCJ!ARVgw6_x@DBBSp&XHr2>i+YgnwwCjUU_5 zd?bTMyz#Du1-LOZmn$5)L=dYG7HP1J8y%`6l2J^ZWmlX{xU465aCg_>?he6&!{89y z-QC^YAq01K5AFnaXK;e+zZ%q1)mh8y|C9ks__^Lj#UQV2LMFP6Qjw<+@lT`m}wS;L^xInua+z? 
zqwol{-@F%ygK(5d_&7zA2ImRGN{PJI$T z7p|2*mAusNC*q$KA{MOt8+pU67UH?x%t*1VWrv?MYoB0f_$62p7l%WjhA`yb-{yGb zc^hGbc_E6x^}%sCC}3ax%Gr$Bm49jG^rNx+J9#obn|AcLQ0J+rasYI|>?iC;VGrj= z;=Z0MPpK;*in3-V<-zNGBWwf+q1bsNc7OzOF@7FJ3o(Rw+27X@;y$e95ey}qv}1)j z0F8R%S!LkCpo;vWa9yYIHx zIz$ID&mC9lv|Yo)xF86l)%n{#DLlb{6nzSMfnNQf5A0bkhIa@T4=@tZ`7|>Xzlq46 zWXD7gC14XzvyUF*;|E|t_M!gP#>-6kZDJfn=S^?V;P04GIE9Of<-{q9!0kUe&MAEs zU-=&EA&V5rOJ*nlasJAWcVAdwSALOcUIwqL>X9w@^kv#>S!BzrdS82e^lk-&iyHhHqh3O94hQ?Oz`~*t zevYM^=NTN;;X3o2-><`k#K?YXWMxWemUv$o%K~_I zAxo5*1#5I={`8ZLYVcBF`Wb`q{4WgEv#Cu9sy7Rd{^NTYk2=xKiC`JA?FwM z0YVL3`8Ho z8DsN8eN?$L=|XjrOtDx0Xdmf~QaWw*0b^@R7!;OJPhtWNwpj)kqCTAT57#G%P`^aG z(xf-~b4mgjbg9tq3a&Vj9iE_ruf6L8hj(ai@#0vHmcViz&sglQ;(YOV4N}Aak@T^s zkOBd%Ge6NkT)&q6fVFO-Oa~&YwQ5QcSlfvKHOU&^Mra36r9?spX8hle4Ud441CvTh zS4pyS{U*Oe9_9A3J+R1G@F7@6t@-8grA!OaVH?4fY~~FG_igf1G^L);w97iN2Uf5d z)|EQ5S`3H)%gw$UAAU6_!#sYJ+Jy5|_EQ6?5^qCweE^?54@=V_e0$iJ!m14`8m3B{ z;| z)%_Mx8?IPrPO7=|M%qeU$jboX-=o7re?p$HR!duY^wx4Gi6B{5RHr}&iAMX{Ln6~d zD*)mF%fle|_B7T+FF`P3jLd~zPRw}I5MxOqrR9{l5I|YqO*E<`W><|;CdUfgDK02R zv_^Uo9Qsh{EJfdO`O|vkc1WnAp+%YvPHvq1nHbUhGTIUAtIP535u(%BMqO2HequH) zkbFc^R@tatcX+ABG(-ESBf&pyw-m;Q|g@kJ~D z?8>x%s04LGCpmRIoN#4;2uK_$1GEynV3v5x)tXnx8(L#5+PcLI#^K17=}-zpbuY9d zK2}_MC}UlsZ5|ASJW}PtJAvyb;`adt&OcfUK+j4UvzOi=f&^04d(%2b!#^`ca`p>~ zx%wEh2l(!zws8NHI|X)hN@XV1OEePS`#qFlo-V5wuW)&5>1&gI+fVp4VpM_ zLYJ@9%HK}>><9g6;<=1qeImU9b96TEU^$ouGVk_*;2%D%tQ(?U3ygb=5`f6kxAbEj0 zwA;Vy)dh6W_I17(c?yq(aCNd!VaS^bE-Pse7ew^@_-KtZEU!-&)OsaJO#0;z&@ot+ zrO`Ek!JxHMo+I^5mi#_}p~D1yb-KG?(;ox1qWIEn2 z_k0L_IY);Hcq$&bnG}itay6Zf^8_9cYlOZO3O#Q<)M%Ca^=bCeHnOvRcDY?Rg64}^`d;CCKM*g6oCb!ddQ&U zd02;{A3M49@aXFmpgO=m6ezIdAr(cFp)eAkm4;zikXt?D=eRe>$1A{cdcAhdAYzr} zIV(VToh4_A9uJLrbmC0#meWi;uvw%#FR*o7?NjPD~o5WSDA{rJ*(5C)J|fz z{z6B`z<*^pcu?Rb*H^vK6C)cS-W?*KbpVFkrm1i~abBC>Vh4{3v0fy zN%^Xh8w{m%m_g2Yh&x2ZDRtAYX#Xo22lj}jdlW*OdW%qOyxO;p^c=f5ao~3JLq2zW z)&KM6Q**@+7s6IUw;Ld!B*?vk>aTP;`5aLIGv0}Yb0)sxYeBl&`#Jx8Ti*8AM!%ZR 
ziSV@o$G06-swq*~7bSLr>bN@(B&kf_JILk;Dq)OssrP_tsgCLi?7*eiaWuQe!n~iB zbTE5wE3oTE+(C#PhlqoXkG1d7pHujLxSHf7#e6V_LR!K;Q7lJZpBF(xiMksRC;52> z#tR0MyoI-iydTb~{{h6g4>|$JtF4BCV{yBWqskvJt5MVVyt}YgUOj34MOxvA2=hzo zE#996WvcNZ^?MkkyW!+;>W7EtiTeDp{addsjcRcL_HqIi%Wj~joc-zg69JIwYse6T z`)->%4c=s1Y*(|(&d0)~Kf!JJ0gNH?Dl(yPn5lMWOQJw%RX3C0m1^PjQ+?8-y^-qb zb*{?twOMDUL*0pJsjtixi97Zue*)J}T}Hn2piA$17}YVHwmz1_Bt!2O;C z7V2pRWsZ8J_nz6^)pHxpc>~FhF7ePUgJb}{KAna8=25h4lAzsr(blX}#IsZqi z6G>Mqdc9QH2rNp&<{@!g25!$!|Jz&6DjJ_SQx~_Y3!8L;=N6nmM(jLRR^uN4 zGZMC5{T4Vg-qcE=(uTVh@VD91B)M00RLA$~gx@HR6jTD8P_Gu!%j-7Jm&2qdb4mv} z&{pXi*c-CvBwT-(7-PCPPf*27Ww_=&j@MlO{SBO#@pO_TMKiSJ(bLXu7u(SWV6)Ij_u-qmfwD-;RzbM;DsR|jD#{$0OYP|05L6mWh8;!$;Hl>K zbY|`~wVBa3lr=Ihtj$upA?L0&Sv>Cv0J2%4?ust)`7tAC33dQ?lIKf-^3LO%NnPtO zkS&^a-0&*{XNWcLgeyLPi@CrDlYx;Qb5FU=nwqHXKnf8%VVi-PE5cCz?pFe?CZ~(C zfA6^g&o_Ij*W+0)@;vf4L_b|K=jm0{b&E@14dv3!DUcNHbM&lpUI4MCJWvYPp$?PI z92UC+ntCr)V`EabC`F2~)hj&aZ&|Ucodg<#QueHDDVFSAVyomm#b)rmM#_Y7`T?I^jd+*k^FH_Bt1e!(T=_F>wb7@Mx*kiQdMU z`LbaznYr;^zzpYMuJ-5FIj@?bMDpt&3WiM7Tro1;D`tYYX)SWoI)>p*YC#RA_*3ga z?P%JO1aEuV7|^U`TSwwaf_AQHoo|R}ru-Z4e*YMw?@6rKZEkJvCT}LnDSpfkw|ieI zGNUfvgFdzJNBE`18Fp`bB4p@$WYX;_+YX8Zsc=uOc2QcvFFQSUn>3{f;W;f`oaap} zbi?L#f1it&IUmmXP17ZZ&(ED)%BhkNK2WVcyz0Y+aBfX?Zm`5l49Kofm?k2^Coshe z%6MnlcWqHRPsOQM*FFhufsqGOXjjaLS1iU8zyXTnNX?&4;$*b}23g12c_cE3jep%w z1OBF4cKq40?mlAzfXKoyUpjl1mL)r z+#!MIkjD?nArYFE6iBjI5u(Ujl}alY&@#yX(`1TY*28BI|EKdL5dM_v?O1KCg{ix< zK12NUjNT|mnL&%35XX;PVumrl`mlL>@nrF0|1M#j0Sw?sIN127lzi0M-(VgC66N|r zi8+3HzDCTEhmoA%`Xcx)_Bkbr*>8EJWbIxp(|Ihd@`{&mP$DPWLm}&cCa{|N@ zl!y(b#l%xm!oxgLd;I+i7TsE%^5%zs0h@*KCq0ZYQ@Pb>J~`*@hSiTEg)hpGqR?-* z?csh%6;?**Ekg&2HVf-`GE41O*2nk;?LF}W$UF~2>@cL-7&j0iLhPsT%kqKIi##~6 zviD=j>aU(u3pLMQPTX}&RBWGua%k@3LwG(ev){))3ASe(pZHS^+R-KrX|LwY;Qpa? 
zZ>rJZleJ4l8i`hgkwD$&*Mh>E_Ji_a(a=x2#Nk4e)~}b3U!AcVq>&qT=+B57T$=8m z#bH7h`bz_Ycld6Nh8-gW=;Hhwe~Cu4#!J(AKb&kK8ylqSGlEa4Rqz@821sr91J`8s z94}vK!;No^#FxfmdUTNCkV<>js!O(TDZt&F(+-2Oa)qv3-rpp;EgxReq_&f=10?#< zZ?QxMO`VZYB+;h$ge{hH@7f)C??g5B@~W{ShJ3A;ddypZmEODkNr&Tu|BTQ+*T}wz zz`dLdc9kaxxx3r$8(2(k$iy0oMGMWrcqh=KCpiS{A>j+&{z+2wBx~`!GQPQg@?Fqn zmR7BPN!i!2=kk@2&epSLWFFr^RyiT#-ErK9he;eTfwfky%c2?7CQ<%dka&+}Sl%a1jtYfQ*-3bI#ln>6qDd*h}tg(Ed&=O${sq!Ve zN;@?lw%emKP4zA% z(D1mWy)*c#?zi0UbRFshDacA?(@cv)7axTA8S|;+hjwU$<0p74Qg|o|XCRzWT{_*o zIi6Q{J4zC8Q4HOh9t?xYZq$Ygslf`ovUGkI()W`WMf&x*VtS&tHAKPAlB1E)CEzQ& zYBw$|cB+0dpnC6<=Q)J`93St!svfspJiX-Ym*TMU@SZ3d)_h6$XELN@F`;fM=hJiN z6H>pt2?_DT4+cVwm3y81@hL3~qwVjBnLUxgXA>CL?ww|1{iihv>o-vRQw{Zt(8tg_ z*XPWC0Js3mQx6g9o9}HqRLj@oUdOkTQzV1At{9X9e=`!eka>*x|LjK#R=-}{&L#M1 z-ja>*?y%qbgEQYUO6hZS@biO`TbOMBVrU&}62k)A>J4A8-j%>gFR%ebLs}8f(#g%v z#k*W7+6^w<+mu>HEGFn*DDT&f%(NCP{K)cw6UJc)^pmRFhLds+pzX~$X)kS3%Bf0= zrV#<$(2He}r~5n6(tJzFzhJrIZ7YF~G{T5{UyPW81rT=zfvH2)=|G6+hx=>%q2mi} zVrVOCG$F8jZ(GTp^$+Ltf-Rw4B1jRF96w=J0_h9;xXJLV>Gc!zm1wYNOW=@E;s$kB zV(9e-(gl~Kf9Vdk!j!>qkd9^+2PW#On#MbE@INW+20Mt8pZt%7PMw^gTZPqUII(ROV=}Y>BYOOcj-Vm>i6T) zPP|yb10ybh#3Bd~C;!NPX8TK~B~)D@_JZDdcm}~GbP*cm^x7AApp;QZDxf#{qQB|+a^^3Ja$OyK)F9N*t%E+~+p*z)SYYw)-AWAGa=N!zzKO51E_3%hAaY1S zP_SX2R_e;=>|bD_>*&n=BtD|w;)+2c@Fn|Q(%?eQ`PU2bge`!Erk~2Lel4n#&nTc* zKxC-LVRL@Yz4*0!}V8RM{=|!NC!U zOh%?FDEe0!>NxlEF(};(C%G-(dGG{fOJhr8B0|xZQGdS5!Sr4Bt}Y+g&sIa>aP{ z7af>N@DI+xF)FfJ!{(e({3*TA_98y&&sG2r{kEB4q41xFpJ=s0y5o=0&LICv!a%hn3EWC3} z2sTH5P9e`==$!aN>5|C&Q#dn0NotY#W_P0gttQhIowQy^zqgoOYTgHD44{$w;f$g+ zh7D})!pove!cZe%QG`>|c#elcm!rf(3FAmcJvJT^I|J6$#xaiw72zxt6-5q$F@707wTwY8mKPO-g;Ic5uX-L2q$>>uLRx*<*Yd-t0bnlv}wTj#pN z>n(p>^Rpo{d4;*fJYg=aK#?h<2yI-6J)zY?f{H=dzmia8n#xd`Gv_sub2m-HgYEjR z?MMCdCSRRts|R5gnI%hLXtI6*tcQwY(O^^*YPy0ARq>&Qrsv z50(UHHqL^*W9#v-{n(@zSjJ|KiKVWm!d$I1z-h=C|Mbh1Zofr{1uL!GC=j z85J*_mLUCM#bIWAm9N{u;CPHJ+41%e?e-aRCb1({elP=TW9tD~-lc;!FO23o%#PdS zWWVJAdm~+*&*4H7OL5QkNli2YGUA~#3iv{Tt`i7Cu&06A*^t4A3;A+XC~r_>0Yg2k 
zKTX@W%=o1r^;d!~AdW&39U@008)|<4#vA38K8^drg zFLvmiNu$5{rgD=Kd6bO(o1)M{3z@`w-smcF(PE}q+;qLfF|4rRV(PM$&(mFO< zzNloncnd8zV^YGCTKrnCAW)YUFei$=dNQ(KmC~d1Jwbf2EzRUbUCpH!Lgs| zCT%DqyL8LbS_t+zLyX=-2Fl)yb4&qRf>7_Qem;b!MV$whm}i))*@_$r<4*9=z)J9qrAn7RtjYk~em#g*smGj0qq zLj*HY@+&~0EfG$> zRfL>O)x-#@x#Y+W4kc*1D#ujR0tzcrE#CLuLxnAf>qv~c)TB9R9G zq&Tdvbx*;+zT<_0rTYIAXFeCBEwslmnfjh{UMje9Oi=)%&>rnNPcGDx9Wv$6(q}pF zS&@rL59iKZLXuqGo0Jo&qfiSo`(N_)T_fyAVyjix*(dA31_ z%crSB$Br_qsV8gbiUdX6i6t#a2MTv^uQ2oGAJAja7E#K!mEgVwW9bo8Sv~CRVa9u) zkd=bJH99T6-RwkG@teGXUd{}w=}w6s`<*B=lidK&MC&3xPFdWQ!PB9@Cl8;Se-R=r zETgM($uF4q0ctOwq3}|ZL+RYkv;Pulm$MY}heXW3Yd?$Y9RCZLwpanL>7MX}BQ9e* zxSi~WW-{y{P65BhcP)l2zNe}HX`ijWXl(S-#xI#g%(JfFWto!0Tg1XAw8sF2^0nMO z?F}_d?|LCRUPl@SCzn|X`r|_(>ouKvx;fI@(p$s&ucdyKjbksq%LJD6uE;-5PGH!c zuQX>?H|tX53bbMD!wrK@%DFwB7Le|Eoo?NX2dg7iu;{(DC~r;T1YlpY--^h#$NmEt z)(SD25=HgIQjQXl5wx^FT|GN=&7sp3$61K~2T1&KVU%Yn#IBCwljZIe zAaem099?qufJBWgcZ)@aYy4d${`Sa~ZdnD8H?}Xeytk|i*KR3rVb+NcjqX(=2YC~t zqv^JmodX40({v;91@!fP6cwj%c50K|FIH5WtxiZgpk7EP_NK(87fgdyK1u`k^1J#Y zJ{J#JgTJy=m-p1%y?96ab{jLsA!&(TP>3l{p7>NHLd0QkQ#wtAw|#m1VG02uJ;qWs zW6{Rsz#Re=3~5Av{offs{&%U*VKUS{jzQ;|7egc?#Fsjqu(^uCuXij8#1l_Z=)b%B zpI?6Z-q}9xVBWHnG8%JgdD)dO`#c-^z7^&`y;-ANI@h-gETvO?~1+C)Y(O(|jjI zQD>A}3LddQk~Bo1jTo%w2lZ==9C<6*?I+}EEUZ7jw~Gpw=9)GTzu34s^UNUdbSRa|;me$U2 zfz5KHMczcrtuvEo+k;rDH&`Gr6G<)l{sa2Ah0_1yW+_t`N@8Z{|jjkK0N96y2~#cFfG zgr9|Y!~T-MrrXTuD`OFlWG>0eH^Cn|;wd|t)6K@RItX{8a$-uWWObFq&a}fs8g9_P zNOwZdj3!va{45W4Bj1)psBVDup2g>Edta|;tfp8b?VJ!$!-C}7#K+`6@Rs}c%0xt$ z0`S`->|3;J{fwAMzvn!-d_mYkur_H&x|0jx0S`mZ3TPb>8w{+E@{&Eqa6Tocw$*LL zf_w4CKd>VB+F6H;1L}dqDGI;4GQni(^MQN6$Y4djcsVNPj^*%_TJHA~>QgJ?hT2fc zwyIcYq^Lq&-IZv#T#@hpap{gWg+LT~*>w#e_||A4gBs-nG#ndG-M&({k+QCy^s@Bm zlYi`d#5Zt&OGdH_qw>WP+H4w z{w2Bz&Y-F2C*v#_n?Vz>%a)8W#0BXP1ku5q#9Ry>+`VxJHs377qyzn57=rC$a+WUsUG)vIcn$GW+Wy+Xt}NzhRENByJ=r`$ zTe~EAJkq6!W3QSKAdXLd$M{V3`Ss)0^l&$^N7%Xa<#Y5zn>BD0Wluq;&J> zEQ?bI!5jK$&5&XBTQTzvsZA7nsf<;e3suXb8VBvYIZdNEq#qPwi3r&Sa9DE%B75gp 
zq#!Xhg+@CF1m6P?^gYO;>tPeNsUV}+ST$++g^_KFQ%QBeGxil;)43n5_sGN9P6igR zEAt<#yP~H`uFh&*X>WIaEN*(N{n5>rJv1>kSQ_#(1)~yUr?iHLpcwNqfbfe2?YL^jqmZsFa0w@0qPsUT%>SUu)i&m@U7)yAUJW>Q9GM_=THdO+c z+m$Y&h-ee5`4bDgP;CcHyBhnKTKmYYqsmU~T$R@-w!>0{b1dix;eeL0k6~jGOU83= z(RhhWm19ih?FqW?CLDnbf2WGFs+}l0G&<4)egCj*5hwb%%2EBg9{?gd!JoaU2{(GL zNVqm9mAHwtG_!!D`1e4i-POo$KTn)-P)3F$mdhKB_$mh;_VzxdW6ORPC}+%dJi_CP zJaT5mk%}wf{|p|xD}ixQKz8zOW;3WxEqD-Tn#~PC=tV*qSdlJfKIG8b7}onaAY=;P zIH%i6oH~a%LQZP;1L={NOlj#0dt*x%%ZYK)$G^;DQB`fwp9KRT{!ryzX4Zmm<>SRf zt>9*?0SA@Wr7B)Z?1YVit8wM`Ox)WmDyGt=@}(n@@Dd+63!l*2A`4S-pzv{_oBSHXu>HwGtB0g7F}to# z^rEqJY8;L8ESiNqL7T;z{?@cv!n6V*3qfh>36M0#3)HO$$v0_xNV-t6rXF`pr3mbM zjaaUqJqgIkl@4bjX5Vp0Sgu*d%tv^e_7bMgw0M|UJ=NWnwD$@8Rn%S?y<90DmujB{ zJIO+d30HKjIY%hBW=PlB<4^u!T>CCpy2p#iQQaSUdO|!)BOaf`z{d6z5Vfyo9JoU@ zwZq@`_uJN_(~qOaibpJL>G29p>Pc-g`}*nt!U_Bv!wmADlcv<+r>M#}_rjNyD=WYb zNzf&Angek{YzwF_;YE{c_a^l0$_GkhDcV{N=jyHCjH+hZH zQ)wq|;Zs&e&`e412S0!che^Q(PN1Dddn>pbsd5@NFL%4EyE*;mwGGTIl zmU5Oo-LPlHW)5~cXYxRUVK_b#mEjb9pdPOfgta6;aCl$k>4}@9YtMucMfFYEzY|oU z+dHM}H^IO?hMux1-!Wn5S#|$<;Wj~`%wUuqnZAUkHTO_~t>8_$ImOV0j*s(Mz7l&S z&NE?C73k4(zXVa_5j`_fB#B1b#YVw3-}esJE6}}HtYK{SRp7_%K5?W8e)lAhS2UU; zPAkctT#V@V$J~wLX5emQZE!eg10x)?*0TNs6zj;I#wQOTBQQVI*d;p{T0IF32|<;7 zyY#nSX>a#ruZcQ>a@y>yUi~{uu9HD|i>i}G>@RFR=1d8*m^Dc9FAGk2GU(^Yg>Glu z=B2M2FWG2*@Rg?hwb29|hcuW?O=Rx@I~6yDfcAX>>z;V;L#~#v?-#>B}v*CYEvbAGFY?HR)|CLcv^MK|5!;$_(JksY2=XoQp0$ZXK$%|a(I?^ z4Dshp=_LoxfeCTJINA+1f@#Me#+-qg}AOt?L1V)gM+^3XJ7K4)8KakTw0M&o1 zA8fxKenXys^+oaZ^2*~hL?Rh@CC&uBLSRatDg(NPfdvLEbnrR0TvE`Bu+9F@o8P27%LKea;aT^?!bGAIgDKqvtMHR5E^hN(=Y$|dr*!E2NGvy3Z!~5;$CH||Kty_d;1ti zOMX$!)@ElAaB`iszN^zP&53o25#j`lT}KEjx08Vv80y#d`GD1!-}rd&*~0> zN&E=%;ex)X0>Kvsd6+%HDQZZl_+J(}dw+cEW+(=r0_ti6mu~Ot_sT&Q|5r?6t9R^y z%wPol7jcDeU+d9@t55HCLk!Dj1s@0&V$@v4U=!e}pGXuv-{NV9t8rJ_dDReV54`ao z5IGGp!uR}@Rn}d73ckbVnK>9=of&4Ay_hU28Vih$m=XXjhztMpY$=%|I;PvqZr~#} zd-_oE4(h^=#$spU&7yAsJPP-rx)>|$)V-sijt(A$jfw2 z;xKM?f`aMHZfuC12ee3#Bs{f)8IT&>`EN{Qxnx{1dEIrRU&M+gjU4l0B$71M<1|t!hR9wI% 
zssfLmsFEviO?G;L6D4Z^`gMtCWdcO7ymn+tHuM=t*d3<^P{Y1YNJqBKQmuoZ4%>UP zs26V0xmVsLx!~07##Je)sX3p|~Nh$E*6xpWRD z{<^ej%{@n#o^Y$WL>1SaFW5hfv)@nPqpF3LvM%ns=au0o2?&?8d^pG_ZlsFZPV4X2 z&A@QFm$jgp-00>Toby{W3s3X@8^05}Qg=SY`G_zWT^#0Z&pOrVCA6S>CRZlA>MS7m z^BX4C*!+DC>~!jErLzNt=BPqm(pb6Vl`hkGjLS%7(TEH?qQKVg%o%g!1)Br{q&zc^ z%9PD32B8UJ7MEZ>hcB1T>K_S{dk)_WWCu}Fl~VW?ut3m_GUBq$LOp*C((co1e;aid zh(C+!N}`O+g14z6&RRW^U!H3gzrKD>cAK!26Q_t@q}ZCP38s71ZW!bROd=6m~84l^vqOx4Nr)NGKct@Oiwf{y=Wq2_!g?)mQ=F~vN&{xy~3 z{>49)vubSIXqP%%g0BpE=ptDKr&aHvlmEVHFC>!D>-{m<29&jd1 zlYCv*G!+!(7bfSqS3Zs&CS2C}h?LcJc_&aF``615j`q?`6Qoo;V(3jgGoMwB{eFkV z1t*l%0GEU=*4#x|J+GaJejRexJ=++lBz+ZS@wa8vSlnMajyaY-2AKPhl=o_8N;fxk zgdFRE09SGCJEHNZ^w)gtcy2ztVLi?VWRcHjJUP2o?LDcyxV5Q7$%`a^d&drQ-Z=-! zFt|jCHwCSyjk`n1tXeXoP3ijI<$c)>PhA-Ibky{s_VIAbtzSDGyJY5!M8W8 zU@W4cF6Cjy5B1TvLt8_vmK=JwcoFtv{jIvI>6}p5%~rr>lr!kW!*q1$txZ5hiE&}- zC6aAWH658-x|pT#c=O1`w229oqr}#PzMSU6b{AxBtqaWmefIRGlx~lbpI;(}jC(_# z@U1uwRzpe!s;UA2;KRA2yNbX0_vE$;Mq-Wo6om3e7z7w6uyZhxqyE5yKEltNBDgvQJ~^O1 zEAIpp^zKB)j`Ghi;bN@`Ktf>5!prC0@k<_m4HQZ861ls^{}50cG4HS`t(e1xG~!dk zk&|-xEHSPmtFDEKDPRJiQ{?uLwaz>*tKVU5MMMf%3H_{FIv4zes!}$r!?IZFCr&ur zto9hsBJ6IYIgRFZM!iF)*k=0}JnqhSi(XNo8AeRc-x-h$KQQh@2OZZ}4+_V+-(!sc z*5glYeg-zYDc-m$oQxb$FHFZ#qif6K<8(I1k-1)TgChR}-4onJcv)1+{juEgN=Uz# z=1DeOVs3!{v$~RDQd~`ZCQx$+<0dgu2x&C%p$Pb z$~9Q{m!WKnR~lv!m4v3;yZY>7BpY6!B|r?X8iXQxM^sZ+e`91mFs>LW!-TE4Zg zMiDx9>8Vq#Z;en%xmq&?(WlUc1#pj#R#5Yn%GaB8Lz?M zo&Nh7j%)&&;DHYb2sMhCWfVnFIh^^*zF4~`h8r;|h7AB61gH)|^!Ls#bB~|4D37Z+ zw(s`eBzg7FiWnFiLV?Bka7RZ*)J@Pa4dKbLK!`$uz|a8h)NhEdL8Aw zz9Azty$)*Zab&DZC&i?~@h-+}?#|pe4DwgGrtC{jEgz+pX1-{{iLnJlFa8c{`+chw z9h8i+QA}t;{h|y>db}kITjKqJ?^A^p%!Hf-90>ir^a5AmZH`DXuNpi;%bTY<0 z^iME{q)Q_~!b1u~(BwqQ52cB-GFG(io+KWYP>|+vAl^#bd~S8J6M7=|uqhLo%9pd8 zUc^b}6~mm228co8_j@sHEbSOy zqS)+OOLVC_^xLCxv|nd2{qEM5Zpj~|GINvHSDJ?2<;FN*DdX&(hS1j(0mODUOfPe9 zsBvR{Ml$#qfJNR);~aBWAHM0)ocOsgtX|HyB3h|b3vY7dqd@tQ@at=AZ{Ya6EWhbc zfH!~3Mri0NCXnGs~D#2%8FBcE<=S0jsj`<;)O=t@nUv(>&?#^ 
zBuD7rMBtSMb1lnW{_2k?4ut}NJ3;Vz<>d}YFYB^3YaG+T1S36J+YO#L#0tZK1W$g$ zm^>{N1n(?tTkH`kp=U3!@pn?c7>fh>a&vA*{-;|6S@ZAL?9FT?r~XKShgxr@9k=26 z>^6*6*Elo>DO2GY)5iR#84hLIyKidye?ShL-^PBglrFcWSL=L>SOd$6hcUN65M6@$!$5uF3T^;K&%01&Sn z@^B#95t#2}*|Ov`wX{iBO>e?|>5h9df@mx{QQ;u%geIf$ zWS^_GJd&Fxm^-070SAaNsxVVCb`+0WvynqCUwTvFjCm-#HmKid;ySW@*BWpno#PTy;Y{; zq*TtmV%pxkZR-?A64}%i`KjXJ!q1u#oBdnzIEBkh@Mh%*|H_z>)zW5ob_dWj_0=M0 zi(kiNg&x`Bkn9f|TC3U;rfn3DgsMS@yP6!M9)~_`q)mm|p^zIm-g5K*0pCC%zeJX| zEproNdf=RH3dg=HP%-qz78fJUfaLtGv#&ktQ&qCH5v|3kZR&7&IdslEHmCYk_mPQ)a6e@Kpba_Rz>jj+B}jrQ==S#&3yj=<%;@0#*$q_;X9SNiv~;* z`4~8_oIK%PzFSd7mAubC6Y(Ga(ETsSG^897US1^$y@nX=FjR+2~z&VZGSw!LbJz% zN`Q`)EW-@L(XmZjb@`tSSF{Mh#3;P&;V`7z4M@+f_ucOy9ZdQ)e$XNPHaZ(o0{Iu)PFz{F&K zdz$D<#~Xnph*UE|pt<#|8_hQI>wVNw76=CLImf+pQEycOW>VguV1GL4yewmRr;p55 z3<2BJ@t^UfQn#~@Ix?ODIKSM;@jLI9#%%G3q#yX{uMoi`C>b^OM~gfW9rdizX-l$a z1Q`DSgy;EJm00{aveG7x`%YB3yCfZjepDyhKi0j>w>6-tyY77Te2WtaaNvuX^A}^c9xGyRBX}jt5mBZ~(?DCFVb4xmf{lBYflr z94S5O%xfY}0h7!v#z`6csc+*y+1G9c3dH%q>%po^O-!eA%sfXT{{U-ADh*N6glmQe=hj+cEgl zS{zO;?dE%V$4MjMxa0%sJ*qi_2?D9Xa7g+Jt~}gfj%#mbxpgpgPVY+bh08iw>rD(& zF$$B9mN`E@6}`F|?iE*YR~+{h;p?%{qkJ;3!Od!EdfoPdg#dfoG0&DxW{eJgMn8Ce zvVQ|!P9p~QiaHf^maO&lh%Q2K4>b&y%2bj$to<`yu+<_H*vaOn0G@;&LJvdrtCHQn zo8^d`Z}+puon%ZZcDOPtxk2#G`%K85Ap)pz5eYlfpdGHVN zs&^Lu0P82qHqb~J$owi}b-m51#I6ob?Z9NOYGOp+l?a01}}06NZ21x{j&NUi|el1TNZZi&msyD}gd9+~Ny#Z|b9ea~`|Wc|Q4;C>|LoJ4Ir z!Jo^E;-HrD3u7Et2j-CAW}sJBmCQ$!#(x^CHjbA~EYeEh-3p8z21f_igH;vp zY9`C~OtB~c3?Is<&GxHlf?5P1@>u=cnx`4Rf;epBCUigpO8k?G(tDTriHHq@f(R$_ zt$A!F%t-8YDQ1S|>kvo_1Ld8|f=&-U*sUXGAh(ivq-WZ>IXq&dg|D2m+-(s8Jg`3T z80u=&@W*v(Rz=*Ws67oOQS5}G%%xZ-jya<_Vb7M|GT?h2pIWGu~_@W(W{Iem)bXwOX6z0IVS^2sDowB|vHHy^qO z0~F&M0(T&}xq{hXk~lFW{BM*DFRx6Rs|;6IT8v6-MeCX1`e^k8{Q|^Ku8_S^9MC zV&-eN-7avd&GK#aEu0F_VzR>5R}rerRE)1VQ`4tf0D1dp5?@_`96%EoQ^5z;tJz+k3)?S-wymAq?o?9Sx(GR>hN{43B7b!uJxpwIo_~b=-&*|FGsrPGOUl=Z>VC_S@lG2b6&XHR_C1V%MhMM( z$@@Y6%Krcjzi3-H4~Y_8Ggh+RVAVA6vD|#&0TSg%nLv#|51peSag$$7jzs`sysQ>7 
zz80)&;pXEdvq#Y4@RV^hof`4HUqj&^+0*tI_#GEpM3Sc&vSAh{esyPx{lbo&zt#;HfG&XxSNhqw0yLUN(-9 zz3_rVYPST4T79R>5*7J^7RKT^#eX*c0JWEjzCP-o6K^zsj!gDfkOXUeI@U-2%kmd; z$1SoPE2|ykNJjE=oMQv}Tdyeit?+xmUl}|#H--FnrbmBj?y|=Fo++W4JneYpjI4-8 za6n)f5~m=Leclk_8aYlc+Sa6|o|pSk^H=`7(c|$xOCyV1%F>XkSeyVd zPo;i@e$8L7cZ_~3iys*LMrk}d;a4%w6aa3#)Zle<`?9Tw6FUy(1(`{}QUR~Kz6O86 zKfWFOCGkDai#$(%{{RV<+*2cI33qHEg2hnhb4Da|nqqN>U;^M3P)RlV2M|<28jJ-1 z06Jp6>xukMbBERPi>vQ9`6c?l_$SF^Tn5usb1nY>m2LB958ywCUJ>|h;p<-vcrQ$Y zOt7?yID*80NiUYWL(JfAmP6H@$t03{*Kh_p)Rc!E>+#B!B{{m1cW-m_`f!CfsKHqw zy$y1{F7daAJQJmzS6{qHBf;|1Ig(T8ia9@CgSoFU_@nXLK=3JjCrjI@>s*f^3G(72 z{(3M62mO)Ht}Ee9XI}9Ki}fjWe-zwFbu5SXXzjToUqoJtJ%%gX$}$Sr*(Wt)iaKGtBWA+KzRle=dN$VwClyrm)K3m1csHvI?&>%C4~I4P1<+;!7`;js`i;*EJ(X4?V>L zDakZe@ELC80S8jU*XdQRypcQ+#LA5MV59he>(kRUa^X|SZ!m69IUcngv{vgHy2b}} z_RR!IyAa*SzS-1eH{&IEJbQ}77V5+{_T0VmpU%1~w^0$UL+w(@axHB@Wnj$H1C>(0 z!}YD&D~;-MPqsQ%n+?MRaC(9t zf*6--E^r6A$*Dw`O?CDKl35Z$WB{BFL9IBZBFS!27A+#2f(GH5wD83$uwV}uAoExz zOQ^0@T4YOpimm&z_=Y*4M2fS>%G)v|{#kX$USf5c^=|IVjp)ssay(>?20mzskHfm#%5w!@zmp9nfY}-%bsfFm+5{M8)(yUOidy#0r$l}WY{W^NXgUnol^Z2 zUGI%oK3r~h1_!f_cYZZ`W}apbD#sTeaxsOgF~@Ic0)@%-?@XQHb;2BzU393WxyxF! zdli8rs}=*Gz~YF@d1pO&q$Dl|I?@&cA23|{*IZG=msT*y!45$+_CLd{zk+naI3)@F zYvba8uNC$OfP+Wiy)$q3KdpTK021BMg}UB2;5(lWRU%htIO{12juc-q{g}kTWo}Tp9Ko|h0 z>N-`N4&Z)NwYoB82;XsF++gCMnI(X{>foqfxKwrZ&sw)68Feb#26BDsmDiZ<y9Y6{PAI_uuI$Ufr z3>5i_o<>D#rO?d9vWghNmnBPdR8T4RlDa`Acf@h>a0xZ7G*eB$jUrX(;{>0;bgpjh z2L~-FWgSU9D&x76MSBShA2q@}W>5J2h(326qyhLEQf%7hGdj-5*62)y zfp!?c9+>v4h%Ln5=~DSgf=SLeA4-<%O|+IQ+epZ9!wxg=P-&J3k<0z1C!Pa*xg3T5 zb)8Lg6WrrGZ#em1aV!mbHAEkX3 zUCOAdo^~ox=hXkv{V@wqrNN^Hzg#~adLxbTx-)1`kNN7K&b+48g5b9SPI<3h@q|WA z6}kTama+c;I`T`HcBoqXPmN#vD15(Ge^2;Hra-^PFVLqsS8~hHoYZ*jND!gsrGYco zn)vhd$vp`oKx);!v&k;h86@_rF(%%6dRIqi@vLgFM7z73;L}R($2%c{&N#}zpcxyI zed_!c-dx>-dVMOgGBw#kuC70gPamCV!33!Lq>#wD7q40j(EsAd!+Y z(yk(fPzUqPRJ6-mjAZfOtzH1)zoBq-Er+DPy1yg%yEgv-f3N;2er!V9Mo&RacUFsV zVYP<_m4gAm$@HLkj+k}YLEr{Gah&WIwMZR8(%>--}OO3Yms%?LLz_3mq! 
zl!%vc1dnR!{4C1SNXy1nvc*5#vwQ?^h)nGAYk=0RVS@9_Rr%kNPpx;=!+3^I^x&B2 zqn}#f!#hhu(9C6ZW^?mKs>a~39ASa)>48d|uJt$rs3NM|sghB+3=g_oe4zT(wgutZ zt^;EmvDe=f>q%VjTTG`k5lb)(bG&puf~#J`Z4^lo@PB}I_p3tA-U!6ka_%Y7(Y`tRgqQXC5R-9oQxkz6BVpewvCqK<;Me_27fxScvgKn?PL4iRDh3s8goq`+?LV# zj;914lzw$yNo89rzdkV*A#wtoj+ES6FmIbf#Jp4*T}oIAupwjoa0O6x@?zbbU;)4x z2kTlMD%8qq(uso{B5cC@de=1#)Rzd&fTKKyM|Qk^!qi!s^B@o%6pVO>&Vng7~0IoB=j4o^6n+V%2&$=Ay3{1*!HX|;MCErELu*? z`cHs|9crB5g;!-K4lo8sKBpbMtIBkb7F=meEphX{Ezq~#BSFBpzS?X{k zmXmLfA--1YkEd$+iuh`jr1`cz7*&MyMr`_8G^SRNg2Onf62#G~1svmw=;j?U{zXM9 z`G~5k!8knQ0+OAOI9;67Y)4)x@<@s@><1aEH`;q9)``Adz@KWTD#FqR7%VGTG)F73 zj|BTJ2@#i#ixDO`8LdkjZz*MEa>7>KgV22{Pc}v25E1!MlTOkUC{?U%})<{%h{+k#X8Fgd~gE`KVuhfX_HW|1akxP+2d<7W*;XR=rU>3-Py{G6G&960nZse-nD5n3X$r}JIQV>ZzKNA zw@9KzJEhtP$@Cx|YfXGX8blRQo8|#cdQ@S+_=rF`V_??^i9J-fhb~ zQHD%pADf@ZRSG6l7rW+bksBn1ette$A-iKMq{{ek2yLJ)e!Wd-$7t^I`NT37-4+Mk z1EBm3Az^bPgA=yV&gJ0Zg3R)5Mcc=yz``Pc2^f5Y_Q9)?Ng23~-BhZKAE@=FCZhKv z>|)i>E(zScW7@5Q0a>MxmT0g(RQZ7V=B3KS*^RrI;GP#>xW^SFi#+lN6Nr#=gWjRI z5vzH!hHeLzr>0l9AI_;qdo9$WDb$4rjBr_f$6-x7kfK-^^IB94O)^#&7BCnLBg+MkH1NqgawTerK_6Xv(0GTj0u_vYv^P`N5h|u*E(9org+NXuPoeziueL#g!)bW z<^KR^ay!@6_Bw^0qi-GEjl_2l$^#^kFlAtU#eQi^6}0ONr}u(BV>#d-U#)hY5b>9Y zJQZ-48Z4$uc0awck&lw&uNeD{?mryY!RFj9la)*_+`h_ZysTDXorap=& zq-ZxB0ZIlb3t-~B`^VoCJU8(7O#5!HF8SmixwMS_`a|^o5$=Bs)kZRtQI6!)oM5@8 z?2g|4`qNCbmj3=_g5DL~B#v;Zqq#NlAH_e~67O4@PY-wkSuL&9;!Qyg*N}G!k;0!| z52bkzj{YThzv4yXKBaLbrkbu=8yJaL!2TWM_{i&%o@k$` zes`X6cQSNveL9bZ)m7T!OuWyM&c`f%rmRw_z~+*^_MI3~rm4jrpH{6pvvZrf3Mitq zMnOdsQvj%BxQZuO+~J(y3Q3-HWD!OkN%@IvWALXi5BG%vY@7@n(r--TJk&;AtP=#2 zQ7XqSW`%}N1gXzI<3P#XmA+bqm9VOSZST(`(-mEW(K3d0C5{GosNOwG?F%GwqQD2v zjJZEhXd+h|GEAu=$W@5>hSSk~>n<4JjW$36u1g$r+L|~fZK>dCQEG%xa9Tptzyh48}4fYxF8U7=~>bN6q}lxZyYW2z`ILz%`cH8 zs_%9J?9R{*M;#4x#x_@GaZ2DMYldOBXpOK2dht`n)4H-qo?bd1TvKcTk}}PMlfeTO zAWfML#e;TZky*EK%3VmERiN1 zK|ju?xn_zpCc?_2Bq`_c6u}qn4YI2=ZS#4Dmj}}n2+z~48xV8TNvY;eo4y`1#)^snw_=v#P`t8_FJ~hOR)Ki<^KSC*!Hc-WWCm{ 
zx7njuO0w-N$+7m1xIJ)a0>{{9cArpiB#sFiWNt9LcLt=ezIm;@;cRWj{kwmt_N_)+ zZ8t`ErD|3o-mgXxx98GUGmLCEt+-Dy3=+&g7 z9S%BG9OZUsDm<*^_Lcq;4N#d<5=cLVT887C1==%#ny-CqUO?oK2Vikri*vQ?LL}Y+ zBY{p4nHW9+0;2#R0dpRq$erMS1E)r6kqHUyh>_tR}dj9ZWiXsoV zhCEe+9&w8J*)@9~RFX3fc}1ki z*(Y(+f-Bu-^8*u(1%6`w*31vZaoqm^XWRb(TI<3=RF@&oh?gXJ&(hbpj{4Cqtyo+_ z<8m0=h6kYp8m${i6ktA3u|Ff!{PnRHsBctiIi*eJ07>OZrCDV?dR_m{#B641Es`i z`{KFWJ1G@zEBiZkEYcRp{`OR2nAYk?+hJgWPFYoQSXVmH>NeKLKj%)&W4A>ZUT}af zKc;I=Bx_q(((uO9uscGG;2&Pq4aB$Y8X_jf8 z$DkGXryFURwb?J!{-E%pHpnCXDSm}nV@Qi40&siPt0hKUN-_z+$;C!unV7L&Pinlu zLCSM(MNJlJ|~`>XOljdPoQezm`vUz-w& zegzm6-5n1wXzR~4()ege=1yf$I2#D`@1N4S%nIr}K2Mn^`=YV0{3#t+NZPboI-2v} z7F|OW#K;@uPCyyXYtXCDyia8;*YMm4nHD}mso}o0&4stL)alF-!-uks^ApGz{KtNF z$ioi~>14)@rTAmBV>1HcY^gAi>BO z&%YH3^(n0hqY}KUcCp6DZ8$!Z z!+SI<@`c*t10&Y6d$R;(BRG+=LRQNIG zMcQPLg>o`;^vzP13z?K&#=j}*eKA%bEZ}aFcc(m7vP+?wsO_mFkh5)mcISc(E2Amg z12?(FK^%&iV5PIu2Li0cHOnCl2rJ!}s1*#QC2OI=Hn~sU6;&J()jf>&Iwi{7X-+vD z#z_d@p~v7WFkhgX?AuP<<8k$?-gI!}lq}2BoC?Zxqa|r0YE`WsW7ceZSEp(wIER@W zq<-;aUAtS`laPBF=uTJ-roMf-noEe*Sc^p>467$le!i93cyq>z9c=ZYBF+^h{v7-I zX1pw3TGq7D-%A*!qdibW9X7Wh)+D-Gt))D@SqhRnQxfDq@G!@y#ZqD{w8(@bbu}zVm>?*@_Np?gOi-p2 zf-{kxD&Eq_W6m-$!KMXiCP?=futJ}luU;xEdy(fcEL(p?$4Yz}Y*wnmA%f>D$C6LE zrdzNQJ-$9={w=s4oggY*8{3&?haOaFP{hXsI6rtE^K-+^(oz`I_2n#S6 zIO72K{3@lbsEXVq-|0mdGLzJi$Ujk5W;scKvqf@HCoZbF1L{s{oY6--e`vH{v`9`F z!6QGJz^h7-f-*hLRhA2h z<5W0833p@L9jZC7Pwt}##ertV4QSm<1Qwf|l~Or9>t(T40WwNKGI7FMYXbqvIF;I@J4FXY=EnCr@onPBrlTZZuKCs zW8Vjgw{EwRHt?vir^}PeFvot`t9P)8AqH+zKbsvoRZC0R1TJQS$xz=QR^7RMX*n%` zCcKDTiw9k#p_3$FVMrg=n2Tk6OhuAyyp-mlx+73rd2CTmL4GhtrA?;WT3fS8uzAn- zr2ha9YH3&pX^}}Nk)(-ZjdGzuCxCkMS7)^|g93x)u*OI0SlWu+6C5_~%8vaNPb_^; zVOrL zp1H3FG{|UUAGF#>)#N-+jLYVserP_2={#<(p_PLJ$ROkpX;^?VPCe`5j{|&6*1Q;> zW4SM=X-EJN5icy!R=gc#UGCv$Bm?)K=97{9Uu^4xZwQk zJ-7$2HSsp3ae1w3lWQ8i#Fv+M4ZcYvJZ|nD z-b<{4QC~>ho4NB*@(L)TlHXF=uu(-ZH z!;*IKlg(hfAl@z8F)FG+UflPq+MJ1QuW>YzZ5)I^PadRID@%mExASAjQ_~%5cxZFf zzd~}Vq%rw(%|={L3#ncXM_=%$qSMtBmXX|Z!yNUdTEuO`BN59I0O?veT^Xe8OJ%kQ 
zmINPkaw+~(G~0>bR47BvWPuj|kt6rdrAgllTQ|x?FKr0RxZ{k0^`L1H zTU|lAJ9vXSla1b#%l%H;#$B?N+B$^{Tl^}{p9r>qBQ75wF~@&us|B>zLgUPBpfMqb z-XkBOq?qL!7ACriOK7EmcQm;XAU5$fQlE&_FsY*;yXs-LwPYs&?HLCLV`%i@6BXfUfkbIXSi}$lZDU90rV8cag5{>UalVr%C95J#bX+> ziDEH~`_dADy3-YjOp0bV9&6BwNoaX;lWy#?4tiH#45bt@o#9iF&Q#;KYT^-!vGf(+ zSl3G)cc>HVY z{{RCGz7NxKIr7i*uaBVIP8+3thv25qg*5*F`sMv==Qxx25%oP>r}&KN#F!l^JjOi- z>rdDlpv6G!W0hkgJBC58j*`5OsJ)R^{fW0A*EwAk?tGFBU~LuKLrua zZBN7cf@+#Y!`ocOL03cR&=JTZ>sbMt9cjUjdiLcL%x&Jt`_tg>?F-@kRKA1p4llCZ z%;_9AQKll4w*zoecAWbG?_XTo-$fiLZwq30k1$7_mNucxN$7N9(rW^n&6c=DIIj| z*E2FjrJkRBqSig-m-n7w+%S3!o|QUTc}82HPUz3doH7jhcdUzKx|TbamIwQ9Ax3hA zM?>vSwK}94Y?941XhG#b+(7*3ZS*pl)VFgaaoS|;Qs)Ywa(#PM2<9=wOhpEIsb0UO zYAm*b*hwRKfn(fCu6aJypQl|k)0*LKvq-y>Cvd{3KZiAA>=2cID|}O_z(Dy2(DbOLl4iG(*HRAdpamfIG^}X^x|}DE zQ_Jw&#jb?LK&+}WoPI!8&UN$`k7Nm;N|FU!U?9)TR=$1n6Vkq)Ew`{oiH&R8pa0YS zJftHuv{&mZbMfb~%6Pwx7^xWzGh?{rXkv)t`&& zvP7}&XwL^ckxVsvj+4#%NCA$_bnQ@Taj)3!o;3;@-bLOH1~J%otFy}_dq_gSo^jU| zPf%NTu)M)->2EI?;AKy0HRuf`7TNR1jgilYTLC~3$C3!iIO=g&_eR<6{HR!Yha7z? 
zL&XU>)+K3Q=19T9Ml=#FVJ#D+GvNGEd~VaGoF{o?|QoQrWYv#t=aJ=Kz zn6X>Dtbk=UjAz%qJX*?FLlT$W#s_+ta-v+N(4}Da7iDfGQ@1@>0a`OU%Wz8k%HRbf zq4lQ8ZY`w{%^5`}1$uK)Y+o^o7F3TLdC%!dIK7C2aaLu-M}!|R5D4JbJ;KK%oQ{=d zLoAmHUPd4f;yiu=ohaQF@&*M}kuFW@jh!c6ztbIjnHpG!SX|@*{?Da))|0Q<>h?sz zC`tJnNy7v5uL`r2B(7tKNaa8Ujxkb2JoYyqX}3O9=)t(;W7D41=Hl}8Q&{eLI6QAE zw9lh0MBYLRh7B$TBz-;QnIq;dy4tGn0eD} zNgmZYv6{SDLR&c8{^$Ypu6-eyCb$y^{{UIW?wG*)zJj{=*BPcu638&snyIag4s;-8 z!4(geNij2{S+FXDDCnu#-w`nAeUvMGDL0&Qs zaZ_GfV25TRF~Lj%8}q8u3#6G1s0fj$U4&p`(wuZ;v|5JND1q6WvB@OoCX3l#*4(sX zE!!uI`_yr?;_0{j%mM)9bpHSvo>z(745XX0^5;0mwGu>`VlwEG$YVlI=|~5swoON= zOM?@}QJ@Ku2;5JmD&5_b(u6Qc<$!)+jD}xA4N{A0J6+C(C=i&2@ql)mWDi5w){0sS z(2+=ZcCJtkpmElg3m}eWhBasH-x=>qa}wM%ep4rq-!z1r4mwn^T|6gh$GJ%3%R9LG z(*}+hUG5#&fmFa^omsL70RFX;YpF)_$!IN@TF408*&%`VrvCtB$1slK(ss8B{{V2O z{Iegz4r(S!O6H-s6JN?cW(b4~ecboWRtx6F_WuBI#(rKn=C;Pwi1{d^9^Gr7TLpqS zks0$j-b*ngf4iQ8*i=PWqgp70P)4Vm9I^JOWt|w|6$DJ7qL*;Xp5O|lEH^gaWhlXeoa^&_={6B?k0daqCWMZ1>aDe=dPB8UpWR~C`n>*By zMJJaW`y#!zR@&K6K4l(wTD*U_4 z0y>_wh7%;reBHu7yu5s*^Uo)>PKYE0kYo<^14U`BZ()u|t>fIgA9MFP73VhUs)2yP zZb+|BGWk~XDCCIP^(VD?s-43=wXHo_>0$NIc_4~y6jxeGIU?krE3YPizX1E(R+|Td9Pa*sTmAIT@F$^{QHio%YD~i5emsA&}tKjFPNaXW$+w z#zZ5N)|`BVW=#^q62}xGPBE7M0PEAB^A-)gSw{motr+76a>hw5$IM9-Y4$G^rY2L2 z?k(HsDj3Yr+Jv)PM8TmFxl&b09@)oC`qx_wHp6@&ywZc_H2EZw4=rPe$X&Ma*m|F8 zmIc~bTma4J0Hh4hmJha}wzQq?iV#Ur&H?GqV^zW(5(fr3FIqx(v2UCI0M%D7ZLi%} z*vuhu(2<&g8RN2;!ErCmAPz)_KU`1*bKOr1+sv`~P_musL1zcIPS~qgH?i8;o56+i z6tE0%2^|MMwD@oBZ}k|on@r1dD+NVSki_~|7vsG*S<*Ep)Adp1?jB@uXCNUY1Nnna zj5+H2KD&k@tEWraC3Ueve-`Sq##9L71Y-&gN8?<)&eHPal6snIj3KDl;dtwjUhO0!O zGN0M#jykPzJ3s)}Q5Enuh$4-*;00`C16=r}?&x*lz9VwRHX)k?Wk(0nmf|UK^X((s ztyhj#ir75lvBx+#KZYr-8#GvnnMvesB$1!3am85dmvJE*oP*k>^1{ejSdx2VimI(D z5Tq78&N|kVWto0z+9Wq+aigVugWzPxg=_)m_jU3KTKgBlV;&aMzw4cR-x7ZTdY-OG z?@~>jAp~?Z(x9m)lS8_YS3J`f&2#&&9BR>KkqU0JB%2mVY1QapS*&y-$|F#VslA zNnYzb@TnDm7^vu^7B)1Eds*=P>S|gQ?AA9CoQH+VXX*jLA6omf;b-kFWv{^ogX6iP zv%CrsE{=9i!=`;0`jd+M`=c2nHI(To4N0w!*~2Bp%u&l_Cg@~P%E)pGo`ez9Vwr9& 
z=Yn^*3_^UWa6u3B>-g8_$H700-YW3#h%~Pa+}u6popUl;s?6Cjd4wn+ka%BD(5CzZ!NJ63MXMp+t8%XI5=CfO6I3UGfK zu?^g2{bZ4ullYI`2h#vmEk?^wf&!`K&hj#XL*hgNw#H)>`$Phl@_)+ zABi^+&7|ByBaCfqs{!+JYv)~q)P8mJ7mtw#h9*~ad6k!QU$wpy7&t+lz9$j&`48R<0sQTMSG0`Hg(TN8i+)iwh#b8oS`De7IhwrqiA~iCtrjq6#yE z&ow=qXyF@f%nmXJKU%kL2$8nJ!?xj!AJ(>vQbrOrRQ#ZN3Q^XYqZI0;xr4oquFMa! zuh3PT3aw|%3-qfs{RM-U?kO+sugLy3&VS$Qe~Mq53<@wPqPtH%B8+3DbiN(~e4p;u zHP@Qyd_DK%{{VKeuly+;Sa$uiZgkbjYf>s)Dlr6+_}5!phPwck>2?sT2PXiU)sVP(OBh;B=-X#1ffOA?GTbaxwhqa*#+Cc2FGrx$bImvB8o+LFAd1X;8a0w4~!0A1Uiq1$LOE?5xk8 zJVPkG*Ct{Ld1)G!#xhFwu6r*TD?0(6pGwr%%gd|JEPwzufS|T7ep(p= zl4L8u$4cr_yE&w^5#^H#g?D50qNOnr~ixHbDq$>htGY~g_(mF?1TYesWbi-yIkdx-pxF?c=wstGS( zDtRTFzA285u5-vB@mbp(;4+L4O6o~DXmV6;*EDyzz-aJ2`3Ad9+|zjD>ceok8KgzV+&!8u5gBw2f_a1adD-{{RsB*UaNG+0%U852C_j;Xik& z+%hW+Fv#>Gv*)%Sc}L+|d>%(iV}-{&SC@8=O39qkTP$-+AQ)*npQi$h9M(*>?5xLc<5w9F;%3~%71XQeE<{8!pF>L``Rnq&NAjh(a|<-Rd$rR;g9$0L@f@G)p4P+t{-(Bmsa=1Ms0wxh)xw zazHj( z++kP`V~T~5qSIRD1DlnM1HT01e|QR#%6O-=-aM%1{{WVoI9=bBYGKK#cJkE6Xy;E= zjz-4fGm<-wY0zS5R#FZ~#tkwixH1%tgM?l(da?eM0lK=~5_u$V2XRsIsRzH-vzK#g zD-ZiUuw|b5QpN_|8!-cKw;e@4bg3-TqlOMLsz*cXnw6rCNJyOnk3us}je!b8Wq;W{ zN99kSOQK4#yo)S;dn$%pl1?&xg(UXZFCwfmGodWZTeq;QGg^z5+~^L|#zt{UuJEY} zto-AOmhY)fER0_?uGTjATZLcZ>C-gbHalx__Y$~SrvNx7)6i9>L=LU624jNQtlf!l zWF-M)Vxw>eIPN`-0$Y}+iC@lt2(4pt8J7f!f#wjs22U9^DaJA?oR*Q?su$!8Fdef> z$tLC6*a~^irE3{jz;TUk2hKWnseZ`vZH^wGcjli4#vq~Gq@0jCW9wTs#@qrTU{6Lq zhLypQ+S!QL!nS8E-e%-Qko)+?s z>T#j;b?O7|K>1;M%~c8;v4h}bEqu+BK@sZw8TRNA80%-xNKBABYGD)PcYk|MzxI9z z5MbqW`fv_sYF>STL!-GT93))owQWj<4B3q%j%qFC96Rc3b$fYn-y|7^hE${oK4AF) zkM^6d5V_;>L2ZDmZrYNvFhU~2;!tYaQh8)3Mjga@{IncAL?AOM>}S=j&3`Foq*%*e zisT|(=&r$zRojg(F5t6xdRv7w%ocJ!-wL#BYo$B;>`)N6hc`pkL<%I4XjaS=?zI_= zKW8Jr=vTz_C@@=D@kEmMVhhuv9#Cx*yy_^h({A7FtGwvm+&&|D&WPkt$9_4jtT!#+Yd zpj`?0#Vtd!Tf<7u1aZ1}!bWY?S)OOi$HedQmQ8kn!@hclL<7r!F}IWaU`GRa`G)lg zeq7dm$Dl-_d}@1=X0t8uNduwdI6w+2o9v;Y(E8+3g6Uoc%2{I-fCuY`t%3c@Lti;AXNL=AKVxF3Qtop z6)dG2w}iSvJd`Ey1L*Q%=dW4mPQqJEPN2Eml0WL3b 
z-e$>WmfrU>D%%+~N7FX^pjvkN7zDT-D*Kl5ZEDq^rvC%-_Sg=|3&UcVZBqNF98oOw zaA|g$?=@({w!xKl$6TB?%TGhFkF+M|whIoazAv$e+5E(l6zrhoO0HAUKRvw#tYa!G zPrdLU+E1Tu2A|nP@1|Yj101ku4<&|b1-7^R(&zoil6TV-?+m026nw_^{Ob*+pCuax zLXgfKn+si4afID7_c|P(*u55J{#$MD;ab;M_``F;Me+&{<^FO#w%XeGeJ|}OM%~XZ;BaVk0Q{n6ag2WEdWh3`SKfSYHxr;yT6!Dag0{5XbE5`gax%S zk6#C03ooWtJ{J?L8>WIog=u-W34T|MQ?A5sJU=xNK3g?>Hq|hVMn>kY#Ys4Lt2@0g z<|(iq34_av9?Y8Wo%9E&ozfb=M>3gSbe#b-WiZ78yT}66rGLKrUfnUrt7lSaDbgDF zcR@g-S`v?3LhiTsw5`Z{wk||!2%iR6r{Mcq($e&DO<_%ICLIy=0tj}#T^!a%;7qK) zdHTU}tJy_Z;AiR^*#2iJYEAk})h*Z4_^)9Cd_8WckWEhj`6i(taPu6bi8rySTkOuV z^_@D~*>HC0i}F}VF+;h5S|=imHsboAhqLn5-31O9eDiBOcWu2qlRB(=@biBCxYgsQtq2AK)Y<@=I>cnvi+j1RJj3v+2wF10oN5 z@elAfCx#X1FG`qfp9B5U*iG%7>golOQqZ%jrUoA&pkY@blQ@sLsSg~TQm15~ zTNs<*Vun#YYSNI0|0SxQN^z(rAfDx>BI??|EK7Oyu?`|Z&&2I<7?>`KJGt^%%ZObE zB+SfEX1Ft^c1!#{HHa6mYPca&D=~n;2=Rw#5O*oI7tM2jeF%?09&j6k|9bB5OdTov zHzY+TFET}HE)q5XDiBvMuv__1ol7e8J@H_!Sm9~WK{3&y(wDj~l$5Dro2g$>io9Rm zS{y0SF51mKdgA^A93YqB)`7Fs&uaIg?H%XsDywLIGT4l8m^$TLuxa&-hrB+xyj~sv zo%^f=)~Pz)Aq*s<0Q^U|&exl3_u9x*P3<1@ncusj8}9qC zdzxN*SFRzUp0ejQLUAIzH!b9Ym~9lw6ojZaN~tm4fuF146S1e zS%7YwB!y=v$fDowu0V)kDD?NTFDvW1p(DT)KY<584B(I9}T5c8*DdxMM!GLk2<>i?r|K&~VmW25pOzcxn73_T9XFbBlPI+fug0OC zP-C8q6;V^I*k$a7fo$OO@xgD{&0K+}VzAqHyc&JJf6=ZwmvJ_ookG*gBlxVOp z%_JVMPr){HyOoQLWLyQ)AP)6jaIvLvF;ihQ2GWSVM>Z!*%0(}q#0-lyITFNBZ)gx> z?+$ppepDO0$munUKp|%>DJLA#F8^gVe2&*TC1BxdR1WkIfQfcFy)zOjuJnqozir6H zPG0N5viEvu-bwOzJCR0Lu(wQa-;3%#nMnKKGzke?C7avWx@xhz4m{qhMR7?x?{btK zX_l_gC)Tf@r)0*yS`)geCi%3g!nkj(>fmNEDW*5PXV#Y*hbv+67A)B>$XLJ_=X(@2 zX_2&XQ&4O!pPLKH4bHqNR^6A$Q_&~Ck?Z`@q8x4Iwk5LDdO0dNGh;jf;S`Ql{*e#ddy;#FGmz-wE0gLERrxI#wzi*F zF}VYuOD6~O+#8ri1}I+sju)y)qYRe41&*1OX8z1YRKiZwiaE|gh|9veAncN$HsYuRTMT*4%P_a$pi$UT-FJWP2Ow+Vjy+e9>s({_S z`*2Z0W=9B~eVU8Ry&LR~GICe(divzx@m%mLj-1@BexE9_RB)uv{1nN1hnx;&c6z}1 zDWO4?11l+OQ$oE0w)9ZF(gr^5O;p} z!Bg=be!chM8P=J}D4j)uf`>nCWcHMBlrw)em(So)8#dd9kvgVllxw5VEiT)TfMM8H z7j{Sq^JRzUA2T?3j84=z*fxK&bxn{LS&wc;sDALfw@WQa?os(1uUp)@L~YG17H>K+ 
z)*8e1IIH1M7Di8-%8iSab5lwiSMF(4bw@%kko~7XLwd6HrwClp2mGn4%)-)((ro9ZI1P2oQB!u)0Wr0GT7HOs7sRx#rO#%(F2!=v4u2 zh>mx(>oxXuH^~wXy)5q-u#g_N>h~B*)ZF&UT6fh6WTgXw9<@z=KEUD|ppZ_H!y#*>@?}*zDDB%Is}TOh{_5tMfCfZw1R7TtBm*_3b9a ziSn+fuCGWqg%XASfCpc9g~|RLG<~4xOQZ$Hp*!UwS{?U$a1}ETYb|qv0;uKg)r^d5 zA)7`G*I!pLDjx!;n=0jc+4Vz1(er}4$rt9!@M$!3Y@8QLO3Q1#{sH=?XntRbr8vg% zZ9IOuPCGYkZL->X8uh}2pBgz!5?XNSOZHs3+(WICKe0()mYG=aSA~Q=7`k6p-pD_O~vNBwh)n1GKU5r8+QtiPeo~I<^-nCsxrDyDpl7w9vN>5Rc8kxt0$+K_Ctzn~ieHaD>eog%Dc*F5o6DSw$NJ z3{q9P&lgr!o-U<^^h+B!2_8p)Ws|6u+cB4r=9NhRZYF{cRW_PkE}L1qxn8JDX0dEd z7@r3xNTzrav75^t&GNblapOi&sX(=CEKf~nUM8iF3zVy8-uT2+m%LRB% z8%B7+=Lhta4XlLwB?QbsyboNN<{d*9j#PhI+=Ta&eC&{Q$z`8t|B!zTb-6vGeAD3^ z;ScWv+^HG`;OcxkD^0;8J+zRPh5l?=jSe^=0n8RWcDjENt>8QFIWbet`>0CSMG{0m zja{Ohig@Hv_(j54&8N*cKYmfMvtw6D`B=Bu8KpY;daY&{&wS>d1iOc9jG z6J(?`G{BX9g(1e>Af=Vg*}pDIp`v&A4c96L_S6P*w$rk)LI-J;57R!|G zEo$o&m{_Gr^;YJ1!poQ^JL22#Sajm>r6HDVMtXonqo6c2(=GimVQZ~d#e@CbX@=8F zA?oAyKLFH_)#y=k_W#%aIEvNq5Kt*9&aBd25*0V-94$-O&WznM33rT!N%8m3W5O%A<4+~p*`c+oSp78p==wj_3 z;134z!>f6eJ9bEfc>o?capFeiePxEXrMaf%TA0sQRhS0Dk{nyF{WZd$OaM?^7UbIW z<&KM=htx^9&}UdD8>n)+_9@x0r591)c}2@8cy)f+)V1X2ccfwXMpY^gDuZ+TKiFk> z0BFmojl+$+Wt=IT4d2g>f|(oyL;aI{4lk*;%3`@CE_MP=m`Sb?G9jBvw*+!q_PrAg zs6N7*SN$lX?%aTFH>@==ezN`G0hCG)1}S^g_upiVqUd)t&z@1b*ou;Wrb%A;qZ=x5 z?=O)LuhV-}Tq=4>Du2tZD{84K6EZOUy(;7U8T~}-cGWBW8oua1 z*A3aC#eQB5;$>dAf%3vKgeFk=EGPz3AI&?h@2nXPUfoKW6WoQ&6YaO!oi`@>$#I`Q zUy^vmWHHJW4Ikf)urJD}cAA93jcq|!s9(_7krjU6aWuc;82GWURgzm!GckM3?w`{0 zbNY+;V|Gx(EVk^!HZM`a?3Cw!peOGJCVvqAvkl!|Z?_lQ()R81=gY?OWAK1(Dn;P+ z`h9G=TG^G^v`^>VMxoJGBIO8TmGJxP2bc4Z_!kAXbzd6bH7VS5XQl_K7AKU}I--kl(@;DRJ76%!9sO>YbzZ;uZ18DQ2@IHKm7CNb<;UnPm zxf(DxUOkICl(~iBB*dgGGD5WCxzYCB{FD5mL#J`7AB32pRy2E~@n=qiUw6lBwAz$# zdl#8dxJ15}b8J(aZs3h+(U{Ov`c!N`gATyM6%o52x*cL;y+*HO^>}Vn%8l(pCNUZ5 zWT%gl-e|hDpA&5#u6V5bIz0{?+U)FLyeY4^S={dEEL%$h2Ed~nXG10r2@8L`L%*L$ z60UfGF-{o1@@?pv{f=LsM*Vvpv-Nu(f2R8ms04##8TXYui1$e5HHzUaLtN@UqW{HE zY#LC_$t_0aRH|N&p{X^`S|utWYCh_6Ojo>Py1?9pU3AJ+`T2zr=hch{>YL27D^3ys 
zw@O6Gh4Re{(5a=>RfiCLq4a#l@P^6%IZl=uHF2_1RdI2ZiaB~E3w?Rth2)GWiWq!b zg?6lv&gisIrXn4K&+T|>Fxmr5UHuu@jS|F=oNHq5YrnHbXZoss0~4@GP8Kq6AiaW! zBjp2~6dli}lk9TrQ`)uK16|6~s2ad#QSK<<6CkB-NWwlE+?>P$bZ3iy%SZB&RUm z2>Ii;if2rm3I9sZ@pEp%4JhY>+0f#)t7lW@r;27JgHXyJwL&4GT6-b~-AVsRFXR(h zQ5fQme2%a7GXG6lD0~V$xl?Wb2;G6-?6{qp}uj zUua!$@&TW&n{(Q^U;6P=B4XX8a)o30`J%YXxb@8Jgi5duBL_CX!uCX@U%}v)Ton9& z?dDX8&KW$N>ysiz%~-cnGBBpzu_fEXk{hp1LHg@PD9M1QDEaeUj6NUEUs8tE?!Sc8 z>VSok!A9y(Hy-t;Tq*I}f{HYDFlmLRM$wkGBW3e2tr(V{??Jr2qdvoJk{g>* zBJ1P39n(9`DsFoC;e=O>Y_j{LwK7o-CN^#X-zhJFZ39Dzypy-Obe_k%DBo#s^;k;6 zNPOy#@Mp5LkX{S*(>EGvA|z*--`hy8<)- zF~uu#G5C8h;l{jUqg0D-3OPH=)I_BYWRKrNM+S2e*;!XYKkb@P&pQBTP@qJRcVND0 z@H(&ZA%!Ev(*nUnT4TcWQvI*=TDm5!Ni)^4Zwz=GurI2KTwsj$qF)0i+ zp;UsUH{&;t%e2hh!t;KV9fKFy&yn-~x1*W^?}R;JpeL$7)>2^=aN1a)4^GYe1HiGF zbp_&C3n}&0%@Fw|21oU9il~MDM{*U=eYUfO_241~aNTR(sLyT6O_M!QeegPl0Xe+Z z!T~;^H`@o9#D#0u04ftsNJ=WJf6Jk}LPvyG@Lpv%{sDgfr+zjposy+NS0gbgg^qmv z&j$x6Q25yWbxeN=*zwtm%+I8dI>p7O#&VpFGwH_WiV0s?E{0x@v?W49h-W8V0z_ls z_E}xcG~NS%Fq>d#{!H|e#HEDG$p_D+QjsHN3u<#a4&=0 zxq{9I7m3ZEx<|aHAI)`=o{&d7a=}9L@c+ZdsUsYAl2=n7_=W*Kwv*X;bsYn;b8^$y zorY&!8ErhmH_Dwh?UzQrszlzO$nj_AzOUPdu*lF4N{UftjN>xbZn`#RoVtj8mRHg% zp&OnISoPxWB?>^SrFba|Bk>RV#ewT}$&B9af9d&mcH*3GN{y#&zBI)j#-)*;XF`An z2@MAiNeY$9kp;ea(kfT0&n0YiF_o*JxsT-5=DHd7?CP%%D_W0Pvp;a`ZQ|cXZK}}& z{J8uGT5w*^ZJ2uaGuc)Z)MPBL|AK6$U~Y5`NLV}@B-giu8}isEJmp_QOGmQLaITwwebugohp)*OaD9QRkRPMcg=v$fCpQJGnAn!3_M^M9 z6X7+hH}6OrXk|lk+wMy3X`vW4D!iEg&Vl>dz-0&(T~Q4xfWFuHVe9V$ky>NQj{gss z`#%8XC+_R#K^*XE6Mq-aa0AM|=AD6^O4W)KxxcHp%{|<$51cp@6-LzkxQ*Pxbyb

fqVur)!^utdG}}U@jlO$4zfR-yhDoY{M_@_U@`oJW0ps3z1UW9TYDAqs z=J=HOHEPlOptXir#Ty@~%bWCETSnOgGf_8aWauO1{xE>#aR$u)T+Iyo7D7l?FirYQU`#8`$KDmkyCV86> z98jd;mtns3?Cj&ReCBOzcI7NMtLUTJv!WW8s#H&=EI{Z+Om8}oR4P`zfHWMs);0-{&z-+hp4e?;Ik8;E#78=7OOyTcnNGuZVez_C;8KR&*8RuBf z{n`|D2c^Z4#nZ5;mY5Cw?IxUKxU_JWdhxga<+H6hy~&oYWp``!1UjE5I38KFUz(J0 z)fG&hS%Z+~+gQFWIq6x7XJ7h`GcGyGI zUx$>lT`3 z$7}TdWnGwI{eE&cY3pP>bm(+R$WG~J)Esd>iHbmmq9`Jf;L&^fBohX9OeL;De3qx; z1VXB)dG_}4GL<$}E&)qr?E2arf(ciCDseQ=)NuW5Ry8g%PaJ z#~x4jn6>fhwc>G{JRymw@p=CFO%2znv;4(;v^NyUQCT08jkv1BnE6(^UjewVH*pHR zw3drZ6~aDzD^P;&-qJSQ+Z^{ixdNU96q*U`0*AUbv%H4i-8wd9|CE{K%@V%*1B>3u zChSO8fTZLf;hi*MOGJjN;qAt{eCkP98$E_Kx~{HDWpO%}Zt3 ze9&Uax@B5AUAO9jfN_;=hILC7;bpd{syLo!FO?m0CjG%%nYtjtodADKy37_rv3IQq z8hY^vgh-HErs~}2LZp-v6xmA0wL2NE^GO}Tk zR{a|y?k9A6fVNq_pZ3cekWflOF0ZMS1B-v`VO=-3>ySf~z*2)skW6{^pLa2N>W(zVKv)tML7nd9HB%PcqlJXTu;)K3eK z@>VRJUJvG{;n6KEj*Egp74t`mvO&I%nrMkUTw;a|XGT#O@45*2o9`>Bs4b{*g@?d% zJf)PQ0UXXTX(bzVS0;<&dXMQT=97+x>gM9X6v?b798;e7+-EcJ%w1%6G?#ef299BY z-^g2h8)z(#Nb%Cqdn;sH6$3mEnvTyQYHD}_UmV6q;4uj8zGWU?wDV)M#+7GT?~%+a zqfEi#rbJhEGRC2+zol2$k$IUq0^rUv^$vO^Y*Ah>^hYJ|DyeQ;kf-au)RcyJ)agyh zkYGSDX;(SM4l`93=6=+hwQ5M>KDz1k&k*hKZrT`zwvcZbzu?*EH1OU`!>Bn_x*@-u zbtj-J4oiN!P^~ZryG)*X-TB_Jwq&BpdR9!Oc65=!m=PO9v)#7KVW0kfs ziWKE|iH3P)OdJrz(LfD>gl!0EXQ4s?4E!S80vzgdnOZ&0W5!q3r4gE$Y>FT$1(#+t;CB7%2PQ(azKJ7v5^WhWL-(HlD45GaAhsxS83kU5=^X0 z?b)tPc~+HnnNqv}Gvd9yPgPdL>*KikMu%|B_RY7W_6uzl&o!K9?b9S&+3HEf{>leW zG}$1|kScgqzIJ++rhvrC5rpYTrBBn(h1fi}+UUy<5Ag&d9BtdzV#$mI0)$ebl>NkC z+HLBUdzIC5&eFEchwDF0=Q4ky_6Sq{;k%BZ<>WT!*Xh+BMnHIN znu;;XaW1RFQ7^(l8JGgOmH{2CK zKxrFQo%6{ldG%wH4-%Sp3|uui7E!L4M9VNfJfMM@WSyi&CwbK=zT(HfG~mCfK&8i= zoz7Z3XZ~aEYB7do5k4Fiu-_cC@%ylD)%bn&TecOErtsfb>m95j;6{V_TGE(@9&NbF zA3mUEqrGC-w4*t$9B%oDThfbdDN|J>T?E{I`{q#Qsehwk`=(FUREY`sPj(@vbi@$B zE$v_|8yNR7aEym40Xzdxs~(JDQ0+rkVcZPhI^@$0G(!W&&+QimlJaThSSrihae1;$ ztUYReW8CC(fU0VzWX^fEs^?#$e=22t(=@+dCC6D1OaA!uQ8dhMW2E2U7dV2Bq=fFB`jWJ}?p`l24(v@}}Fwt_ECVU~Jx 
zJl&B8<9?#D<~xiufPwc8?~DqnwW+EcEmk%R*lTWJWUD-_cG~j~x{*42Fjolf zQdRpsscaZ|OVLJSOa1eiL21{3hlSZyW~s3@W%&0}t0h@bd*07f-9rPTYAdjTj6>N} zgRUbA;$P5<=AXNK*)n~dXRHwcdBcuklP_!UBi;-;;0M%@@CyB@qimjK+LUu!tazrX zKJ!&D_YEm-bFYclrb2nqz3hHQIn*878?$2fP1EZwltxFiAgwhlQv;g%Y1Yg_t{Cs@ zrNbuZyN$7if!_$Lc3ZGHD{4&6Sce@C)tKr_ayB>-nD znX#30eQDQ!RYzYL1C}{eMnmZ-Gg(&8FwRd6garKf7EU2%IY)m+1X{>YGvH36;8> z>s)q$^}nRrw;vifJG+~i0OCCpM!t`<`sA1#8%8-Q@Xn5vse)SFo?{RF-u9~I_S7BC zSxIu+wkKC4uc*rLn74GMGSP9@N;{OG=y{=CWRrMYo%gzv8_($V{3LU%OJaxYwwRL3 z`bP@Lr!Q&QUXy~ZxQ#r+77h;$^!+HKBAELDR8b43dWc;vSC2r_Xw8Z{rIUYv?&sKj zDE_l*Y5K6XfhH{p-`Xu)I<(_2R0ym*rcFMQoEqq`jy|@-u96fP=g`Pm`jTUwcp4Su zJV=nm8$LXOW)?{04_C)O<}E&dIb{1Fi90}~cP>~uM!d!(%n685 z%RDRD4j-&2S&=!{^k$Z^?xle&1*)$d8`xhOA4-LpfZRmf=?qwvU<(BUNN4?A4;kxi z`CCVW>6|}?dOo6DRjklt3y|3Hlzl{Pr6o_&_HR*iKv7kb`+Aufv%0`6^Q9t8!P0RAJ#`17B&fO72w)o#tX|%@*bxK1c?~ zivCJdJW)QsKB*TgRW5;Yduf3k;kn*bOz5xze|EWWt>!7bXGE26FhGWZIo*~+XG1Y$ zCDnD44-gUSPz2C-b{;hk66a}B-|e^xh6|V+U}E^QU$*`^rZxjl22~Wll|B$9WxScW z(|v);V=w#g!*U&_Sxv(fW($OSW=7K5o0gTffc^a&1@WaQN4_64WB2U{vU4AeE@Z=o zx5R@K6?q8IZ3bg70m|TX8}9l0#&y@W+`KCoohZ_$c%G1InnIT5@x6N}%lzsc0W?`kn}` zFGM~DF4q7#-%pFDx9JG!EDY7nRZ(g!#;G?V5ArJaW6AzWmNtJmo0mZ;bn&{|F4eCc zG($*4H`=fzan!qruuu`DlQNm_lq|l;zw(jYE&r+^wKu|K&NC%#s0ymErCO#|Uv?h6 zFJJK+L*&bB@In|WtC6FlkYpcw^j`R`AH#2`VR@K*!2Qs+ox{aEx`(oiU1O5N(0OqOjQev%g5|R?XB)jpa1|T8t-R$+-~E*0 z$h`wR>Iexd^s&b+ybYIddr_*|{JW{}@EQ(J!WG7;I}h=DJL_ZUH?-GZL`j92$%Id-pz{t2A-ZMUZBDww8{&n!``q>()E=f$O zcW*FvuXNJ9wH8@Gw6o|k^0#_$)Z?mD_t-}G19@FQ&hRSLu8wF5p@I=?u-G&}In zUbxskG7`1CXKdzq=VYxOoo`z?;bm5ml=R%V+|<1f@lsF1!qf_;vh5Nh>i2c>u zCtJAuRpd8wwX2Akv83-?QRb0>_e{!Z zgWrD_f%y1*`*GW+9kbD!(-NEz1*uP}xnRp4TU~l;2g)j3iV19RlX;)O> zrV+^6s_?sW`Qm%7i~WpvXhaAMTp_EU&}EC)lSJO&p)CSREbo28g_sIT-&D|ID5g6f zDKYe$cMH4R6cYzf_h6UC6qW1gdZO3X<>R`Yr-LhV(%(#HZiCOPJ!S3Ac29-XA%Fb& z7Plc%#*8yJ_P8rY+M&w3axuRy{E~L6*^%D=M$^dJFk_1%>u+)-Bv7>-AN@v~;QTif zi(==82-mL8*GTh*ETdUe)<%R_>l>JzQHv-wW>s+$*M>1`l4Yl_)#|O4@n==rfeD*< 
zgnqX5>cYUOX4P0_eKOIanou!8;Je`g)iCGGeiSaaxMUJCt<+9#msgvnMV58P5%C$U-gN*O?>Jap1fn9>UZD zWD9mBrzZE)8^4+yDdiquC^hwHI(w9^z3T4WE}&`;GB$8B9#thRd8&EyPYrJ z?4}4gH`xti&aQ-vyra!8DkKWJMV_wHn?8}a6)86jg73r8-2f8Iv`V6nyW^u--!2#TODd2xE`pf>S-`|P8G}u;r__jtn zDkUn@ijZw@!8FSL4eB%B8z*yT9=i0L!XVaxXbVS+G`z^@g2!2%6BdHErzan9P%WqtCovUF{{pSd5IMe> zp`c~2-`_!no7$UOooQ|=L~ZhduS(AK)h?#JGV2|G#DuPwZUG}R2W%>lXoGq;GreQx zz(w-&e3{(XrG#ci$3ttkWeeX4BvJQGubKzQ3xBkSw=hz$ZRemAdf^`wq)Vljb=aBv zVqLb|35ag`eu4F*F%4eRAfniW3*B!UQp=G1PS%#hdaK9>dTdp-6@Mj=D4fWkhI#~j zV?vO?A&2YYJG{K@>yzYj*WWR{d^|Mp$B zy;HsSkC}qn)PR*l7skmsuD9yJu0r*Ufatu3jY`{M)1Rx zc2QG#lN{9}vR^ZAMk2tLh+Y0!GQgD@!ULc4YG`@Lb#BHh=Bg1!j2LCks6Az?Y(R!V7OW{d_N+_^NAdY!nBN-xSU;xOjT#i+Zd zf{QX@Vdsi`nvu6au|j1QayySm5EN<4yK>0n@y6^YN% z_Mu`^tQ#YUA5zfpImB`3Ty@s^TNUb|ToYMRYjW79`WrH2osQv)!Id1X;B~kI?PZPc ziRO|YJpDNC13dFuPxC=@)bV{ArFR;0^NyV=s}+2rpMSWOu1UWZypulAXJyFfY2#Uu zBV9G@kBYad-@|znEX-C)lb)o(P1Rx4GXh??_GET2`evI6u|G)_N>_{;AdJ|fxXP?+21l-<@}P2syajlRP5>9 zKa=)pixpzW&O9bc+Ua!)?_Jn&7w2p?f5A zyxd8$H7?w5&f!vGkj1h#vBYE1(Gs?%;kj2@RQeAf3*ug8)wQ60+Gq>i6f3rBVn!zcT-fPY?6 z*vh;rO8q_wbrS0my6E$@v!>GS2oC8yphSGw59TOpYUl*f8EbD^cBp<^gJ%?h@?Fua zYQEc68EAlC&EJD#ozvH5q9XT*V$;w;2$IJMs)Ok+*aAR20wWKSE8wj(v)#ndSz0hz zhP|f;t_Eh!o0WzD1X4^Xolrw6`B_|^Gpdkp97I@`Pa28U`YVU6E?40PqN7<3-d^(# zp;$Pw5FaC@PjT8#C<}+W&93Hlv|u0UcwQHOHdFRiMFd`ImkHU6_M(meSYcs9R-r@+ zR@$FTuT!`|f@ZLzp=z?4VvvwMS$6#-Jj+Eb548U*zNNt3qw&?hq*+b53y~%^2QmFc zHq(q=2W97$K@!I~VTy>1r#ZAo7TB7HMBjt!;0cWqkl1<;#WASU*vzXIzv@*|5e|oc zs|U}nTQA~$Iw{L``pL`e-+xH@={FzwDqU6KYNc&+07haQ*W25&f0g{aoE~FjoONqV zc_!!Ro$%Pki5%HXQg$mOoqp2!YsfOPC+^M~iXW}Ub{ITJ5Lv1%OZ8Eiug*pi-<3Q> zzbR^vb((OXCC*=`Kc1$Mu3C;tdMwI~VqrzM^x%5(ZdPu!*YqnpMF;Q0C2OpwWR;}PE!Sp#?IzIwopVMo5YpSs_P53^p` zL|e9pkH<07XK~?}3n!6Z;;ceSt zmdd1Tx5&+AjEMrZzVwgrzs8JyY%qjd7akk}P=46u*R61W$EFzwv@( zx~^9J2|*$OB}D(#iT}QG@6Xqru^JLfgb%}`_1wXnom*M26l)a@Oe9y$oIJc{WRvwN zc~ZKwe<}U?N-l#E;^WJ|Zu#{y&T4kLoC=JH+GjKvR-1`A_jqdMYUOKznQO+drlT(z zKfr*k4DO=1K4N_6Zu~*y(+zPpG<}^gtc$+y(Tx2A`C^{kqdJ&VJ>;z%W#)Bx1lVg^ 
z)lgaWEu{8Jv$H3hy;EO*;<%&k4|x;Ca@j?8n^pE5e9-uV%zz&lRD4W~<{nx`*tTjt2?&x5^Ttqw=Jkt89$ZHc=WryefYCjS}4aj z?}HTActw1Dzgt|ZH193kE-Q48mrI~j8u)g zB}~)LIG<9AG@bVk&koylB>|jOCs4b2`SG9gS`<}! zZQczUwkyV~OJ6u}C=cur0C2I<6L>Qd#FDa$6ONqaZaLVGdlq;O+Ai4*7wFdk7;Ww-ldg z;<`oL_nfHoF~pXUA|etRn7epvNBzq2unGi5YQ(QwuG3SRYvat8Qz2d>qr{0GVTr5j zR{E{^=)6GM5HG!`T+76k&FeqmV3wPf)qX=6P6%RSGMRktfk{J${Xv~?Iw*W7Q%APs zM8Bm_2Qm7pEU@bgD@qeRO*J4q=plCZOL5&0qVL6gE@>w>=^w2ro6Ue6T+W{BYX;r2 z<-=-aJIxnMox)FF<`Q;H9zN}+Jj}$FJT8<+$T}n%s*V(E&m>xqg`IJCV;JzCV<;7! z(VVal$4c*yKwkx0+Ox{xJIYdU-<$grAjcanK{hpEA4r&+?4zHUW4<+MHrlyA5`XEtu49Z8nZ9MHoHmQ_mn zhM&S;%f+~uuF1$gr-AzL;5W=!?Z%mJYsHM{nmk5lKl1}fkd ziVNqFtr+_byj1`2@kw9v7d3)=h9;~x=F-U?f8BGaig)F3 zBD&v7?W)+~{1%j&;hNn1$u(Qf+kDjN@*idQS?xvC4ECDeHewz?G0#he7 zCsstL(w{Plu(!FNhXfJ{0n?Q-YOzcW?9qza`gYz$;4N?dn;5|r^Pj^{3o7ySB<|3Z zEr@B#4pRlOCC?#z)^q@2e){xYUpE2rMRM>FXPmLC&x`~LfebWD@>?Un}Q0R z?W&qdt#z$7TS909gY%<;1R0EY_u4#ZKbm_*{+%P)iLAaOC~5h0SR75Q&;)qgPVa?` zZ9N38VjZfZx+7Pp3f$(HaQ*`@-RF-~@`5qmpUYKC?;y$|({-2ZH7m!Y;3P?}1V|Nr z9~k))*ryOY#+AV7tiAXy6$Md1)dc)WBQk3wcE0FpoZ($wnpPSit`ZWuksAs`LDecg zxv-R{K7o8qc(^UB8Qw}AI^Svg|IfR_?q}RvzMS;*8+a4d5)%Lyb;ETWUZ35T=@62< z)+fexuiom_)_+HqL6$jS6pv{-RHk=BpbVmu8sz1yP&{&Ql&7WTN=UFl{1V+Cz@g@f zA-19zIhuL3gqCAkeODGm0{NwA&2ZARt1MUhRvjtc3&0oxC$nf?YKx+}=xKB$Hyjs8 zk2y4EZ<$46>CwdAF%pwlylk(R6K+3kB7Kid#S|HsZb#EHZQt{pyHql@7-$bZ`^)&b z+h*q)c2gh)l2E5jq!o_l)0$@=t$nZ_5dORyJ5#AC_B%O3i}GxQgT)Eha;HOL_@Sa< z58ox$8^vTLj-dcboJob(7@V!6cBP?hs>yC@N@7A-8qXcNaymut^z7-2ke|LB%~;N{ z{z_oDy6Rz}RXvxws@=lX-o}H(m_+ONBFu2nY>To&(~yXs&)R(hm4GSz6sq_3Hgk4k z`W22YT|SY@aekU#feXOKrZLdtD21 z8q%M+|*4(yY!krPml6|~uw z)m0l)Oznc?xB$BSiUnir_D8J@t=;0CmK)GtNku+f!b=`}3^Y*Qo`}>bjx6}r5P0Hp z%OgFR#L{`XvDNk|EtHX1*d%b8)Z$fYN}M4#FsyOlSSSu&DbGA>UP_5x__o4;-3^y% zr(Kbw%K`-}SDT*jyaqc5yjOqlOJb{Vsq#ijkD zdbo3n1{DM?mtNfSu@lMj_bG*ufjZY_T;58=wiQ^#A*t z#Rsm{Xel4#;sN|%lzP5;Weu8Kgm)a7#;q?5M4#sAkP z+-l(a-HGG6BM~R{ELG2LGxs~c6p&r0*#y7=ZRo-qG5UD#X-vAjiftLabapwcO5gm= 
zN#qbCEaf+FmJ`-O-XTy{)OtCkC!82ONnu4Ur*PN;Gw$#G&-2w#V_OuHeq%dm>dxYKR=2wrC_XgQ6j z0p~59{5ydPT~j6h%HaL9QmB1pSWM$T%TW5`b2O5dmrs+bmIOEQKFH4fqx_}GkJ8^2 zF*f!q&`NOwMyTm)8)&apr}0^XyDeW`K1^fmWy$ngD#>6a?tcI=(4lLyXHVXa8(B)( zBx1h=bYcC2cLLou0ZjJ2Bp-PvSF{DEyPf-jF@{f-9Q^L`R2H@(G24kdD0b23i`!3) zJ5mbX+!TV{PlIJ*Q)v=&iVuw6xgQ;=sULFTQ7wH5_z#e@UeupabYuT&J9ecvU&+cp z34`a$EESSc@=T@u2e40;T#ZlO1Hk=@JDW^fR8S$dp&rBaH(HVgba&U4xos0<3T${e ze#d`X(*BLi+dehjmWf1IE^eQgp(?KUCMAfZ&^!mltdug<2!BE|N*mu4dOrFrEko`@ zSR*XHK=NMqX=*v#N**P1jl2Qp#&$byd}N0v^4i`!=SVAG{ZFx)zzIvPoXO8Z%F*rP z=&Dj8^AGe9lD@N}g(HLpO8me(_jd^(%sEv~#jh`<;Ct8yI{Awz`c>rW{ngD9^=huL zxjc)={Lw!^3Z&qH%OewsXP4pnj7r$|;#XF@=hA(V1)z0`Q69^uBEBo@{@zA`MfZgE z9SdjFJBVhOW|Ehv=$OMlZFtbYm8b$(GCtN1aES5iv$(0MWNM5uxCkIygvUK|KDDArKeu3 zc|9t)eyC3eCKdtk)h@^`?>ioHav+yKb_U9R@zQ83gR#`Kh&`=SpCDI0f0+F)z{YkP zA@r56RHz&2`DSwCF_p6ePERhx9l^DtUt~!kg>Oe_e=U49)79%8E(BLUP5?)64T06IDW_^6dFaWF@v%8Q5Z4lSzk8?XtqiUC5)T!s|rMJuvUXzEL+xIpC(h zPNR@Wj91AFs&x}Ih6q*u&Q-m0X)xb(xsh081g>N#?a<~VIo=HLVJeRM{kA_9JIqsV zN$#;0le?h)rf#o0k@+Bnv?T=WU z#!<2IZ)uD8E;R_1AG|;DgdS+vG`HP`x^k3N7%@zYOAWg0_v8Y%tQHje=mq`cZ(Unw zzZHydgq`|_3u}Fm{xW@I$6Pz6KSzEf61KosH)_Yq&5=j@rf`Wi`sxh*S$5hZ88)wF zb6oj9t;uLvF@A#S{eenkzJt8W7trR>M&T#Vl>H}Geg-&0VzSuRhv|!!TW(FISVBJ< zZBJH<#82_zLUcJb@~V!L4eP3k&zCdqCrWbdnM$)KmNl&nW(i%wgz+REZ&k1ZKj**s zPUXsFtaKr=-VrPgI!MsJ&*;*A8n!vP5tKA&afJXKp-8VpgM6o^g27WSD<1BTH=_0f zv@s?D;jz^Zvrk;!`HLfj>ebi?M&ZNmdsJqp|!5Q8^Vi&w`k>x5N0t`J&zdCDl+RQKgKHt@X3@#|y;L zk4$Z~3Z*nkn^B))uK6w^V1e7tvxt+@sp_7Ib;q((F5)EQhPa-FSN;buaSar{7WBIR zb1nLObnkM*$(G4V$XzT7@Sd2e?Mq)?bwAFdz-gWHuX2d}$iz)E?5ow|@=Z$0B&h6aX=CCeoTq)nx}Q_vp10eFQ7J;tA>g%{2iRRKe8F!0}4&h70V1QVo_4i z9hQ5XW5`1XKbDN3Xnau66o%)92WI<*7w79Eic!02)b|$1yZV=H2GSi<7B7bnakt5G z)UgHr?{K-b=ofORp~EGUiAMxu9@rT2pA#0d3f0oKF?K9dL3>%!rUI= z5x_$ND%wCp=YYC1_1>04??0iw6y&nIssO-J$x2L0Q=EFd@%$@()Xfy`{% z*aV-@2F%1{ObyCng|r6&1YLa)wSPMa&E-!~W zZzinUjGvKm0_eDSrCm*5&WQK4SMJl;TEUmRktasMZYk_$II$w0yglQ}WZ6}L+VjfQ zkm^wP47D;9hViUroYAIRBF#XllCN~3yv0f>a&3yO;HOEJwyLFMidlX+6%y_J0U%*~ 
zSx~5MVeZs=LQ|N`7qJ%E1vUBGA^>eU?`R*Q6(6Lbut#`^s&Dgd1mU~>$+`XFgIhT> zU#)<6H}xN&IWXD#;4t8K^SKFDn@ZZV-CqP#{a65lS&~)qYl5BenhGg4*F+sdXD)VG zjVluJ%-oFjXXjg^XUDz1O)$v&#C1nzT<3g~e{ofBYH zVL5ekqhqx0$LQpd5aeG0v|dMgclauQioc@`iNW0Nb)-hm+#EKmlFFt~G@uUGd@!tO z<$ktIY16$kQwXw3^6NCwhtF37bX40~zV6utZ)~EGXSPBm4b;8D(YidH-((MniANAB z6v^@0?O4Mlkw5-SU@A=Tb~umfny|lnQ!zkxaPeJc#FU@`UW;}5iU$r{Xcwu<*q9Pr zu++v6u1YavcbkUacx~~gm|Z9@CeaJ5FdAOL z*vp!)^HZ3vB^4F>R&J|7WVV&pVD!0ymWk zOioaAX)JABBd`xamG@0g0&`T!e|7SLwnX9h;bh4& z)(d*Hv6P)c$6Cr-`c7Z;+m@aue%KT5p?oETbc6u7ahv#O&1UE0gfE}2OxR|`3oCR3;*l5PzHRxht332nZ-ZT zZ;TWr-g#aw_wWQZ(ANCCeej3O4{sAug?CVlH?DE@SZNesy2>oht*%<1_}~I|(nfy+ zOV;3uckOFzwc__RtaGOP-j`4iqFb8VoXU02THhF$hl`cco6eJ}pNwcJC3os+yJBB5d2&Jk&| zOQKIQ9B{lv@3CYQAQvnhakO(kyx^e(c0^HsnN~^_z$$ zu9sf~PusiS?m9mMy#GA3AOQU94i+me46$Vv$oQMNF9w+p%nKYSsz4%19P@hQc=p9D z|6h%nLt3fD4vB8~pBIF#Od9siFC_Xsn^lxS-75a8{qJ9iZ9H3qg{SeL(|7)|fe2Lo zUr{;S?iCq;2H#b>7+gfS@LT1U7XBjt3ViNB@1aoblR5r5$4MhXquD5zCPJyN3cL&C zV#1o=agx&X_dTTsd9ljfsA!vgnKqbSbj-o-+M-jnL2$CCeU7qOyKgj^ zo`d!i#~8KSMjhX4^@bCoaeu%q#N$Rd+<;-Mjk=?KXp5!Z8i;ERuXfl0Ij9B37CnP5SSp^V=T7UAS0lIaMf!f2ouq5f8~Iw-c?z3E zVRs~>gc;#t53RO0-bm;Vqz+F}^m`4_;`-NxM#QC)DwbGAiQ@Plp-UMUjoE!;HaoZ+ z(e#~Bmt(bK{nDgsLFVs0mKyEE3U&GHHPC3`vDv1fI^%G*g)i9^MG9KREpS%JRKB;L zV5t25vYpyI(jZ4P?gAIq0lJWd32-e9PCL*Lis!&Ubxm(ivKSeG-Ryao#?pn3XcGye z1Hi|<@OW^tvGL<;#QN(;xc0+iYmCwlc33RhHkI2R;@wFF*fLUgTeOd7_+Tq6LI} zhYC0q6;&LYHP&4w)zaf=GNA<26*yV3ZV)Uh>u%9CjhTZpX5vSEeAlXdH=th&wQ1P2L;#4(T)X?T-SK6|ox300 zlAi2D{=ey`v;MESbIY9LLKO2Eb`Uc#*mPN|b2vD5KKgo%u*&Y%O6c3#Tw{NRd{!7T zc9-BD@TvY?9+Z}9;F8~3{omu5mneb$--B}oyoJS-Y?@%Z%q)+ym$odq^Bq;!^Kb3a% zOLjiadb-vZnLQ}xB!{C(NCuV@vri7nbYIZg3qHuc1sV7i#eY+yPC_~|j`q@#vz*jx z7)8qP1h|cJJz^*saQQNxWuk}HCB+36=-YvUdgaYyq#g{#b>YieA*+T*?@`?<+^!3K zk)z4&Y_|t5%7VAVxEDUAZ$D^+tD{NE?};%7jQEp{u>Hxa&2n_)4~$`*40I;%^v>* zRe!gPhO6Lkx}A+_aF$ef|8$lsw;ukjy#xbq78!08eAWf+0oetXyV-lGWc#{~)Q0S- z;fS8WUwiqst{WhQGyhrk)d`d=O2*z6BU|}iwiEgM3u`r?AZmb?>c;zzY@ZfnnoLrN 
zR;VmXn2gzXo`uB+nybQp>Fc6ns*Kn-x93#g>+Zwgsfj}r6zJyvQoXy^xD9HKh%&d7 z4u`$nxqm%BG5A_AYOY@?J>JT(-e%sXQR}H#Ek|l}ITDvA)<`NQqBARweIHnGuVX(# z=w?I^%0ai)NLg*fR*~#fsK?5$mPVs|qApBuve?Wfe9QDE3JELl2PLRZO4VXfQ1zO*uX+;WrvB7l?E?QT{W}cX-aX#^-K6 zzX5y+*UZ3C{|GFORYcALQxv&xjItq;Zf*9&Aq{G=g*b5;Hwk{^IZfba;B=Wll~6nu z)%CJ7Ti8Y;8=m{JGEpZzt$Arw!45Qc%Mn+!`1;$w zxg5m3Ddzgb>FcCZN3DBthj*SWHi8Ts<{PW3c(?YyGWCXNK1grX;te!FPU4vQz=R$) zRQk2`ptR59*x$+|d#Y?!xMKQwLUxew`)=G`F{Mo<&Rz#ITloJ&w>^oyZuw*TecP&h zpx_I9d`f(npLE`NDQm;U6@I1gR}>bM(K8xX5qy{FMd!-t`3Zu9!;)O>K`K3FvR41r z_Ae1zgM{(T?Zv-)LyKR9!@L6ySes1JEkG>aS@I}6-N?a--@dc!mvk+^0QLOakDBY^a^2-rJA4G*6xX9#}+eWh9KCVsE4^sq0^`A>kB-!p&(e(#KPSC5Ocu<1YO0(yl zCEk^Y4hF%tV$-nRJGMm#SplVzMgh zy;U?(4jIq;ZDgDHKx`WV!Tb9HVv$dt$-^pjmO2F;INvAx_U-#68;T~C(_6(dp#6!> z%A)quY>*X*uwhk%BEyrX4&fB8zm0@$@t#q0D-QNG3aWe5`7OzP zWw;3alo(WSbn$c&FdO6EcF|V&{I1v@VH6GSQ@k2w(Cwr%&zO6`D)*O7PhnK$!O%S) z5Dn0pv!&un>6QZa)K()8)PqtRv3JWXm;hc!Y60s+!55A0DAs(?^Xo%fUI&%h^)rI~ zF%hkrnewSkRFZ&RX!J~<-;SLRLo#uWy$bO!*!LIEoFCU_;7O$CibJ&2y;Zw8aq@Wb z4zas$qiCN((aiEm(E8JyLtph_8)PohQVhqx4@+>xSks{Cs3;F^X({+ET5$o6M?-R? 
z_s2kqc$GpsDq5-fEu(aX3yMe4Qc=n0$3zMKDc|B(t*veBUhAexQ;CZc#;3vuow(zN ziDHKTPvS)`W%K>BTFk)oOz(zePbk)5|8z7F)lOo1Z*Tn8UFceN%d88Qj)pX=~G1UtaGu(;jsD;^EKRz@yYI&cJn1QW@eNL!9mf;>;Tj@tHe}DOfgkFGO7&&^OCEd+v`M2nh`8UGMd{ijZ zEEh(k8`)@Pjf3HE_xdSo%|;gc4PX`O{e1W(+}1B@CC~HA0`%fKAB9+LWl3OBK?+t< zNSDFT+I-nn^T~^P>mX-Otq**e>q97LU&IK{K9c8_v)rk-KotF?LL@K%y)XrBN>~e; zDAFN#REPg!8MO@>u_$TD*q{tcPG{{S<8Sorzv(|!)o%&-rKkCG1upv*U?Q9|xT?P% z+$9?1ix1k@73CxD6=PUFB3_kDAo(mw%W5Sb$Fg2TvL!Thv^2d(f-iU{=ZVGMgOTq+ zRd+4Xb7G?1c3!X=e;ejeT_4wj0WhpKm4@tPnL?g)!g6Gum^5SM9&)66>OP!UE0#=F z4kp&Kyneh8FWwY+sAE%?F})}msPhcU z20V)$aj3Tttd7o075Hzv3;`AWRHE?T1ZBC3;Mgg5uofz+l+LO8W-X*RL{@%L`>_+U zANe}wTFoNjh&_!U#ZR5_QPLIHG^YU95(q=kPn|oVvtBWC_$MwA%5O;yeXUB0XOMbZ z$k(tC0xYVCT+DpVVY{uw!Fe8%5;t*J@7Vx7--o zHeTw<+7YbghaFDpRRBSbi%AZg;k=@0gH6_m$GFzl*32Ql>Or;9F(TBYhAsiEjLo`d zQ6SRc3%te_d)dO&r>H{bWlC0V!>CnkK)1CWgd1f%m2ou-@p{M-mD}jY9&-Ai*&hH|)%AL`Ws2E)y2pBgCQ$I&QDw{xO$Q2o5lbJU_o`aEUn zvWTKRMDpUL`lTpf*(As&ld!OLT=e^I?UTO_&bzqmA4)>KA1SP=+gbhi{wo*h(*M1J z#@m1cW3)=nId}3*QiB~flBaTGBT+^B>s{CEVivP3e-^UiFJdsJa)qBeGmxrRkK2mq zOx{eX{{YJWBWMw%;v_^`O`G2#ssL96#) zCr7z{*KTv~^rW#5sc0+mHW+ZA%J#v@6|1Z391Po%V`=GV_8w^SS14LG5DZu4(nlpv zp2s4eM$w7wO9RJ`fUnAVpC@lt$v1!AhvICWPN(GD9>)s!YF#CtnvXW2dV2Hyf=zZVIM7g|(-EOBan* z$Jg^6Utz6;!}-HN_WQBauicx@uRXTR63uJ7R)-)b?#e~BA4TK36gd9DW7;unWhaZe zG!CWX#FfMrFy zOnZHcsSDJ-%h}nCg>rp7sp#GYoIiIYgO`SEhDU5r9G;b+RJLr|R|q?nFBcQU5pdRNC7&ujS5Ome-s$M<|7r3zr1!p?H+XzHeCMBh9o<;g;N*ZgZ36 zQ^&~ux1LiTa3m+ckTcK}TF+kUuSb8f(%g_q@&I_nJ5YuBGHcQ{j6pZTamD*f^bGW? 
zJYd=n3UqkSO@*1>_$IaZUPuk_o3`T$7|Be6HDlR5_wdtrds3KMOuyjXV(XvBzS?y- zIF8~({}CuU6cqH6t6utaQ1Ev14@WJU6Cid(U8>V+9~FF(8PN$pYtxBz%ygH>XdJ&-TWrY=?AEaDac@juu#d?Lj^3k-EHU-0RN*5`p4|WBQXl3U z1^mWLj>I4WV@cuwVPF2Vpum48RLDJOC1(>KaG)AcM|on!-w-UXQ12*;h$X_->ZY?v+ReWn%7;3Yz7I_eAEM$i=)F z?Azuz%h3nVA{*GMeZ-TCw(+OxYps43OgD(GZ)YIJ>5WoaW!>}A0<0=dy74vnhA%um z-C&t1 zl>cYth+E5t8ZSsQ7|x~N@J`4~$y@g!$bn?Dpa!Bgb(|l~R^Q8$MUkWbWZh+v6Yl)z zAA(?JN`mjZ$-QCv$`8@bK7!AsC)8-BejHH|s#S)8``9KNy{FfRb716S1<{XWaOG3k z!_k0EsD8ts2rS3;uXQY&%T(hux})A8@=uIF{Ubju-ttn}OL060HcfnIO&@dWk?75^ znQ~@1`lxYSTER!N_6TI6^S{P&Jyz-_dc}uhS|9Q>wa}rbw8dqA|Hye8Uba*E#VyTA+5C$ic@ROq{K@df&dhcWg%$?@!9@GNE;Fyv>EvRB%T1pYV$=Gp{qD8MC z4z#nUl_w0~1sl|n^Ua=Ye61_`L2nQlN;e#f%FAY@y3djBkNvwoIpSNzt>S7V;l*ZFl(0Dl^pYxk*guKuZ3D}>@TsE zIscnL761PesKuZI4y^ktl?xK0inrx!-v7?mPT>&;bL}1%bu}YfY!)=@MqK{^+NBVf_{tr5SeYqFtcJ@DN1J-1(>p#r%m+*(ax8NWat_ z=J%UcOcnU2gSelQp^TX=#{=ceQWg* ze+=!L&qWH9x-X3W0?885h=h%q$1C#QQWHl2;^h$tdH9-GBzkoY@o@_b82 zf6S5qh7R$OBqN)CJI{ca^sDY4S;hmS)R=)bcBM+!slCKwmPH>PoJeY<#gDO+^W)b# zP)RpCfT^3fk^}&`>mqS#Vo!2&*%Yo>7(|q@$Jv6Z7BV(ld%CdWP`LY|- zj5}4sw`%v^E-UC0FINbOyEe+r$TuN(Dr!g-kY}SU;hCkg~xR1(rXuKEw%INStfv0Ah4&GF_BJH#-#cJ)IZ&@X*McJY+ z7Qv+YA+2KapwGZAEHX(33nR;`bla1dyKx^1S0NqmGsxQBV4LLg!D&BC8-+^U>E;ig~W6O0et5hyYq$ zWV|ZG+3O18UTo$eV)V5YfUyr-iBnBV{{irRjo;Qs8z58JJheXh&T14g*9a#Q zYAtjG209f=?=8C@1BMtIfP3=|cf#?OW}odB7EaV;$F0ACTK0AyqU^1YrVlg-sE*&# zuC@t#f3CC6)B!S-h2W9B4`L$ON983w`P|QsfLn-o71i5YB|{pni}1`E*Ia4`ShB=V zIpsvHT6mSDllgBfh@GW*gw%jb;iTIy;DbnBmPnu{lnTTxv+@z>!tAKmIY5;Yrmy5l zpC+$?TwVNbrCUThW??vpo6|+f*_dzjI~D{wt3mg-)}TSu@n4|SnKq-*Fg|ieeD<@; zy;Ts;m+gK_^^H)M%i=z^snwPoTBocsVn?}^n`dyhfdZKiJDyfnRgVSybyitP6!K$5 z2HkrQ!(f^a$s=J;+>VZvtid1azdCcjmd}{|IYroUc$VE2XddEcB}XW`6eBG>CkCL`nqg8edr}v|P7DpOYKz@K;a^NC_94*sndd5s*UU+^>0IWT z2X0Re*m}^iTKmXM@Ew?-iGC@(_wAol@*a*K!HN?yk8RaNl$Wk}BAHG1R%mC3$aMd? ziGU7$?tXlXp6*Spzm4OTA#9|=tLaPvjM z>$CF|==HKpSD^JorP!Leu4HS`o%+Y>LTW+hx6%om4oWKgM3AZpGwfy&j4Haeul^&! 
z8Tj)SpuYK13oMl6S*5yI|8Jn$@o8!@M3n}%4g)V|MP7L6ErVK*IBH$^^rMQ}c4#w>MF4psou0rmUZqE%6;qK~hS%TV{z z_Frw9R5$lW1>T7o=exg}!(-g&yomk-xWuRV6&7uU!nZPBM=`3tpXk`BN!8LWWSM*UiA~?ATA{ zNk3rtkDnuKzK;q&fm{MlH!nZ9pXiV_1esr!)TN8>oIm&p+0Si;4_H5Ya&o<7e22@T z%ynGUnej4LFm(5P*c6|bM>)~A27DbQqDBem@$qtAOv&Zf+5ttzW#z=X`kE&u>JCi| z%Yk3GU-F+RR_V^M`-{Qo%~LL}C3*C--lpCYpHw$Y(g&B8IX(>_ADOh@`x_@KR|^`X zi%8rvUPVOB*iDUP7N7H-zSfhVe$#(+{DI(!;}QPyuoeSUgvy@hpB}jdY-rGw1iAVj z{oM)?44(nuev5u0yDw#kf9PJ^HBQZ2I;cj8Oz6BEXoCbqx|#7D+C?EF0~hyopGd@a zTUoUf6vZ!Q!|r36VY*XVT!Xnu(D$;ootx14oce*E)4uS)a&KwZy8g_B6=my3o$OkA z-8?9#AkDcfXkRqWMfdS{uz7Lr>MtQ*`MjKgxt0SnAN$aF-3vOSKm8I*CVM7e+g3dT zoK^L}4<;YL8Dh7I!gakCy%o4=@1y|p^NZdervF}dpsOfJUP;khu2$_LVI}t!%k3U+?%xgpd90C6B@up^;5gqM)09}dyn!B5`%X81Leo9vCL%u#l&yiw-6rq7!XQtfx1TROi>Xp8biwLvJK4GQ2D$|g ztz~B`wQyPQF+Yh-XVj>h#G_*4!2P|-^8Wyq0PLK|jx|K?d7-p0X8N_SWd1IY*TdWT z^n!>|yu##BIku@1k$+bCK*8l>0oU{@X_)fyj9*|YHq}9K7(&4R^|xJwB~H^S(1fUF zr}Wl7wQQ$nwS%|*;f|+o(;Ks=o?A_ zEi(jrSTkMA69NHtlV;IEvU`THkc%bEGULTw9L+)0l`f&10sULPI2oZ>ZtnO_WX#y| zfZ|TqR)#?S`;{**<``?X0XD-1vet07aBuGzO+=1NPTlvWw%$5MIeNkQyT>(O+3kqq zC%sYBjJ~Q-y{Kzai2`}HSyLBhS=CBXzDHiwF7W5^I=vgU1_89vhS((N#Zq)01&>_= zw}D|S{jWhaVNF3l+OcZYI(Txj%C>6w-Li@8uT_Eixr$akeqgOI*`#TM^pM8) zeK2C!iHoI&oF|3l%~qC|(20Q%ZOWjT${DDHe*nwJQ3~}p3p)QuTh0rZ+VG87$sEpX zihQ)gQ3iWKh^!$}f|yTkB7Lp`#!A0gI!gRC%_WbaQ)DZ}2!ZiIA-Ix_x^SxH2aOr3 zh4M%iqP=xLmqwt(pw1J*RdrL&Y{sbS0BhzWG^_BN^P3%3MmE2XxG7``el(3g81t>D z-mnw<#Ghf)OYl}i@gn1r{DkY6k#n#`#hb>I4^qpemRY?-n*QT`1V>{QEIVSVo&SFS zhyxa&kzF5;J6n39%0VHZc%Qdr`gO}EB=*58_My(R4_4Xj{W`2(TZFUTngGGsZHw*d zz~u1I{-HU?g&o|3B82lFAY52G%5Xz;r011n=2_0WzS&**q-EC?rxO7zKn)8Z*>6TM zb5KNSZK60$lwdw-AgRp#%WXc?}oS0&opt0GtO<&u7m>jMl|NqAdaXgN3(B(hMv&#y*>VjL*YeDH8+$sa+U zh6j>0nZdy!KP!0t-VzSY=CY67k-TXdKt3-nrzFSg8{yod?87D)sR-MK)0t-eKt;wr z(--z*?QO9T5M=!MJJKp#{*>MG0n8s}tm40qZ&KD<{EiW2ud&P$WaH5V0*h&V-{p7W z`r*V#%`OHy!9}|`|HJgc`xN`~_GmK`E!%Ifyo(%{E(_s=F0Nul+XMi66JIvIxeSU0 zXji6e$~H0ouY@PoF4cKveaP$ED4f@f60Kq_o5{JOrOtL!AK6u+`~EOW08ZdCDvU}0 
zIdgJqZnR@x?oIL^0*^M=R3Cn(4KfSeVV@Vz%<$zH! zYbK-yQ6F^t9SyY7mj4eRZaYji!rVqI%6~IKH9$J1O7(A+(mLu6y1w@e`lGEXonn`k z$2Snk@HU({`tXW++n~C<(OS( zYE3Wes{DfRrK|icFwFWFU(dj54Yj%FIU)DXg3xgEqEfCe@{lVs(m*b?;_gMVR$hPMaAbk+JnL8S4heVz3ypN=vV=3T&|_`ISNBYQ2Qt5oCY z65rW%razIcW)7&VRMz_XK3>osxSQj!OG?H{%ql2Wqu0b1LyPGssU!m$aQ=ydku2sm z&-*Yv5mZX8MmM^7RIuwy;fH#bV&RU zz^3sKoqt3g%cXhBXRQ#(p;1q=H!c-hiZNW-#eop+>b5EZryV@UsZV1!e7n@7ejnR~ z@4s!TVga$~XTQff!RI;6Y*eXFep1IkzK7!UEKkcB28!EVk!Wrg{ceyuxC+VAPZ_u> z+==5{rn(M&({dI~MU~iXg7^GoqgYMu7F%&waK@TS!F3WZ%OfUfJ_la+2&ej$MVP8_ zg>bX-RC7Ci-pA^l&;DzjtT28@@7GI_ALMB%N+ozhS$awxDpAWU2b&w0iQe8RoMv#X zQhM2qyum({6FNkz3`!Nu)e3#{Jsibh>8@sp>ZS|UsZmVrea-2U4Q6BFxqIq17M^T1 zvY01xBiV^ONkNy#XTAXtBr^udc#L`h>^3VGN_zqr8Z%X9>ujaPlO5moWIdm7XGh(j z$8=tQY@drW>kuhN#ej*W-<^9!Cm#AGUzoqj8B{**;J$`{g6lOHzCvy$>b%x~2qK$D zGlFNPhL{xm*qF_m6hA2IDoJ4;JKiBPJf-VxQhmMf zojAm?0n;LSl#im`2ZDqb_yJNZ7t2G;Qjral?E zT?YR3E9f+>JEdXCAE9YM3j8L7}YTZmw1M1GYN8xG< zriC|mS$UjP1%=8xKC`E6Z$phdP|ZPzG)3_subc6gjX;uz5D~lErK-0RH zu?!0uokau0Z|H;CHyWyklD`o)dIi+2vNYzo`Oxwy&a`9;QJj#v#_Tfo09i3b#K35Q z=jF%Wv#f?25fy>ma5zL!^Vi9CNdBH!O~#+D&eaNF8)WLU@e|>C?34IPf*Cm+O}sW_ zm9a;8rE#|u{P~CfW%jTTE9x~6!j!MOrDukvh}^+Qc=}e}iffrK<#V;XOJgy~v+BEi zC6gMSkv*rubt*ugUNkie12?BvEGs}rLP|25o(T$tZiyHW1`P^uD>0_*VF}4n7KN*> zDh>nPvU{BeR?C-VD5}xSI^!fva1ipKase>XT}5- zL}P9vhw*vBw-WUogLAdaAkLcWn}5Q}%gO?$bl*c9_ilcoaag#o5HJ8lpS#Np>gf#n z0UYSHaKP`jicDk_j#N)lx`e>)O2gj7xW85Iu}FY@t|LkOUUlZ}7M!92=Xc(Zb2$;) zH4avzAA^qXx%Xf)-^%WxtVI!`5Y3BUbkS<|T__G^F7NjWn7nv%#TBb_tRT3tetcf0 zw}Fm&k|RJ?P7L@$=Yg?1E^FFSkR;8OiR961K-&h-CV9DJi|io&%(X~$${zgwXtCWt zL+y%!BMGUCW=|s9dHcpmVfxhwS4pT4{Cs3xoI^fF&Rxo5ejhclr>*a^ioHL^r7=zd z=t$cwM=LMTs5)~NhNEu2m9~kYe?ZAJ4Q^d0d|2h3pBD~s( zCpF1rH&(wOpEep*@!3ql@MT{=S|KkonD#Boy7kOV-JS$NrAp>c{X>xjq9u(#3112m zLX@kXCCkrijr1q;n_DQ-j@5gO8>?FQ>3&$L+|2$e6cmmZ>(vu>EwG3e$%%Sd(4d%C?}2A)a3_3cmb^1O+Yef|E&*f8T{?q9hNvMj3O2;b&# z^r%olYT1J@Xct}=KSzVg`^fYY%_^LE@>e=?p0ifdyOtM_NkfL<@u{ZF$0Dt7cPglX ziW|j%-KJ_@v6AZf(0)~1hPes1d7BE%FLS?ZMHm_6pKvx?L<|Nw2u6edh`=^mS?(3o 
zmmihcS%9;f!ryWV)Y*mSl0re6e${I?#rfAW*96TpgJR6X;+BuoxTFA6D|nmEk^YF4 z?YpLF>vijBiEXPniDP=7{{VUL_PUp(2DaEp3b%B&f!8n-dhfhfSqJmiZk?P*p`#;& zYL$_;ZC+;tj1XV)P4{2bD`^ErT*=9WA$qVI(`OPd403Z+kgy?SK0I0{)HB~1RZOR+ z1>n0rQq;CUQK5pSmmFEBCShKOhLi{Sh$UKO@Z(I5Q)Bv+Y*w`vZ5;aU9}9b?uVd0z z6&Ty5x0T}gj;FqYzOJjXLn%H#@~+jm8bf>@HLImxVO?oOlz+Is#C{F8v`ity&hCYA z4?thFtQP;Wc5lw0Qul?AOCYP4S5J~P#Il9ks>!kE+VQG`lH>fcd2%2(N6d5$VRQFE zegBY*&GQVR_)yZ`w^&i&2$S6$rF~U7crQ3a2M_7cSpVFXbsQ+-y?z$eT`-AwkV1q; zj(pWt&Dr%u)1sW3zkES~+oPV)TdE-`Q~4h*&l%g=j$;`Ggyc9*mLNIinQpj1=l3Ti z07)iXF%B$j8|mLH)e3D8403V$W_hnodq-r^E0xwOtDIq*@qtcz^H@lR0aNaxsk(*M z)AsfYQ&Pe*m}uslBNl7j_M-8XAVlQFZujv~V{7+~tt z`EHv8?SFGmnMN@MnJ+|gnMf*<+ZJwmI9KdlZn)_UrwI5&A8#X$?CA&A0hl z>YC@AsG{12?FLN5sC?M(Y#@>8v7bPl(Ng#zH|kb%qE2`sI8^TQtJX@r zna%bwgm3B~!LJ6dk*e3@`JjPEbRO_;DBTQ&>JYtrHet~U2Ug-X zD5^QFM6vTt_U!<8FGf#Rr+UeOH5PBaRe(Qb@A%X@wz?k;!L*^j;EbiV=2>Lyez9@s(`^tDkoBpc9{Wr=iPS!}Vn2{FYQDXx7M4!cx1Pwz}+rEhCr}cbN z0&Z`+MB5Xvp6^PqNlh64PF&Lvp!dZ|Pv+WrgAUkH0W%dyw9-|xbEVwy>GvBYG%vr; zgV!bqisTtf2|br8tx390o4xcji~vZN0SE9pmZfDq1i9An8At4`&a^($UF*G;;G(L~ z$2xXOr+evoKAN**o2Gn+#w8DKiARy=v1R8#;i+1KLM-U>HneI&v54)v^Fy4{uKAg% zX>>J8zu-!wEt;r#(U{wpUz?fV97N-n6$uGG2(s%u&M#5nXiH$2+KOLHSD+)TN8@%D zehWF;B)TxR>|P8+Ui)pq{Z}N%j)i&BXS~uK@8s9h%-4&XEcTrG(!d?-aVlQ#@p3HX zyi9UOmjv7t5eQ_S>h!$gbC=7~DdK{5B5L*Fhekt&qrgA?KfyDxtnL-WC;c{X9`Yiw zVOT(Tdk)Hc)%P6p^((&DdG(bA6sO(y3DBSBaAyPY!mI!-Pu=nMthvszd8Mab z5gp11@K~$$d&Xf*5R8ywt}i|~b*%Z5EZS1+;42V1IjvB8&CwsnbX9)m$e2eqC1I05 zOE}2_!Zl31pQBZ^uWr->*gJy)m}yd`7-X?8gZo@&TlLo7WiXSp*5MT*$+AQ*4MH!U zOBUAu4^w9u)m9g+YiMyOw75fY2u^WHffg-L+zG|q-QA^l@fJ!c5`w!FcPQ=_+zA>W z(3|g^d&juHGm^P%@2s`vJKrZ2bRcyMHMeA1J zXB0@p=(on2lng%yRg`O5d-Y(DLXk?!8JmmZHt}N&H14%_ZJ#yGJs9e;Q6`6Lt5^Bk z+xfVyX}Zd{K>u<_WuUkze=~s)la=_2&lYcon;czq^3Y(yV7X79B(Z?|18ir1m*VcW zBC~C((C@2zLLkRj<3@8&r(U%Lo;mIH(squ?dq&pO`Vy(_2*)z9&K{aj(QPLT%w4tY zQlu-vy-eKSE7*xGH|KgAWew*oG%+8NJt7j9P-s?XlN)Y7b>+2n`SQ+M^pfV61 zRMUoz`9yn#wIunT5bYK+GUkrl>NY0@FsOO{xDXxZv5rYQ|6MUdat=a$rZ(}J>1)h- 
zR!AkU5_%umc%F}ws_=Ybdd37>>F_0pst0tvu@`z!DI(=>wD>Za^?KkAc$O46N%Hvi zf^G{O9U&p^SXQYzN_%4;9~T3tqVY-j!5C;$L^Al;AbUN@?)I zlN+*}U%HC*lOvc9MHsjHeB5h;6u9HnN_Ra%_k_{B#=yu_%TKE(?1J)??hfbs+&q}`&ie&TCqL0h(k?waj#72O-4G3H?8}r zNxE^pb-3g^2D#MD>P-e})TYe4(ilzj4GL=X6-_=fSF#ZjH-+sI{7y5@iedLQ+6*f$ zta_%ubLUSPYbY9IoKb&lEkwn&NV5oXD& z5<9KbWsL;yw>x;j29Xo}gfANg_tljpmXOPFB}#`v!-UPufz@8 zrb_f3qb`Cjg=GLZ%j{Lf_tgo}j8Yj6Ln%mCux0r3R%N+nLdH)=IX8_N*_C&Jek%sm z`P&%6AH-*Gw;Pb}w+aLpkCBdVHLz~E6`)USn<~Lno*^j$kt_&MC@Eei-+Z)c&(YHx zD0Y>KU!G7mTB}s7@%NM{^sC_gDt2-@t38M%j;;yUmixdI10Vc!2a(U|!+ZQt(~RzW zSH_Z({*vPJ{&z+}I!@7@xJ)|1Z8^U0Z<9{fydFP)C;`W$S`YD?|CqoK{UwU0sEYmU zu@^sPrG8|SM8~-q_WsF+h%seJrkuaNx%=Mr$u{~FD{j1@8z}mcN>1E=4_xDRbkas9 zLLZ7aiti>s(rq0Ff1tqL6MnH&spRi)2`vA-=AL%I@$+*npQqbs9?e>327ODSA2m^4 z5N$fjtRAJz(RU$VkH<5s@86tM|p5%FA zvc775Kkx1rQYY&X`Xr_-N~tf~Z`(w6h7HP*Xpk9S=9hKjHELO4Sm z-4jz{)%BvxMCg=qHhZF8I=u*S`J<&;@Uc2YP85T22HGO~yC7Vp=gOwnmNB z^t$S1&bU#z@X|OOO%*tV-{c_L65Z{Z$0Qi#a)DCM1vX-`YY+&d#En@zm1(qExY zsXcrq{lMGMU zglU2n7qgy*gXI39^r{#)bYUDe)tU#Vn>+Xo^aD$H`o>@Iy!O#J!_=6>xt~IvZM}wY zz$wZ~qOyX2ft0@1j?^)ogeD9H(ae z4bORURMIe-UMrl`C{Y!#D_BFr5sx!2o=(Tp_}g#6GNqiL{_*Aiv)d+Psa#%Dv!%}@ z?Iq}=tWZTWEArSDv?akF;f9IFzw)Bk?w==o?~nh=u08C-8l4Pal*PPh7wiZ$E|~kk zpUj6zyQy2Ki;V?wx#RV1)le_@`ffqfQ$YNp`Q|r$OWe^LPKhW4JFF2z8=;%8Ayf(| zRLXID7liIpnoM*|@9{j{{tsnp5@h^5vMnapsD`+Wzo)z>=m%ZCvj^T1{_lq-K_gw4 zSo3hY>vWf!-&#q8Z@ zdJXrS>8ZtWx6R0VIk~L?yKoD(WenGGuIf}#VftaoNYfb&arVh?NA+!rPE}*mPRq3@ z3TN&n-R{i87#miM1HyjL_Ac4x)=ZnX+V{eQCwbw?1~=HOUUD83ZwlMi>JCA_FPPFI zPLlVNsVue=zqj>2M=Eaf|KTFzK$c&WS9Il0?Kz<}VE)FZrR=bx+4Q!n#Ks5BJ z>i2oZM{3l#tcvOuU-eBOT?vK?Z|Gg`_5~!5f9~3P5I!w5%{PQ|bCtOZ`Rj%QIpVE7 zzc0*#3H(Ee#71;6{#$sgeX!S^Wp;lZn-Se&c3zY_w{7c*I zZ3h0hm9OSKvzAUbFuxd2a7|+fAz;mWepfwC?8-FeEnxkoARAF<(6^J8F*4IjxKhbK z9?M9_Bd$!eB6&1zV6|Yg(u|Mmx%PRZQWuqKM9~D8Ycuh=aFBjB<(m?~>2lR=XgsY@ zRm|!~|5lOv0XPlQu(kfFjzL8XnM=(jc%3I3-1!hEhfVB-|DZs_=`#b=+U zGri1d^z|Fnlt^+>^d!leoZoj}3vQ{*IfaRQxrjb&$xRO@BD|#L$`edqHJc)7Vq9l$ 
zkhq&@>w%5*{iF_gl492vAXIVjw=qOXx!*j-Ej4r(4?bscP369t9%afR5>6Kfz^2>1 z2WLLyR9hR>$KaeECL7%5dZwNTe3G)bxq-Zf!LB^pf$Mr%j59gcm)?*22X>7j*pAcc zdk!cC5!7jL9c7f-lt0U^GlH!Z)fUZ+Gb#F-j)}1Y(9=m|;`D2-|F;RT)hLy!KLT$G zojBe#D*KYEes@DB8hL@|sUFwj{9Aw2u4WWl;Dz*;lzw$=mLZ{H0fj#nc{%!&YYc3$ zSA1wSr*Zk48x*EqNpsR+V~JN0Y$!O=qd_A5SSm!)cy6hhnjNadcAjA!jeOaZsBj#i!nVu=YKMPp`*#VD1*0-ZSZk zVz|d+h&%$d0vhkj`q6l*$gT6h@!U@Ac{u`nw{=0uUxIn={Y3oum7g zx%LB2_qQ_%_rYC_Vf5d$T1$it<&&om8idT*^Hg!k2?~wR&zs08E4p}SwxvvvN> z(L!s=t$Wa10XOI5Q*6nl;xRDz(N+a1k*I_PKcT+snh1Ut1u+xa8o3ql%6yOJ=>mzt+PmA9*?E(KX3m&0(d-W-1 zM7Dx^AMkYqYX}mqs~GVQh3YxunuQ;b7T;6Nd~Br1ZsSd=_oo}}q0sKM=fX)|j+?i) zBv|;_nifD8S2a^6{xne7vr@Nt9PUq&E}V2$`^ic8ON!se%aqG~rtLJ4N%C^V0QvsM z>EVaTe<*{1-EXFGYiO=`s84nV`;cx55`B|j6u`W$vGpj&gkoAHPtH!zPM|E(7)Kv^ zHS+;!sf1L$!*^djYQx*BfHnWO%bSUwy!>^9S_*dMg$@kJtMg)YXef{s0Re zSn~%hSk0cjJ7&wDBCfPjDYm&YvnL&e?$tP0teD$*|HjP50J1>%X1lgeJHyIZHBIL5 z@rQq)lW)y+fbI)E28xsV9v@Rk{T4LQ92;$sTy2&0!IojGQ%MV zoc6-y+$2}6W;M2dD22GNe<=Jxb-MS-NOS4P7O>;sA{XBcGU*;ynjF78{hQ9TezMBq z9}2W$xGi)!tY7+!{r|@E|A)f+Fb>+2)bHdz#)CC2Iw~gYSLPwfL311kvjy4vcuN~( z3U#ayq3uh-(ZzeNvYGc6uKfRk@;l z{bpVd49q0^B8T-#%Sgt3y*y>T)`%*N^%qUxIbvdH&*vw2!fzB|Iv+E58(6r{iq!O} z3-^Qd*N~p@^>`R!!~anFb<;Ns(&_vlH$dda|1+-0A_!Y(aNZ3BZs<3OfG-ER1;ezp zJK2wRo1T&usjpwL0IxYNFIA|ag%5>k!T-}vvfCuUJ@T2E18zCli{k`F&02W=H-w+M z$ORivp2B8AkN=BqA}41{ZjP+y2`;A)Mo#{Jlkq&&hg`2mGYI4PQdEP1xu7VuXO5y| zXjN5sJg%zW)&G3YxG_0br1$r$U>3R;u`~0tdpn-@dx%?d8_FZf<(_(;Y=2Uk6fT>;F*3)cBBbfjfTD2re#d z+2HTXi930F$j}9aBn{2q^&(5kKA9#X86I&Ec2BPZG4C?@HIy9SEF$+gfT!ah%C}%! 
zv*5gI1u3Q@uPcc|dk5tZzAWT|E8poN@$^>XTwJbp_Wo}Hj#4>GryZ-&Z;~7#7m?kT_ub=~O)@mIDy0>dN`rbQuTw5~Q+rRm zunS~hJcfzs_5c0Cm#320=is61Nw|T*`boU9X$xA}y7V9d2iay^kO5Qc(lYmUD(^oO z~n(1~hq3r;G%(O_d({UDK_+R&%z0SCgGX zv}bOMDel7*BzPSBx?OhjAIfWZrr|#nJi9kW1`32ShZ*8ELzbo#3Wr1e0j$IYpN_T{ zAoKNSRXh$=Kib3OViz(zL?jl5txU)*Z}@nK!qQ5G!1n7r=L@iV+BJNpK2FxT3TC#_ zV8MqV%vml^cZo9s(tfw9V$lb``TQKJW2+GPuU#B@G#fEh1Y5{@Pgvj$B&tEwtgbyj z!ew>H>%)oP%SF)>F`}6Qz?n?|5ZS^=F^MeAiJYVok8%=kC$}Kj`DK;-XR(ECBbA$k z%&xhLo*UX>hXs#EI_)&ebH3z;U#IA8fdsr4y1gra_%CHgWH7dbkf|E-7&@qn@1N_c z1wD8gM&d7?CDGK)wv+T+7KNy*Qgq zbJ;vfM5*skHa*owvr2@mJ7KP4uq5OYK<2GZ-3$0ZgHuD&WxvfsrjOz%=-;e#A*9s% zu<(voOtTJ@JfJ#w{;D9>9%c`9EP}4|Jyc5l;NCHMmP0-dxBl=77*Rj)_10erMJVo* z>WH#!$)n_4??^7R8`GZJ!m3jo>5D{gG6Dt0XX2|0*m?j3NjB0NYpy|eDNq=Z+o*ki zhp7HTDE@A`xP{*npsX|IDb^C`?c{Fcuaf#6alCNq=+FQ}ll+@NVVV%O_6(Ztl7|}c za2`NypR)Due+8L$Qhb~Fr{XHPn(1=kSU#LSzw>6bm%tqSzfaNK`BD`z7&Y7vjfE1o8Z3m# zQVA^1;R`CxKfn<$#C;Hl?nuzb#a?<}0?i+G`o}*vL&wwY<;@LTL{g=kL)yR%-L-3C z+o?a&GQEF^QdE~o1|;pJ7KhR#Lh34RCtp+*^SKtP8@acqsT54BCqPsGA~0dPh(yZC zrw9n}+I?XGY1LCscj9Q<8ZIGwEqF-~*R^9aAN>Ck+2c%Bl3t2Haa89gY685 z93(7*U@!inmfJ{O+5>qFaGg!3)(YTF2ofYjfM-hmS}4l`_H+E7a+syQuzFjbf2h+^}iyq1RfUg;5Y z67x?XM#f*6Y*w7n@TP~!vR=INbs)-7CVCuvntvAU?luaN1W<6Pj*tlcJ-+q_vTDl5 z9ebEwrWAlzzXE2t(v`+TFbIxKz7(0fTS$4k#xT;$fw^YtcOj-*5go@a1Q~71(>l!ECjq{P1q|dZfpgWt(OujQ64zG8IYaM-x$ny~Sk@sR~6&!fw zY%w_?sw8yxrmUw-yw$u!#pUkJ6|vpD#Lo7MRJL2-85WMh5OBIG_2;QXK||4uy2wF5 z+oNm%xF{51`xY45A20{{^gJb@Qbh1z&4W}32%731w2NfTzI6tUAtqz95Nv9@_jLdM zK5lE`H}O|HfxC~B{d)W(4D*rhH(r~l6MP1KnWKisu}+`=kY9##7E6VARKE1JW*&VX zM=OYu7@h%~=yC~f1*W)GNiC~Bf%e2$(ua3n(q-pmu;zC**sC{batgyvL2PCWgE)pi zM6Sza$TwZx0fc9Z+Fyn>`}I>X_~;r1x_5V2--f?D)q=QZ3Z=FJKD>>S%iooK*WO~{ z7Sv#}JObobo}#>e>Kn>*$E?b{;*;gKcB?kR`gmyZfMhW^&Ii&na)gesF z)WBaYR9ysPf8qidMxsGJqg!j#b|kg>Sk2|~pF&&J*81zqj-Fgih+u^4Y@YWYTXY#unO?A7O#LqO+oy{T1=UAoF@zeQ;0pXar zzgzE9VOCzC9r-kiI62=SkjJ3CrW@c3j=#EnSyeDMN8a;0!dyJu=^x6Cj#8b`7=+0b53%BBK3}iR7$=uHCH$fHB_>6g 
zWuIJ*G|$+3^}~MMFHUt8V&@vZD4SQB5_qFSZ=ytZqsec;{=9F528K?bCxKu~fZm!D zTjmE1Gv?w%x00;y*W-a5oBM!F12PLd|S7#YFYtLB_hRl<>*yl%NE9DkB=@2 z$;=t*pMNM83&Sv;i^JfUhjr3tLC{6rZZZjTy6)}ehLh;biEFYfiM!j1EX86aOX45K z1anwb@c_Eil+^_GTItdY$p~}7NTH%mzizC{A{;4GX}gC{R8M`}4|H#`*`BP58=I`` ze(?J$le;F7i6YAT{LSKIBG?_=)RYPq)zG4R7FDYTU0L(!>>4v&#V>2vBr!iFJI&3o zUrQkKrHNshH9j)yzbbtb9Dkv!TWatgw5h(*NyXG%WZ(`j4o~jADATdG*;zeiQQT() zPS>P*{XL5CHTl_P>93bE6;9|T>p=m|MsI15`tlJUiPxTN`26U=Pso&#x}?j*OTP#C zdFb!-s)A=o8ns#~-##1cDbCg-Y%9RBzQmF-FGyuh&EJT$)4fVNGmNF8~M$&3`|Pu+!^r>lx+zuAoK((>XhL z=5nhEISNOk9M;#^xvz>#QWL3)w{=gc&yNq1t!IndB-v>c$oV?N`2MAJ<0c1FhQ5Kw zgo`NSdZ8V}M#!}FX2;{#_dBfm#b35uQ2sJJx+K3V6$0{DS@X1U6-K6Mhc6QXvp5zO zvTvzgCTthXrRN+N{WJ>G$vI`*aG}D=Um;-D37OA0Fnvnab(rJT2buiG=yhyziHc z86@s*n+`=i9u(vh71zm&_}GuEO@R54e70tMnld=!#M`o2)C}fvQSIe!3z}PmwfX-pMZu@~I~PU0P=rvq zILihOj)Qs{3v{wAg=%QZzx`1-oqiwkRRo3G9i-a*$Oz{b*j0fME`|=AUk5MTWn3<0 zQCW)MNzr!ltI{sx*t(4vyenK%voCnYje;$fH`soGmYnQB*QVz{ZN|ybel9ix1oNHn zz1lD987 zndTFkYW1yo)5X)U46r0J>`--M%axQ6jJ@Z>Dc)~JpH5PBUvxL+Fp2$=m=-+zdQ~4cew51h+2&i|GmZ;}ydQWQ6dqg$jP^+M5?3P|^6U_rKOe65 z|2pQhl^7@Y{u7}QYQh30;=>r&oEB9Bd`eVjGGTlge6e9&GvCZZbK@E3$$lvqoaoij z5s*CjO8Hr8<$$B;)e_0w$E55>lFr;xnxNZLPr2Gg;c}Kv6r*no<0xz^iT8(tC7vc^ z7s4lBple<;y+O2@x993z?$od2b9Yy}m1*@*M9?nnYrA5Jc`wq-H~BdDIyP#jN4TlRIu^~snJ`9&iGsZ=8MRIloC2>alWKGQF^MmbR-JGN&Z0ua2Hsyd!6utx-;S17`CatqjIHRCnQT?~EZ-1YGr8dTrq3Yb zfNl~MZhXW(S8A%`&6FRl4r{c#ge0p$2eur*3yHN)W<1NWp228^puw$<2ReDC5bPEN z+2tl5om=FT7=_R2Xk$XZJ>onT-^M=`3L2aJL;27Mm){)qh{9qLUeFoGd&P*TAQZaO zQ2K4ADjwG>x=r21IaYn|#I3|#p0?*G$EUuM-Ppj8*i0E!Z}S4uO2(MHVcyl!Pmdv_tC$%t+gk&;_RgYM5sS0mha zcsgw{dz~j%K~P|lAn-I&38J4HF&c7C<>SH4j(X)vTlxm+M9d65PLJrFY`-VYB1+2Ij;0pqwd0SJ@I0- zGd^kCx!M9f$3=lm88=eW#SQe6e&?MO#y;Gm0}mF+bG$?1bA%r}>@U%FmQE#WX+qr8gyGUk@O4}B;wG!{zF5Mm9JEpw?RScCt{m0J% zRP`4a9nR2m?Dz3oiTq7U1vQ$4F`$bXrMRXHC8hkS2R9Ap>}}&ZHrX*3ZjMm81Dw$4NC7I@ci6 z8{>zLfdtvH@0!m|z_7;i9`9fx&z~0`6C0vZYKFea=eP&OO0&&QZnal>A#lz9BV5!`&oh3O+o#pyEF@?Lqzh; 
z%x(W{B$@lvKuPIvaJ2ZOKYr-=Z;yIE*j(e0)nlek?Mh^`OUMsdfd5(V23mDw5LEh8 zapy=Ui?o(xm9aUt$W8LS3%=y>=>|d2U=UrQ>P!QO2!4jeFk2jD!x+lgI2yJ=W}SJ zw^1H+4_RX+UOn<4&_f*p^ljZMkIB4FnH0qY$NH(*m~gyg3V9{Mk2E_d5K z{a9O0rSQPi7Q6yr0$XO5Y46=#qxm2Qcx)2vFfQ3k^hM_ooCldA*bMGZMhosCdv`~v zmXzM!bEJ#@%T!E8L_P*}%o<;mJ}ncH+er_{HG;}hgv zxN+f)d$g#NtsGyN*fb#)&m!el(}aa-T9N)E%o2|x^wvm=w7pI0@Bu*Xy_(X2BU-BM zL^xb}08r2+my{rN%9my=Psr)DpK(pz4Jnd9-S+PyF$@gX%PyUD_{-6jS6_pVdWWpB zJKj^+wsO$b-I6p=d*u^0Xb_#|$Wh0BFyvz?g@g}#$2JFLnun(onoJA325O!j+y)2B z)}d$5{?IsF=J5y|Eb;L@>+*LI8bH~+V`**vFN1*{Qn6gw>rT?AZnWHaVD&MTU%Vnj z8j&c?k76JbI03ki`nj{xxj)wwD6^GG<;&zoBEvT;P_vwuO+M5%RuTfB}2S&!_mg2eh+4nH4P0F5qs zF>Q4El))==-z4`{Cz#j)M_L!LU7SWIz+pQ z1OKMsl$*0eSE^KRuEFa+@((4nU$-S_VulGwe0}F0@;2RiDP-KoY0}lC;X)|5EX8{qFWTen{PT_Qs;y=MS92Vr+e&sZFG@l|EtRtu8X6qb!DKDIor#--R0OO>uGe1*PdboWB5Zv2C(`pD#0iekH6bi7lOZ9~N@k>yUox}PI9Wz0 zi$~iDYuWu^pZn{U`o9-oEHYZRr#SKYvInkf1SwKbY! z9DPbXy`C0tkq0YC|Nc0(e|Tu#tKL_GV4ViVqKu56`sS=V%Rk@T2u{qG1~jk_S~$XA zrTU7Hm))8>|CX4&x-Yu=@|f%MFpC?-l$2XD?#$I7@eqF(TNon6P63mv`-k!cF!@A% zbGvX-OnZ)X^rLJ4gl@W%6~;$*k$w!iee=JX8KB8TlMS zjL-3UvKiz0+@POffSlG&)Qg0w^Nde#B)*Gp4eygfi+avJZtu(I!*QEKoz&H#(bvZC z2{mf({hSHDg#3oGpOS;ib8k%r7MNbfji}F_YO`9*@2eL{6Z@^cvagRsH*_fNYCuXR z`xH`ILl>5tRrtsr^{un3v3f#y1&kLbRtW~W6gIUT1m&N$FMyQpiuJ@9iX!+_#v3j6Dg&AxKf*kVAW!_rM2HNsVB-+9%<~NN zaDLe3d5!?=RmMHBMl(8~Nd;%$7ZP^CKNpT(0{=}t#%pD&`@lZ!Sy}Ja{PjYVvxIyM zMUB%;s<;fQ822>DPpsD*C2UUlo57V}eN_qY_iS9>5_S#g0NgHWwYC9M`9y>%uj0Mu zvfY-OZWC*kC2HHgWtoRb3K-tO#6NXfyq!U*;#Udqh1qX=Ra`7UpBgfDxLk{ZtyhMb zYq#CnvqxWk6+H*Z!ECBEh((!v*O~e8u2^H4_j7`|g(t5O z*I;3N>o++Pg0;z;3IewRBmR;2xAa6cvagYMU<#}sGe1VjV|@Bjlg_wsY!EX_OrN|( z6YaNK;WNapv@D>cXjZU#Uy?2?ap9k zSb0?^_T4hUVTchUW8AEzoSo}$sw}eLUyx;8GAk6mmupkQ)t5FXl9EedKpv=D-rZ zOe9MZ-M8hVxRla1pvt~$QOHI)O_J)c|Cmkh82*x;^dTqgXszl+yQR;0K??-ZvUhaz zRz5{y(IVQ-oSoH(&JN;CJUt|gh^}FLmuh5Gglp;8Mj)ZpW#>vx*r-RH#3!R`Aq*bJ zW)zzKz0r$T^OD4EF}}IvikMffXGh1(Dc?X&6veyZq9uyqx15T~2dIHJOMTqixA^@= z9wCX-DC|@7L24(bm>VCLc+~+MbE4qFgOT%X}}-`iq@03L(so`sVPu($Mz6CTB4R 
zYaM&Tqe$;I@#pWg0{@+bl}?=dXJSTgG0$N{O2t+$b(|q;GqA@Mfy8d1B^KhmrERsG zLT2-6JI!N}NhdXg{M<$D(+%jNP{=3pS4L&3NUfnt?dWT)J?5LkxV^}B-nGC?y~fcd z<`6sug#FKj?+tBPYWzbax+*oA6QY;8al51i zci;5%$-8_X`f2gPo{Y6kjG<~Y9*_T>Q%z=UdM3Uo@lgGv?>_CPGO8c{P;}QJ(zo$0 z!%P;v-Dt4To6?CMA;&mfz;iWT@S|QVe2bV{WbC?u7#l>MeOYO>bKsCHwQT+9(XuzV?Mr zmRM#uR7lfmg?Oh-s=A4p)iYYXA09Z$Vi#L(ob>y8NYOe!J4Qa{VGdtiLLYu5SO3Ebj}E zZwdV@!l?%>Cd6}|98H^?&SxS|EO8^}-P9t;su}dTeLN{3oV937ck)z_WuJWL9YRC~ zgqOW&T)cynNwF6XZec8L+Z*?O6}z^(x7Gw6Y#)Of;ND7nqn3hgX$V3IhaR=%pO562 zjB!nlK^HX1Baa0cAHK3fl3oR_-H7Nf%1UT*!x!o7zTw~W_lx&z{oN*+S~}ueMI>H! zT@~$Q<-1jO2;*6Gsdl10PrC=ahnZw;l3lEIgpjH|XJ%2YJW=u?q%@vRg8jPqeQpL@ zBy2#DQ zJY}q?<&0VsbO^V~uU+8P34Z%%s=4DNHS=MC>!GR1xIyCTwh+E6RVN%HMHp<>i?amk z0oIe6Sp>h$RMi|MpH%Ary@>@$v;}$IC7%LZFL>l>nQNw}&Jo__CM%=vd}|EL4yi&Z zFI{54Es%FR`9X`S;gClb=E!>QAna7d?I^*F&WRw~7a@aB*6wLW5{$XMoI^IjR+bqh z6i%sgqnTud9XwHLTi1S!r7`^X=I%;e#sf2|;>IT}B{q*0KHw*BDdCI!9hgs+BHM}l z%4sbwHQhQ_V8^5L>d$rVi$B!Dx1LCH>OYi9gES>%Xx7nk16j+>Y{uqPC{yI~gxnHE zh~{%$O(nrf!hZiVSvT%;sO#|QMi7??p$OX5 zFeG`Td6c?t)yZBa!I&fC_mq=L8C%-`H^-W1T9-#4t#EWT1kh`0H*4oos_`q+wqw_O zx9FZ$o;14zZhtE=+t8M)Kzn6SppI872j+Mx@~lVKOGv;PCVyDSdLA?Y-j)u;8)h0+ z<(s>^5fpE!2TKlgh9pB-Pqr%&>=*mJ5GuQ?cYcBSKeJwUQ`bNbb=|{8O6e6&PyFWo zp+rlGKcz{LAssFUBRrL3xU0UrZW~Ke5_;K&LQKBbL9w?KDN!oe%}L^`)2CGqof8$g z9GOb$+ETNeh3erk!4|32=50AVigBwv{lO*$oz??DI#10CgeA;zQ9MJC{k!bqPu5Y; zlzsd~S7N*F=6&Jwn>G8-Q_xnM=eegsmy35X5!>Plt6D?}``|#sM^)scc>aK^$-$9g z|7L!8=1r;ZZpJrd;;C$DoIj4kqE(t(+{upu8$Lo5&#XrREB2iM-P5ARWFu(4)~Mue z0_$C}Bk(j-Zi-_4FXdXlz$)FcIl57a&1(mFK?Za*iNi2uu`-qU2$pjxYXCsmn9eKv z04Y67PLg2XEW``=0C(Q*BHB1G$a~AZ-#okLP1uYf>{v-6aP{^xFGVoXvTcD7ETOO_ ze?NZ1uq)7$=_~h7CnjOjSK16H%5A|9bxn}9?Gon$o=0Ig)Q7n$QwEC{`j&Wfbgl0($Vr$7 zr27Lv+dQ;BxdUWs7!_4^t)02fu7E}Pa%0KSK`Lqh%?6uWh!Z`|6DN!X@+c1H=~)22 zY{=KxR;&qu?#<~>`=TZ8rB0uS&s_~ulWdm+WW&tmpRzgIDI}1(l^d$~L=WZHal#p^ zU#`gL=eL3 zHbGJIVkJ%S2v)LKrj8s@dSBDV++i&v5?UW7i>v;i&;VL^B2w%^3Kuqpbs?20bkUV3 zxcwa6j?P4D~WKqLc7viB&_#UN6eRJJPPLR3?yOGbXi8<46;2CQ($Np{0 
z`^DYlF|$dO34Aybd6(&gscswklKHK41+*gY6_`N3UuVfyqM>2VzFd~J2Pv5POF;7( zK_>reA!5|^&bX(#hD4BJ^M#$Q_Z?|Gb3{oWVZDD0d%`KBU>|(pHWF^UO@O!I6a4W4 zkJ6;1j9iCm%iby(ufR=Q2);0e79!19U|;mqr0}Q~hCI!^zi&ZkbesqR_Lo#Rn$e@0 zn(hJz#~+&?*UxfJt6FJ|?J`GTH|)@khNrBKbKCP#70;{Fva*Xijag6N;?*D-Wny2G z+x>D}EQ>oM`fI~Y%UF?N!TZNM-fjBALy#YW2M1`bMxB=SoKhBi5bt+Cx|!UQC=Ok3 z_-#vB(4K?mB{V9gbdbD9YStFyEj?(`aPCs|C34g<&kV+snFnZF5_a|iSmYj0(VF;2 zKoXKbGI8y@iiU=sf%?ZA_?={MAk_I!ptt(PG7|3A<5|R)X?n*ml43quE>Hf}Yn)aF z(bnES;HuYEGnU#k5QCO9_{@BlN2qo|`Jjf(&ejzpMD33!F%VjDJ%KVCCo>{Vjlo}e zP5->x^@fNmVxklptiUB7#Sgk(A$%l2w%b@YqiN`Ujt(__kqLjIf>U)RtR?wC=y|U5 z`kY;YJ%gl2GPO>ru4Ut7Y|H9xaZ18)&l$r)yM1;^6O-|@Zcd8kSBquIKTMsHsZPZ&b{i#7T{ znicH>6ic1ODxEEk)srelZ&peG;dw3e;{wGXKxs~j0qzf#IiM`LK4+AxUTOPKMf}cj zPkBgO4ry*~?0>5~g;1$n_X#(IXOoOSA;u;9z<4j+TZXq!$zT4(NB7+H*_@ zb4sF%ii&>doE67uIIq?zatZn!eTm!38~);|#rC)TXtpgaoogu9mdEeTIhgV4>lM^&Q-nvTi_}KC(3yBCsa?`JxkmAyeJ0sT|3jr+~tJLzU zmhF={m4cjRp6)_{UUJSUu?!c!@VecAGa7OBAe(Ct2Eqo;2#fAp=IMoPn!kNBN|Lp5 z0%mG#h{GySDgUurw8i@ZqQS8r3$&pN|A&Gepi&tCoIx-PmNoiBZro~vD}}D!RsRg9 z7bAUZvFuG?GUvQGiM%_!IPNas1(Eq^_Nzcv^?I4)NZydB)xY==D`$p}`f|Rell6JL z5MS_|{D>9+(@of!c52A}1jW{2`b?tD8@WnIi~@ks2E8tD=cL{@JSvz}c)mS@2%eAH zuVfj61J$NmM8ZVf`sCg{bxA$%ws6^l91)YmFx`RkrAeKOv(i$VE_{uF=e>D7a8+lI0v=afW-ROQH z3Qy|F@A`*A4T+cV{b5@`4$G|BdLq1fl@VH=;EX>;B8ynDPLf^n}acyb700g z-L{tcl|6LN``tucw_&HHT~&QdRz@b@{a%7)&HfGCdixIAUbULp=M`_}lLfLp-RzPy zs@hS8eUS+@LHW(QezkZLs@mE1Hy5`Qf}m4?ZvjA!b7H=#+(7Y%N%kVX7UJhs!cF1k zLN}Ha$C`ch@xu|^XaNXJKE5@FCvj?wzH6m8D>;BbKmJkcN4HTWR)0~!PNh{!ng zI5b&sNfI&mOewop=k_C3R0P^t{x{ef6yE9`Q{N#s3Dxp$BGqlYl6Jn0*STVf`)Kk0 zCDGMuOzNxt0CgB+cQu@7kTNU$#90nZlMqX3EcYEdzNiUR8NZa8$XK{%BbruxF2Hu{ zRA#VuIij)gp(K5JjBAz2b&y{`fDi)(CDr~G@0hcc^H;_v2-S^)*spuDbVsJSg~9Tf z{j&R8!^E7y3Gx%+#`3+Yv$GtpVEp7~)CY9f2J%0+7ZO9uQZv_Xyp4sB^hEy}Chnbz z#zXGq2+s&TQrJ!cfHn(zHc)*8KEi}zw|Ge8p;!&SwYf(W@uJo~Ig^XT%8DtX}a z_gaf8W_AbHXd|$wo#-y$LL}6Oq5SZ%C}JCN>$uXqs;H~k`gg`Rk8y8#PP3Pn{pw=e z^L11|zyXvc(#Li?E@Ix%znAaVM2W8MfLlwXQS{n(*jY&~J=Tb6!e}H+%`jK4ZGr2< 
z1j5Xdh*Hj;bVCE9Uf+Aunh(T@;@i|+MjK08Y@YqQ#6HKp(I5_oqTeesK3QXH?Ji03 z^5r5T5RXVsEAva$-ofND9KsooTa7^d}F;ev?@OWv@CIpEfD0p@z1q1+@m=27!+}nzY|IFswdfAV@w5P zL^v28fTce z-s6@*oN1bKb@&yvLP=FHltk!wBj5#mBm#}n!+Yg!m~tG>J{~`$Hbf zTi@m?NfO32`b;n_((p=sz}@`z?cf8C%i^0!|4$6fFGW*HTm1QGrpm#Xzbq+&kGa~618rj4O^h|s@&k%W7Uh(gv zi}j&kZ4*EVi5~U~!$jjG3N2hv{HEF*7J&@C(dWMs7JJi#krjE#TgB7)7jx8e!n1{V zf_&S%H=raUSO^v7q++DhHT_f4fRat`yhH+JAsx>wkdduDs$9YyS$BChJ#)#~-ciJ9}lAY(JdN#~lR+wJGvcK#KB2Ks+v5izo$Qvz4N28twmg3_+ z(jfIN>ZHAzUc}*AtO>`#{8e$Sl*F;xW;)cGuMOTxITL+;2;T_+eg0UT<$V0@lCeaB z*nJz?IVVfyr#1C)oO?;!%z+nzl<~X9aExBY&|*9(_Tf~sa&?Uucjps8c+L(FE&W{M z-SX@t@b822UCij(K|pIg2iZu7&yu06-_*WaX&&t>k@YpKxjOT1;ZTozefEn#y!v@4 zBIWx1@_(tD>8EA=HV#8+;VkEl8tTp7*u_T;0V{h#YyAvapPCR=0Eni`=FqO4l?eZ;$X$gyCxGSKstU=?nk6#2F*zdxzgK5$UVY-c)!0d^|NJmVP<=xg6_*N%nm^g ztdJ894a%<6#pH+#Nu=l|iO6PVv}H3_)EAHHhQC+Ym3i8Adv-}^$_;`Pr{uXEe=%&p zH=(>pC(#u{vCbKjpGewz$V8VoDHx*?`nTRwt^sR#Nd~l=g3y{n?sMFehBg@Zp|0_1 z7EtLih}UXUqLBoGjEVy{oZE`Du(v<4oUczIq1GP*g^Q8>SJW7?BLP*?Y(_OsY8uDq z=Bs(}Ztm9&&C1<&G`tb44O8T3la^h(P#a!tS8$)C}lrn_r3TQs|IIzsuV z&f3wb;3zecgCQkCAG$`nWvSH@=Qt<};mVZytLc?$U=@qdJaT|Sp7G?|(?|Bf)bQ-E zN#Y;iONS4??ZwmT*a=b~_{<+!3VziptAvU*WGb|nbS@J;L_XNj`nMLpviyCV{H!=% zZog|e-g>(dL7j~h?-|ZHhJX(Myo_FfGn*R{!9xkaLZcSN0+4}MjV#Oe zBYU|xnl)5^*oUWOgh6h4n4@KFAIkGurJ($_aM=p)`K-IGLnKjpyH{3LfBwVdr4f`> zD@*1Wrem9i9_Xe}xN z0Xf!biHid5)$Ok)r(s=aZ$r zO4BQ3hYxaSSDf_|i#)%V!Q};qnXhMeyt7LDi!Cp96sbj#Pfr|8Dy2MKt<)BHN=%Mw zgPE*oFqbtnJqoZlEM z@(Fr1Di97O;0rakTCpr^?s;L@gy0`KAGkN^=xw7T6^fmJ$`Vc_{hBsAjR~g9gY4BG z((|GtB)&Jm~XlUJEvY!1*NOOxrFrzWk>FyQrwVP$BG$_GK!%e02)Mu&X34K;Is23 zICjzf#QR{&lV?n!GV}?SMjm5tN`)=Wrooa^ox|5%ai7!D)`nmV1111cxRo=R&F&9L#p1( z-n#86AErJlMi&c+P7+mmuRrr3_dCMTq)Z=jA1$5XY1U&*`+hAnQatpuO-i6t7EVj? 
z=xk!0W?koU@}W6?g`yX%{oAjVow@u8m(`h7GA@{Vj~_=|zmD$6=8Lw5z3mu>)+7Ky za)hb~fP2exIx4bNc`;-UOIz_j7%1ajZ@iC-S)sW8Hl>M3@leEj_96U=2X`Dc$q{{D zE%H(%%mi0jneWKAG5D-o7}-0mW&WgY!qKwA3Wckry!CLQbHw0?xZoWhLn-aevMC?b zJ6?+encFk(muh+{kx}(;UTB5!DfTQ4r5rrk`*X~9naP}UVIue_)i!~0JnPTv`beF| zA+`=BQMsDDh-!&f6FnO1X*H^A>zmw@`QySC&d6Bz~$nhO>Tp9grMTZkoE;jbfriG z7rKT5xRg@zdyRApK1%`)IX7yKJ0hgPqh%j$e=+METaXLq;;K3<(-x{32y#4_p1)rJ zQ+sI}xTj2w(GE6fmJMcWK^H(tZpxKwsfQ~!$5D{Qz&%4=oi((89z5aPa}dooIClx) zp{zmrsi{@b@}$-`_)o4?Cd|2|=3!w7>eJ}3zAWpHl`};x!KrP&j&Iy-E@PaKleMsN z$Ml3@4PraGoN#x$WK9$Rhu;$5IqV=r2k4T<%4OWt`9zq_Su1J|fxHXfJ694;>oCyl zx3&!Zm@qJZKQPlfedx&ys=ST!_H!e60#Y6tzU#z0aE__1K$FYKA886Ed%@@wxUl^3 ztbK_LQpD@kacR0o6LkZI=czrb4*RJaR@3~U?r^?~giRS=`u7uh{;jlVn7Z*GBr@jx zKoz^s@PVMa1o&QP8kNGYcl&pw zv##2f?rq+smex@uduo-G_7J_ZeXa}U>WZUI^4Th`Y?~w9<>O)&&TBSHH{pU;Q5VDj zL#@MgY(Y%>Co27Co-{R$OX5TUrnEHbEsUbNxLiBPt!UHQ3wQ*=D~%mi7+9^*3QONA z8LgOpV%u9{3C+T8va5FfI|>VovH6*y(m>Car^*`TtKZ~n$pS+;s~3nvd7U@F2ot^h z+OGrZw)yp7R=*I!kmpR}=)+KJ+gl$u0I=LcLr}<1bm#SL_U*mVxBdh`%QbaV?Wk~if6XXV~f_V&Uooh7(sK3 z@Nyj5-8f^zB2rdZ(C^pLPNa*pfco(5*@kxsRPEte0?VmPt|B!FBQn>^3${K%7c|>B zaTo3i3&D>_UZJI4k@Gx};?2eZJLCIQ;=-^+b|tYRluHpjSd+o!LOAm?a7#DI+1F$F zXAUlo@2uo24zM>)*$x*%o@9owX46FQveQ5r=f=L9ni+e8;tum71A*B+eINzrEMk>PvAO?1+24S9Kj&lW+JXG@VEG4>38cVgCF?nhL#Gy8l>oAx*ExN*PI zu?%AL3;QD^bD;G8&`{aq_-^>pt?(rquP=`!Y`ZCV$fXHOSN~y>6*qZ=$QFPRgu7wF zYZ!6iF^)XDi@Me`4@{AoR^P=hDQzfEJ>;#5+%K0G7-7Tr0nOu-K4~a64=}V)uVmT( z))g}JCQaD1fA5jm<6p}YbK{R|eRbFra{IoY!*L|HA@<^jir>oyB9)#9h@ zcG=!vV76S9!Ew8equga@W9zYnhaf1Xdm)w(|2>}+&aA5@7?gkLFaO&a$3PmTE55qK z-&7rVS+o)YfD_yrI3d4qa9wx)cvQ84Iv-D^yUeu)uMHNaWGs7Ji!y{F6*)+A-Twmw zw%b0DKQql)TuKMKerTLU!bGJDebj`)AzCW(_$s>iV`TlB!jyYlsUTgZzRj|^Y%4js zUJRh(;W_8cdd*BvGrv8NGLdHe7eXiaHZoCpqpuwO%0C#kjTaT6qI{PGN7a~vM}vXe zXDqw={;};cf~MO{-~k4#5LbZ_yHDIb=2E;Wj)QIiS?*P;oeR|Ys-cGBA^KnOB}FuF z(yu+UwKN9iO3||l7V^J~zvI5EO6w&>zT}DH^0;Ane>iQp%16}5Oc|kk38$5I#anqL z5L9a?a^Gwt!?A8MV5zZ{mXe)0Z5%!)5~SK}ti}Bv`^z%Rr1S43=BfTh9Fn#2CM3X* 
zFm#SxdYR^j7>1X40wQHtvF)=kT$kj&CN3l>zWXrwdaaq1HXTLOY zw;6RR#i#FM9!ggT^*)Pa6Qo$(wH0bjR-4vC@XTlKO}xf_Nm#Rv-}jIRh@+4FO^eMy zAY_>1_uUd^vft<5VBeeR{}$X7WxZ{WG1N)0f^c<0q-nf5wSZv6hkos4x)JxuBj4aM zv|6}pOLxisyh_PHJClYaMOqU0m$ncev0xqNi#i|OKE+7m?_O*K8Kq@hV!_$esi6lFWu6~f?ov> z{uuOj)7%9vS6tK_c51Q8@jL-7U#1L6Jg&}rdDhrws|2yk?g-y!Me%B{E{_`U_+6yF zeZ8CZ)7$FI4CAO+$I|p5-bOi`4eS!sKO8G=J$G4qZf%%{Qua@>pL8ZuMvT>O$migy&w8+eNH5$imjBVM|-MS4k&V0~X_aPeM%-49s5V$@FN>$rPO^3k^|_ zPhfZGYmv`brTCV7y6pa5N|cf1pGo9D)#ZMy&mI?P5yIF9WaynFx~p2+{gm=B=8p^o zn!h})v}HlpR1af_iJ(0|mYN@-j{)q>X6{>SC`fu~KdB1Y>_SgfUMN^al3^kNH)%=)!@Yb~I=s`LmwxV+NdV*RgL0Tcdb}(h?&u#3he4PU?;T?J2qtkHg=a@o4M^8*6;t}u;^T2hEXbe$KY+u z88Cfmg_tjVH?%KMY9(O=ch?`cQ{-73R^c~NHkryVF1#~Sag{(O@rA0sLIlc7@}oL? zv=)ORRU&lQN;L%4srU(+SeMqmx|R1lzuLjI%IO6=S)kD_{61f?tLm8B^0vUGpzMO4+<4HogWgFkFADI&yn37d9JOBhTsW1i~gRtUEvT zZ7!#CeerNTX5(veS>kF>D6r6=BF0^%#{1#xJl_G@s6NHpoFMA$x~TMHIvmoxc=mXr znm}n`Ez2GwPH19`T*+tF_P`%Rxi*aR$2NuM{E{lQ3gQpBS(yq0<@;(8w0F#rPML>_ z{En(TCtMsfb+dQzTlom{$HEGZ_|#>c=2V;NuGN&c^wUg9zYdwFhmsaqp7NE=?f`>2 z4jV2X2P)gwoqH0w(|-^kc32I48W?1^5p4q?q;WlcNO%}Fo2`i_ku90TQxP9{qAjem(CDTCvyar0cm0?f9n?b_$6(GSvuDHj-Tz`(SYcsijJWX7RIv?MLL zb~@D*ovNuw+eb>9KX!%rvyTg#DZzKb$TO;vD7BwkOg#9U93S5p#4y+gL~;uqmDEK} zuYDjg8|OYs_#G5YkB0V&kh$e6B_z=26`Mzibj9?QJZIuh$cv-lC_$>gF@8pKi*M2H zeOs<^a9#x+3zeV#v&Mqi!n1ez>iBXrh+VfFBjk%o)FbNkh9v)%MQe_H+Gdek{EOZW zTv{fo2;E=Ml!1S)P*ry|cW#joHI@(uv?NQW;5V+r+4RcMF^hu0ATOnt<7a99+^WF> zbyMcp;^%ph$RmSYgDPG}TW$OKKUbH^{P@~`P{ZR?f@1&d)gE^Dvj-N<$`*yv`l9IT zl{c0+peybED&0LbNav`q&l1YE`{J**KQ-M7zYW(Y6CEhE<90Ri-SkE7Aj=*45Ru0X zc&P`UF1r#yiZBg(J4Mh=b(rp@*SB#Gl~!|i{Mz}FUaTL&xEN#gjj(7RB35J9b*inu zPBld2e1)oiR94He*&3lEAjkYwz1FdS{h(tzn13sD1eWE(3>`s2;}Oq%xKT?Kk-NC$ z{DSAcCNkZsFEH3dXc6REh_6u2E#7$YeR!W3#Dey>vhxlZIY-LzmTP-=(HfZY?XC^H z=S%7ymmgIemPOVzX6>mfd0#0zvzNrTjbHBALjK~YzG2qvNQ~^w)(vphzf0eIKfLa- zsTHpQ+QYxU;aWDREhcL8)KX}zqOiePC=D|3D!^84Hej*F4hF|x-*(d`I#&}HEf#HC^=2#x~J}PUYt&p*XouB zo;8L?_5=`6T}Ufsm!X6dp#9a8`u6i7`IFfBfW)4nwL@A|t-N*^BOV$$)y%5?=AIZ~ 
zzU@kJ8T{T!A7{EKUry86epN}CILRKn-YZQ>Qr7-h0Y+nMZ51p=!*PLB{%t*Sk8I&{dl`iHI0Y_=du&V8nvYr zo+1N9QpLbXamjFnjt1h2RFJ*9BjHQ-A9;+zR-eJr{){w>>{Dn%bWJSI7ZP6ltrl;Y z%Bs_EngtV_5`khG%8utcYI?}gx}`?&IO)m~q7bX^TZtkmrc zW2?C#-N92r0`>A=%So}m%G+w}KfK(FLb8bq`iXs~h3LD72u_vq>vj0PUXnqYB8Hwd z`2YR`l*IgxFlgGHj-JQo!?d8g!B>q->JG5bee|ujybM&^Z1$|TEczc{%%b`)+0sxS ze`wG~>F-(Y(8SYlROeV^owIEZkrQjrMFWC4l`Vdte(?jxs!4bq7Wra_#2+f`!vP!Z z!gkd(d@{Nbg8l1CA}u5>3%(Urj_FF?h}lxHTU=NSJ-1HotScM(i5Y+UM2EO+G~k2r zk4N>$Y4&6nhI+UJ^aSJQBEI)#Z!&rP?Y%(FF(M8VtAEROta4oAOQG(ye%=xp<_0f=V<;WY+vbH7Mqcq(U@Rt7PH`W zLj#xtC-8xtEhT8c&}&8b30F~#B;_D?ci1j0a1{{zD@#2Of4E%{GNW#)xw zeyc6V(>xYe|3oW9WV#L$8j`_bY))f8c3`tsZ8h(ew7xvL2~{jG^>gCI=d!e17GG|A zYv1k(T`%-&&M1o8Ypo|fnhmZ`crbl3{zd`IwW)HFO0d3trGJp>tbD1Zxawp^iD0Ch zu~ordG1`7u7}dCkp*!5HCkiX$5rMLN21W`@w185Q?y`FO4Y5(ZuMC%XLASIG%9v}i zU_oh0@WKER3Jr8-5J=CB6y^9rQJj2S231^Rfr<1uTeS1pzU-iAcB8C|CI z<~!HCfpmBY3b5RTrkN5ir+$o*&eS7lLb?k49sLexfW8HSKXj}~5|L?ffpe?sVehPv zfijaGdnNtG3j|$xNWJ~58$qh2N{6wll!C);H%g@;fWMLDQex_=xy^e+(U5UGQLp}X zD(Gd-0Og(&T02`g*v`$pesEUZ?DI5hhONsD^!%o8|5eM>(q_LX#{qO7Q`h{sAYUYC zE^Zc+A1|$ToH3X`BjJVh%|t-HbTvnWY0QBQT5npHh(}E)HSWMR38ub{gCwxS_+#l7)hxwnF| z)OrT(^(#Kq6*a`p%NK(5cU~Z8`;ES3R;F!{AUq@_K62@0()0?Tx2;t|S?9iUN8@cA z3H^=xr47b$t3qoqk*{Aix?bMbbBq*8GRF+6QNfmvW_$oueH!Rley}+??`mvbn9poN z{^0LvCC#r#(mEl`z0vZd8l8s{f#9^v^p%6uk_x7(`NKSBtuLd0c!$edlM~rlpt$ZE zCzR*dXEq-Mcv{|B=f#^PL?8n&B{FK<&6rGqpGbb0*E&NQtfpIAvP^Ej=h-^Dl^L5z zou9iss;(%1fl{EI@G>fsdrjH(YCDu2sax*xm0#j34o9piWV?OD@iQqq{*Ar3cSae@ zQAZ->PAXgaMKq$38>&3+z>zg~>ORczD+Z_yLjSX)>7uGz!H7e$Us_o@dax}wGh3IE zC}iO-J*pwbwM>)M044E+sO2Uwn#U|7@!r zbGtJNOr2z1DTTU{|G*N^HxwQHRQe*Ea{f6zp#W)vrkWU(_-sQ31LDQqZs+#4vA2&l zBH|KjkSuJq79VNI0ZTDt$q0A$aX))L2`*WCT)-5M#`(MZDpD6fP!v)Jp^T}wwP;qt#fHXA2d8UCPdM7qHo$nVi5`+f+Ba2XVwZy*F z>J74x)xHpKfkM!!(uywi4@ShFM9{LJ$a<0Om8UteW7U|{zMcst9#?8b6)iVMDY2~# zOdC14k`I}jWg!!@Oy1id`w1EUY-VT0?g>^IC6w;MyTYyb?GADiT!QjzozYq>Bt062WwL$=`97R42g9k`qjw*=fy!j*old+$%UEMO_g`gADJaF7T`YL*1c4Gz-I!k+3 
zcnmWfeH2&BF71>4FpL878$a?y{3TqByC9EWXAx^eMZe1teTB&dQzmpgWz;J(=ksn? zyDyx2U|~M58dH27O_hb;2zif=x0S|bx^fdFsXYFM4)Z~Tz~g>w{`N<&fH}@wOlZUS z{Reyj;R_8{%bQm$gPyx!s0kH%9|06UvaNW)WON?RwW^WO{ANlDd^N6bP zh5A@recG1uWVOnV%iu8W(b=}EF?Agc#QsG^=I9OOyOKtP)Z>60q4ImYS1eyV;Iw3H zlP$P}&S^1`%Lijsnk>X;VA~L$!rQ#7EL#s}Tvklw8n?xMJt@b?6N;02Hi*?{)MPJ2 zqv-&@tU?aI_Yu~fG^U!y#4(ziuT7LM)?FYgeqBbvBdBZj{> z$^nKrT!q(CQ9@Z?#kB2`2V+h!xsKOxUeX$B9N*rDyCT(JWt%NwQ1Pk7V~BQF2aH{< zC{Y7b@bwM-!j6_29Rqu$R0h<6ZqJZ5#)43N{sl0C0@md}z#E(^efB)|uC+-?lf%2- zd968%Rbs~i)7Un37LgPe+aPRDcxA;8WHrOO8J|CTL)U`ar*ISmbeEQbV^qH8YSz-; z1Z)S9OakSO1h9v_{pq^`lH6QJhPY!cSC7anCyTQB6W%Opk}&rm0co+e9S{j_t3`0r z#f49}ET_%2v8!5+4$`(v4%SHj%xEbR%TWUxv=)?Y@F`d1SeF>$3N;&>n`ZGKO?=3W zLi{uN&Gg^c;|U_V&qS*+DyN3MQ-@Ko!yYQOJH?Mh6Y~H8{D><+Hxz$js9`O`YI}W8|-u zlGQ;x)3cx3H%X*$;}#lF<$9-xbV78o`kl(QyJq0HF!tg_LMg^F?g8D zq>)OoA}e^!e|2n1e7MWM}KX3IKPy?^P2TRE?a}Gce4_`@SjQL+)ezk2$eP& zzsxE0mDXEuy)d|1*SyGwU`OPG$t&#EdGh`G7*Yk7*74P`} zn|O6 zIWp1+-=fk^b1CF6ZME6{E2Lepu1;tI%^Nmq{Aqyw!C|v~bgc*zilW=_293Bra_ifd ztHG0eBK(8@0PV`NdP%a9xGR%Jza!}YI|N*ZT_~;-U<|V%xTz z5LlZqOdcq<)!zS!A7p#6m}75m%TM{$5i%$LALT@4e?fg@nIw+`Z{Jj zC=qnclRS}0XHPYJwcAf^Xm=X+sm1d8F@z&V!C0X1fj-M|LPslbvT&C5ge_}U0N0Kn zi_>Z-rZ0M7HJABU@8R4etUn4OH7Ft`Yb!JxX5=7xvK@CQFL=!u^`~?hLfIf!HLcJ` z^3LN_aiXIyK4nmD*r|2-zU(aF@`B~Az95a7663ci-j+T+58&k2h zDz)Wjx){lnpP|_=7v#PM!I9KT*O#f9mT-QbEWi;|kBm=UZ|Yw+5bWRlE991@Qp_QB zH)RTy+mj@g;oATjul1X+KX?)6ONDpULT;G{7(sGq2OO~iuGJmw^eWFR0cs(H-^*d& zZJu8i4SxxUR(C#=c0V)xxEai5dHpeWxuSic+H_53*~>h~@b$Uk0R(dSP%MG^$#vGk zv{6^UyEZc9p94>xEZjW9T3gD3F*BCVWSeDyTq+1MN_{Dmf@DC=>c)zlFQ0IQIKI{8 zg@4vi72>VRY^-qPZ7AdS*4L+0Z|pQOV;I~1B{X64!JE%^jkur;JCuaOz^cE+IzgVB zb>*(|gY2sM+k||Yb==?s<}h19jg$-lCcdiQO_&V08sFcXP8ZFd<*dpOP}obhW_CJAFQ56J%(`r({%Eli9SLTK<0Q9N_Na~A0&_(aNp#gH>D8C$k|Rus z$X)nxrOzZk7{P!zj&Pg1`K|?;cBwT>pCeEB45s79~~AGrYR8RczCL9bJdfMl>{bb_oxb| zvBjW!_9BXZeQ1>V_=n5|V|Ii(mWE9j+u?X7r?>p$0!lMApZX4a7UF5btom6Q)2w3P zZWu60jjEUA=TPYJJMl8d88SMAN~001XyQ-2!Z$-%UjWa2>lxx2bE)ize|p~>%d{am 
zmXXmrX5AxCw_9$PQ<36|;p{sm+e#tF!M|q0Tgy*$Z+{}@xD#I4$MVn%enrw%Kg&!# z?=HBb$Jie%+UR$eL1gP5sww@=KyDda7W_23l&@q)Mf~P**6rYz%sYUck_{cGT->IT zj(5aWhoi8UnLDwCQXc>#H=VDv12)kRJv#Vjt3Yen;T$dl3bG1f3_j)ElqJ=reV1#_ za)O;RnR7I-#jR9bv5u&Kd~3mi@3}@cmQj`x&G+408k%PW;>(+}8>z;z46fh4qkMlS z6jtft-!DEXk|6?npE*X$99<4Z9CxcmUgyetNfQ|h?Be$IcwtcG)#OcPL0`u$HMVy2 zrtd*bquLfuakxa}^|ygHWMRr%+<%wgv`%YA13y%u9=lY(JFbN`Opl=fS)OkgMefw} zwJ%nnz|cJh6y|sQSK#8tSbb^L&mf~H1=HcEbGuvQN01rsK2O6+{y)HvteGAar4a(C zWv@Nf(Y&=c*wq9JzL#EMz7*MbcA|UaaD=thJd-NJS!gA!KLf2AFAg>Pf8z3>uf!|c zbV8ax+ex82cZy8RX6(hiBE)Q`s@Uw0gD;W{{obC&`hJlA2N*2Vfbnfn3n=ae=HBH1 zA0ISs_`~PEb#{~u_v}>f(Hf9j`~ysoJ-_{QbwoBJY~vraMj>&nznIeje=Cnn7s|8L z45OiSEm2pL&rh-kjLs-3$Qu{MHr>O>d|6(>@fJs2sGd zHMYe_VM+f0F|!FiWS2x}QRTRg?Qqzh^}R_^8^YKcD4_CrIdo&o%lL;8ou%l2)rCq}in=lqkMIgc9v69gBb z2s4yf;uPVY$td=jO4E`r%PDpol$5_jW(g#v;V2^~`J(M`X%_*BcKvaV+2N!cnuo%W-_DE-_YK%En%w{`2y}d|SRf z`uH$wo1fui^G$b**bVNOd_UvnwE=Mb0eGS|^5=}M{`Go41Lju%hU@W0O3!H*EA zQT>M3x90xGOOEUIxq{!C)tz{&Unm?<#Yd%-&?|r_NH|s6s z8vQp}-(ZSccmzD3!6yd)wHN(1gK#ViFROk}nsZ~8bajK%_5TgLMZV)Es(XUu?7)Cq zZ*7bAAph&-nY2x%b`{pv+M%auxWW6ocsaMX0x{r2e!+!rmD4yOd?XRnzx^mIE)8+~ zFL1~fVTK~@4}%LE_YrpMW<1>A;2}G>w}`4DS$_C20tECASN;KJWc^`tz7m$jONSdj z#Muz4~<;k1YIS|PK#_DGCWS5^fy8hcAB0Q!38`V$U4duwd9YgD( zP9FFdH&}*s!7-qlpm>(U6Maq9?!QC-_(!HN1u|MZU!t#{;4}={Rbh)Xgj=cpKPzPa zSy|WsMQO3&d3hqfX2?Tc#eZW4x3)B* z7YjkNLj}tyf_?SF!WK?K`nAeJb)bUwNa|RX;CDV*7!8)E@6_5x-myCIM8Ar+3qr!L z8GDJF%O``;pu2l8Cyn$g}>y5Ng|D z|8^x=(!s)u!E&dJdT9kZ^r;8nT~MIMp*~Ij69Ff@;G}SF(r?NWRR%`pQ5FLEHOUX^ zTODGZv1S<{GpD0}!0~JjrJOt4 zXWLQ!2^I@P=EWcMCp>!Q+0>T+Nw{1Twxh1Ro0Pe&(yM->%+S~R zF=cM~2_g|u2W^*((<)R>sZU9O=s?$w(q!#T_;b@66Oj)MHaE{dEFb=fu#=nMD4CT0 zpv&);4KAjT02BTgLW=dgrEZv<6%d_p-nzU>4Zpq0A&vbkIM)y##TMgJif>EiLOb|Y z)BErB2M<#VDzmI@fLbAR$jwWQY5Nmeqtp97+ac2QYR>z_#^sM|{RzQsl4IZ4t}dj` zihf(A_arX$t-O68XtSJeL`C@d-7MM{Off5{Cdh7!MAi}O;N`4AR4Lc|)MHj?;QG$_ z{tW>EX>-YM*>ME9|=qE2{w{WqkH06*u9l1*?dEVzj*G z))i6Q>~=0}K3io{B}7%DQ}>EJiymt-{>oTzo)F0)K2!QdpR*`*6jJ|p0Q|LTP_#Sy zgm=HWJH{KG} 
zCG@^k?Sb^3zRz~Ze>GB#{|z_d9;q8VaZuPGO&0*-xCU>>5`aLaa11U=RA? zPdINvskZKFVz|*7(0y(wG@#n}<}v@I`IE>+Ne4lUw>b&)x(_2}1;oSEK3`DrBc7kf zR6q1{Iu&p%hj1kah(i>`Ko~g2kL$u==kWW-{oqr;esmV={Yk`kJK^&WU|8&M+L`y^;ni}NST6T2Mfd&%^WWTyMxs!tGLsnN zQcp!Kvnj>`73+=Uq3(EkLS~+62Bhs8Ya2x?Ou#MLRJf??n}&T2EZasFJo_B;(kS|$ zC>b^eqP3f4LeRaUkFTMb-->AUAb!xx`AH_rn00l`@vUCd%Y4%7kqF25_EGa`z1j~jby|K4DZImY-vBdSbZ8>0DPK7bY-4J?V{R)v3X-8+OX(Wkzh;V)5 z7~_>stV%$=u{7uHHMg9MeSFNmRX63|+v?6%Vd*qQsh>M>Ozc^?rjg%OSNgsWZ}2%k zQzBu^9&ER_ntpM6;*S;BiJjA^iqDJXG+s9P8cF%dnl^_W3#IGIXTeQVt0x6t+JPNBgXTVyHy=B?+Y4j zFLPzWBcwR-!wxb+{=M~#vz~8uC$F>AI#WMMb)$~M73AKpFo-HFD3lz1GcC9LQ&H~E zwG&A)md+=5qo%_q$eI^6q5X-uuj;Kiog2S(eQ|c8D<8rJL zKWJvdHSg_?52;CIRLS#_;4u!ce)?1aEzqlhN*2s6%}{D47>_|uU>2z@W1qsE+wQ#` zH(#n#ir&oj#mO;;p%!aKEoRxxe&Qty@I;2#OhcvdUUT`p{uk~mVJRy9_RC*J$L@l< zSP|az1@AzopWAUdMEdVxIfFN>BNQXMo3Y*f7#Qf*%hhRwe`tzW6^*(lO%voj?N(@! ziM=$6L$%VzK83mxr}7?8IL<9Nw?=>Om{HvG35gPD^69G~y%|eX4@_;_=2aP9>^aY& zOTGR{iB3*xtjBf?axZW|Zk?Q7u|0Zotaq#!v!a(0>-4D1Xxm%cq*?|)&>qi5rydh- zu{*t4@zX2^Eu@ZeFFlWAYV(^e<7$w4CJZmaPB15O3*E%sBKqic&g5zz977D)rH|G6 z(6BCvJQGQENy_;>mO{4*czzvg+EQ~{w~G%sEUT`LMl!eG6`WUNCtZbEWkWV zDtE-&G;vhkGmiV&MEUK+ldSVn?jF6BFkft=M^G0Ls5AI-c{h&vzGwILdPpS1YqkBo zU8jRJ843Kjs+6*`PP`MS{0<7CmUvie_i`yo&W? zB_JS3ih$(MB{0$@-J!H}*T4kl_WOI@C(e7G^T&D5ALqT!AN%4G@0-2vz1LcMt&I-$w_nA<^Lzp#mi8Z8Y-zk2;$?AG+Hu>kx6{qzkLDVkNS(C`YDyyd z{N8fqwsqRhZjQqPgWWgji4PwK?ZzgJ8kf}G8Z0EGk5NoU2|%g62P|h0Uxyyd_w*u~!ds-XY>jeu ztm?dRmBQAel2lB4y_ohSwA_w_cITWY=EsYYzMuZ;4l))wX+~Ku43m?De+Pg2MkJ@! 
zjU@7?PZ%+zhH5C9J}5b=9;k^=dNOGD95fKV?1ci#)+dRA?&@yDO& z&bGCDAEz{kRo+~%ch83#cUKM0CEV;iD~Pz&;C`tqu1z2Gy)JuOgMUF=@=Y}_E{B$e zhS%qn1#M5+9JkLtwC(X2f2l~u*Yr~&f=F{k)G>XH`)08NAXg@{hr&9aoBC+#M3fz=``(A*<{}NXT4ny6;`zNa^0PZD7DqQ zV!CpFDQpR}e^h1*Vw5PjHvK2(qtg$WaA=ZhV^sMXo$!~WuX9?}LEfTcx6KrVT!ZHB zeo(xTMdVP=`AoL0)X?^vq4G_L*OrD4%qh=y#Gv<&6t4s;&48J9lewzLaE^6Oh|Qk{ z6S|9v;i>1*MfHT&lFhq0Jxlg$czf>1&u4$wpMC`++FH)Ih9jiG!W|gF4Yf1j_Lf`WBZ)Jv`{0#>2u?h(0D!!i7 z=Pc7F>v{awI;u?WcK&&uQ^)p|eS(&B?N^Hq;|kL{bV)6eCt=33e!lyjTTf9{H;w;F z;JmZa_6d*_qAmg>VP7VFV8QJ?w5A@C{G6}8wP;HFLHDtcNi3MLcT?rJSHug>SButW z+ZUs*qa8fcDH?88p6hLMFEw1ub zRX`p1mnFKno{QfJCi-y!GNEib**DdY{fMu4T0%DpbdVC+tn`z=7t}dibRKCxAu?d` z2yWvMDZQQyG(u8Ib0c{=tRTJuTDh%nhxpPiRISwGeo>eBL8i#&SIdtckEB)%U`?OK z_^7-b%BT>_Eg?Ck_mECNeh%hM7E7Qc~l7Ts5bb7I4r0ZbEO!Odn5 z7jb(XzSeZRmypmB(U%GG)YWChe;NoMhd3{bFlwpu8*Z<8%lKKAC4H&0mdr-8?3``7 zl+Y);dRio8Pi4!ITTo#mn?1=%tGFgItkxyEO>bVl{bt!feevP;Y(G%p@+(l`+oO!j zZBlW=%cLclnKYd$fcH~hIeD22R(j&`=ZKa*qxlP0kKZEd5xMUvFi(a|EH3ojW;#9H zIlH#|wc=8~aW!O5>=uwFL~kiK!P*)~LtYo~gYK7ZKn0bOlpXP0>|{$62T;-ifys5GP#7XxNTyPu@f# zr_GEQW0^?0_FcB@&2r=Vy$qry*ZH1cc^}>9Xna~ZSLyvcPzl89L-o$?`aPF-H_Nn2 zLy9}1`jpyHevtxi`&FkCL^z-cPi{WZBW}2sf1Fo0CoG}IamP~H;`Cdtldxx_`R8ei zRY^MYihDoLd7YC-syMA&j-nIQf7Eo;k(WRMJ6tuV^0dH=P1lyy2MaZvNW3_bkv12QDd7^`;l{lPV9tCZ$+%qB>dF+`mJlm9zGK`d14uYMxW~)F{HE=d;rQ3~u&1uX`Mf$}J)7FStUFWh`R9WL0^hO~=TqPK zqN=NgQ_0W5m47CR)6dmhRLt_bJvSgUOIlZ;!Wk)F2t*lu9TVo z%lGeQ%53&~Ua~_JI#|V3_B)+TUG#Xpi;7yF!yNq+NQR`9p_P|!DMu&sV4dXc`0C?U zwn~|zt!l;|v&7BGmak~~GHZnOee)h=CzZwz!evv8#k@g1d9I}6g~ij6_Y>jFQ_~A|OipN%k-~Lm`EbAi{C29FXc=YHw}81Pc#wM(J+r`| z{cGV5(Q=xGt<;EV#E%bJN*cWLE!Xiw@LLEQ@U2IQOJwe$Z4-%@XS(%7Mzg#pgOc7i ziR}cr8wk_dX(IB%^0)O<`1E%(BuwtTQ8yUQVYn}oAT2H(D9O19*0$fw3IeJuo_A$2 zp>v;4`8{5;zXI)S2);_%FSEeN)IWDN*iW>P_JQr}|Ja}2Ut(xAwsE>suI}4Gn%!G$ z$5W}qczR?ZsgyECDEl?jerxesAm|4xi1%1}szc(}SgTIszTnXnDEJ6!*ozfR%7qCP zTVpGLuEa-hk#(HN5+Xb|LD`YBI%+`jHof%u2H5Bj%oNbqB5?>f6~&_1oKz$42eO+K zKa}=ax`}M!K}NS8DnQA^rG%AHoI~{;&aVwkNiz&;Q$9n*5ld0{+a!d@aM6%;ZZar< 
zPwSv|tXC$rb32%obS9n-z4OOw*q=B?oGmZF5U<&N-{A^mj$)Ee_P1BIX3aGZ0Hb>0xhGn9l=VNe3;xHu zZULR9Qr^r;r@Nf=+wp&wuHA`Y^ty0HA~#uxUsbK2s^B1B;!ZIVcy|D%&%OeMdnN$g zp|3zcQZMGA7EVmv$MR*iW;ms+EUfDn&FP`8D8Y310UnxGkmft+ek6|mOUmWNXCTJ| zcty*;9h>SO26XxmIc@__uXj~cIZla|qS`=!iy$1CTkZ5-YA|IH-SlXxHeYFHpxg>v zPMzo0Gbn7o!OR=R)af02UtT|9WWmz&RsA#mwg#z(c3VUGeuBjTeZTp~DH6L&Y`>k= zJ~}H=`kWl%taRbSJ^_Tu1Bt3VH~8ID^@$}rfS%3r0f8egKi025DxhN=u*HpgCpAg- zJ{TBu_C)-)ZY4ZZIdqA3ny5w!Wu;a;BnB!X76qRv9JWc$m2Mh?->9b}7U(Xo%NwuE z17X$xYQ7MfN^=EDak>IsL;Ue$0Pke1ForJdmWmM00uG%G+6}1X6^K#ZZ9T(U?e;nc zjT=xT2liW6_X>pcMja!u*Xe)`P;CK{Tuv?? zG@&XH(HRq=HTluNvA+a7BfyeB>LHMlb$?gPgranXUI59h2V8!F3_Hck^(lm~F}D-o zm?OTWoar2X;A1f!>&x^yt_0%_M_YCQ3R-{k?-b;gfEcO+hyZ#l%b+=9+&QvZR#iar zPLlc@ATe#hozfMeUv-9K0Zz6kbQ&f<629HyfZYb1Hh>L#Ynu}f_-}$lma2hT8%8vi zXV8Sav)NFT(ng`5eOmLMmQ@(`+0T>< z2%c!DOCCr7w#UK@uvN<_umyA;@W8L=9s=zZr_rTACPpS?1@{>V_(ZE{wShel(fk8< zz3=6li1>&2(UXY9&j{LnKqY)%+{xp7CHP`k_sAke=8I&YvDFA7OTT@8c|&jx<0x1ffxrg zKF+G(?%?5@ojJwPa9X(D`+?Q$h3D()0YXJ?5Rrjc-X>nqTc!c~BdOwiCdDc6;9>J^~!F|y%XF3xKJ zk@5G)*etXDH60(S5*7W-s-{%($7Z}Z1ic2#NnW>n{9(4`bDSBl5j1d@5mlVWKkp&_ z{R8QoW?)^{J!0%NSl5GifP#Vd5Al6?L4BfaX4~~lt|5F7+-*eNrRz=0EFd6Xp#Eb* zp@qu2sAu$XM&;|6HaPxnHl(s3`s{;y3?D>~#_jZicW>)HAVdDek!1KggUVO{kp%=u zIldSFF#HklV6x;UurMlZ$PB!j1E*h=Cy2e_gumOe_a&@bvI=AWZ!ER)KZFk;zg{3- zJ2AF=xP05#l|Cv2dUWr{Dr9d`5Gx!P9Wit^n0oPdt6{%I_3fjN<$aa?dp1o-KHhWu zhb0&H(N+*9#IUs*HXkT)b04wv(GPI>KFJU{0whB?aEk&N)i0~y03$&zBR>9%kq!`l z|2AqvBtJk0aY&13=Rx=5gQcw3{;nMErKNben}!g^d_a6lRs81!iT~>(0KJjtt_WU| zZ)SZ2q^19oJY>7p7Tdq~1pmbf@~^XFMp%KF*c<|mjOKU0ndAObmL zlg(H3lqka1zX!g>rS-3k>^?(P1vFdV45d)`VNrNjEeqMN1gI=^QDjzP>Jt9XQCM~V zV0m-&cVXXN{~pc2O2j`TEe)+Ih*Gu2n6|Tj2EKmTmgUU#P-{?3UD|pUj^}FW04~It zPgfw|o3cQw`(A%VFcWwN*_mnPF;NXfL;dBTQSqIrqY(59MsSah}TF4g5#}OH*v>%-S{9q0hHA~y*f_<|La!C)#Vp1yGK)VA9b!k z##MsArh`ujx&I|RmlH7}UhQve8e1}Iq}Hr@u^!{_gP*1X$A0a_oAs;~GzehEU%-qr zDSv0g-@&W@Yfn15&t4sxGIb#fyi-}cQV6HewwgP=%LhO5kp-ZDa~fCw79)&#By@78 z{~~2R_&>d<9nhb7|I?ck!wRb*=(Zcc)w=x{L9cic4*d$(8?>l27vPeangz6!y4UcZ 
zEvt}2aHZ2f8+sox6V-7fHzHeD;3&iBoeM$l>Od-|qQ0aIgc*!g$`RFnO!x8sopeqI zxGW~I|L2JTf%~&X=vcb7cr)r~g|u=J5cAcQ0Y6&=6NCy3$M9#G)GO>+9xYSAx%^7~ zOKMl3aq_Hx?uf|lo&Mzy`45XW=Ulp;qmcbZmkc{I#Rev#m$7-)!s)L?VXfU|^8UW^ zYV}nQ6Y8We@h=wnGKFbHV!@7*?9>lN;+DO>qR3AhEMW|jTf{F2k1zFd57u>Ns!iq~-<%GRW?xdy zCV5ed&>b~yr_M}?0M;$cl*2^rPrn&(R(s@N%97Zz(rCAvV@NC_jtFyTx+xEtGbr2M z1OMvSb zA@1GDt(m@+04V$Y!h@;~@@tyk=90P(!Uho!&h2>cbDigKXN1|(V`$TX7+Sf6E{(~F zU+qnOT-bcUQT~&IIHNkNIfSF!b_NhHZ+ZoihwlD_yPp&PYYHl$!=s97_2kcWbom8d z4>&zCv4|n@TJ#R(aolM0U?hC*6R;rgjx@LRc_s;QTgF=@`mMb_B{>O@>`s>73l#BQ zE}yolFiRP5s7t2jz~}y_7ihq3{whpH27CN8&V;0px}_Hb2gnZB9#*)~eUlZO*)|~l zkbYmX&h`WwF(J5+_kROWA_6G4b;{WHIyGI3F1U}1bC(2)a!BWANp4wqup(5>i+J$A z{(dy7+}S_t_Yf#-N#Hx%+3iy^^2MM9}iy+!5jc>G0+AP=){0yy9hZ zaPus}S6T;87h4KgAs!2fz!7NGqqR!ZFr8~iHCG6h_qR=%iH4PFlT3zU&RlT6jhUUV z5ieY4{R)KEHJXxdsUe((qP4}__}@g6ZNq^;nzojpFHLm{#8s>g2H*4jR-kQgs>=vX?#U@rMcEIzvnSRZ~ zfco$UToKDe)(@lXb4R@biJ`lc(Y)(UR9$$`79m`n|ElIcrm+~v;AM0f7TUK(16=QsjX zI5WRCK@6&x51AyqQYKMUci-NIC_LnA4}qdiS{)^=qn4Ipa%yUW*(GeG;nC1gf4Ced zlQ`!a_pELPdOb2$tnoM_pTok!hOID#KGc>r@dycxF^UDcldX55s)biNl5eNSQ+QGM z?(b4%p;h@>cR|Q&$VuV*i1M)$D95smv2aln(1I6nHHAZbf5zuTYspI$9UpJ-x~C&D zVQJyI^6rIxXNM=aA@{M$F#uuK-AgqL*JG`aK`~>(J)nFYPamH`*??ZeAlM2IprzeN zwaMKe==6L0i+9IqE~W#;G-)MiDEj@84!)O|?y#MG0&~LP-~$4<{a zb$KE|H|}NM+ISx6m|Zivd+*bHPrQPjTf=6CpFTJt-hnI7>xFkxnm-VskJzbe^b&Q_$(oHe2pa7hK=UHwU!Sc$86%W2 zm};3?OgHS%`RO8%A&&Xh%zzNT^ni<8J`0vG&4h*MJrG<_w!rs#B$^b)eGMGWg*e#`|ZplRUeW=Uknia z@LvNofG`&vudGBOrt_1h*a={%oJgGR3aGvT%Stzq`%19MhUFkrC#Tzf#*$VhFtXpg zeFPqDDF`!)x&Oq;7q{MXXj!1&ES0e9)Qu-GCyA3_oQrh-u5>R!8ML}+G4?xNXRKM0 z!}+4OdlRnY_c=7L`!s)#$U-_Yg`i%hsHKhdQV;&CT+BZ%67bh2fTJ7~UHC*u5exkJx3d+rNZd4da5W%>B1wek$|HDnEJW-R(2^@aAw# zN0!FK;LFdeWURMDf5wEiBcIJr&GQfSZ$B3M86bn7-?)n&jsMn`pJ++z3$kDNmXqd5 z^&^I(;)fm+=hUx-TFnr5O&Y9=5?`WS3|Ifpq?Rpbx5RlI?@-g@MSqs|jk0jA2F9Ny zQvL+UY1dDZW`NEd0*QXTt4ODX{Db%`-QR|yVAo~!|^0oE}$ zGq$#=c|+f_iM5nZM6D+S=_7fvul?^dnuxoNY$^_k61s#^RJa)Nhf64}tghzECgk#4 
zj^6S{!X(c){3Qs$_-eLRPfZ@)_Z&Vv?t9>+e_0G2K1%A;$kwwGlS;!_{qYltY>*%< zmHu}3Q_bzo$a__Hc$Lr=XCa(kgJM^peCTA;-R8NokSkF940JmjNrId_y7%Tf^x0wN z-30FtO`WR|oWJh)%iqdOuoFZjsIOq?>lEwy;XW=DOz>hHxgrU)Hs*tueAiBk!5UO>GZ9Pu`ha~8@q9>hfa1& z;zUMdwuNo-gx|WoF2VM4prrQrd8g}^9-KH-Tk-ZO-6e`8qOGR`1ilkHW(r^4C$-HK zt4{0Iub>7Q_tk^f8X+&_cHw3pTsRME^TH<~@kgoal!`Q%S<=z%KLK_!0jUA)!Fc}l za4=5M7)T3n*j|LPqQM3SdfWUn43A8LmZQXTcy!%7bZg}N|aGBGoE6}L1 z&nfzxig|`JPyLrEG<9HUjbwDlf~nY6um@ul$%Fh6w-NZ?(=bUVh&J`)>$xb695KY` zsVzxRUJ>=dxaqz*SfaUCyf2M05`qOwaG1U($YT=6?J?y!5gv1 z?z+EaKQ5jS*oa9873CDkjT2aiHel^Q;|Dg^Gz(}OPV--9WEp|nX@6&_cp8CtzC`|S zF?a`BfY0r@3smPA1I93WGgs5tvUgXUEr7^c0E%t1=UUeOi&JUr6mB#iEav{7lENbU z$&x~3qvQGy;nB2DdhUfm8+N8`itUhUh^%<``CS0b@+Y4Pfz{!M)euWENr=sl@JkT# z7INz7zIagng2f>pP7pa>=XN!U3&!a52Z=s$czSIh@w>~VN0~1SmaO`E?Cb%@+8iH* zJ@dW4eL)bcC-hNbpUpVy%Hx(V#Y3x#oTcR^RM7;T{vY2$AC=^3OP8##dpU6@5Qy}z zeemD`6@$Mq(Zz6Od9SA~n~$uMhcTI+jxil~7ywO;7T?Btv zyGse`f?l3hTU)xAJ2jW-OuWtD&>eX_iriglh}~35Vs(_1%?ul!a=}QV*{6PFvP{0k z(0`bFX;{F!t)8 zV(sEy$GARh!AbT_LfxREV%{$2c6MB0-#CG@m^5<{S%hv_u>llK_(05ABUqq-E;@4UPD*G|CQ<`M4;z}m+V53c5S>qx+MBq zG5#oFS!(4sYT4u2tU9oYLj4$}HCYN=_9rmpr_7xw0CAyP`u*w@>43|TM2A6fE@8m0 zj1G24alst+TcV}YSXHRz_Nl)nP&*|czRWCD@J{qg<6h>Dk#_{o5>$itgrKCEM5LoH z+lq<^I3ZamN`~u>kD#JVt?_VNQOjO|4$=%ynp6tKL`0O z)E(^2O#^!?tGpRy>6mw$^=$(k!L|E3`!63>DPFfrQ05+1<}?nj=cAkBy4Ox>TNwby6 zPK!`I&kMqDUumoVgO1gN z6&UZlVmx8`lO=bZR_p(zp<_5f`~5?aRD$z$kW^$zO0uhv(t6CN6M|OmgJRMDm|gz= zIY<2uYA^VAYDdv@y^Yemyo6lxZ-3_p>VgKfr(S^?hlUTmE{e&t6vF7Fa*%avvUI$A z1{!WU{7)m*4fMA4Kd5t7%U;K2d!yrBAK$z^ggdBNo@Sreg+LIiUl*=GQ@nuR;H1~S z3p!+kbGvs1a>)zm!HP&btZXfOhXRSUv96=Wb8{v;UaQe}W~a%014SAuENhw=NN>Za z?(>jaQw25=1RqA;Pz27g^R(_OA7)sdU!@P&w)n6uD zRS;JmT$cpJy_w?m&9ke!K^$o_tK{`SC`WV2sLjJDoiSSKg(_l7tJt28o`Myf340uG zi7E1>d<-kYK1@D(Nxiu6;Om#w;Hsn_@?E?`mXUj8G-n^)@3zyepOzgjxuuE@JUJ5% z_VL);kc*X7ta1LMqY*5F&8N&17V_7@#%TbO{?Aa=|3vIOUu8dnGV#@88Zvf|L_w2c zWMYyra$SLpyPCC80}i|Aid%Lq1qMgdK>Pm6k@T36su27{#IeFV{V@}P^SF@8{URpo`HSk 
zqg}7X^rY;I5$}xaW_W9Tn|Du>Q|<4a<+{+7R&%^fR9lu9_NjUZ0+)K#WbKk1T!D02 znu6t-V~efLIGyM1r2M0&^y7QEACPsGxWsxA)CCU&82f3Wr&dsrbsAc z0qtOZUg@6-2xK*wvl`jcvGn)690FdK{tPCcn&{M^ycuc9g#~Y~KxKt9W!*VJKW)uF zex%Wkjpa8)>aC9y7mS3^YMEGj{k>a9KkKQlVL-eLO2k}@}5jy9>N zEp14lSjpzvY}yB1Qo9#6*VN8W^YSCLM*p$Gz~s1Wgbe}+5U+9yvN{~}=EYKo#D|KzCVeL*yp zwQbm4^7`S9Jp-PU=5SjF{R<<%JuL5KeX#z%?B2**n-723l6Pxur^bhP`0J1BY&Nb3 zWL|+<#ZcxD5*t~2ZnZQ0JOe%@8=7p{EHLiCM60_(3pfV#d^dt>yytiFyu{OVPu z9s<`Ne(gNk-lHSR-~HLqU{QjJi6Btyj155#CBO@Ky zu6aGKA?CvPoQ938TPqVS^b|1sNH+8t5jOL3QJwJ{{&zTmnN-Sl;$q7H#{_5mp&@@^xbVRedfSHOK=I64-h~< z!;^^|9BWT?|fajCJVHFm!+z}D8@*u zzy9;MnC);YtbP9cyC=W2OoBhmX#xvjkq=;)w{MX{NIJX2a}t-zK=_6;UPQ|jzn{u0mS((#E%7+ zRs%sl-@DUr2#b2IEZV4$arf?Q1jv#BM$usIsnhU+qgF`ig>F{tTL_w;IJ96wUyvFr z(o#K^@xyjkOk$5K;&B##PPCHCoa-?0?yeu;kGy{Sa3MN?HJ}fGN1KRCUfLLSFek^; z!fG|wenyC`Ds8a<#`cplWImn}YmN3kTxq38-I8UtvoExT@mV-4^zIhOk+n)&e_B(E z4sIp7rSd_u8*-orzYuYvx&lobKLc!Kv<=~bP8(27_IwxU$V|Oygj?^B@ztpvLptmg zRAjD-!nB?*eGO?fP2k~0Po35+Ix%c4+>&Pbf?jAJ%NzV$W-zxfrxNyHWH6{38q150 ztQtP`f2KYz7R6%7T{+s6Rr~W3FR1>tK|lFAd7pyZZT9*eYA3Ed$O*#!xfs%vA@ z_PMp*b7Ph}=#v`|pOIB|O zcXF4$LrPaeizLRwkr0F&{6InTf-8#(x+4bI$zRw5B~j=HKtAXdC<wTQtkmyLH;V@4fl*97`XJSFvu_#1PoQFLmewWJ}}1{bcClmOn|hq$yO97IK@j) z!COC-C9(ykLL7a{z&>Y}pbN}d9=hACBCquriEI;UCnohB^&f#?>q{_jN=2Crqj=kVRk(vfY4T)<;xfi#uxVdOPZL7>NH8> zJ8ADyn?{**DZ_VtgDaK~Q{TMud{8ejzhi(&!&Tw{3vG3xg$daF8{j_^SU`_y)qt)Z zV}S@I6Hx?C+<2IHAC9Hj?h4>V2Vr=;9(M*jns9V4H9t~UGcoDCwHg53?@#)U~y=B;Ginc-|UgsE4WKkgX)nxNEh z8Y5FFiM&6ISge+1Pd<}qQa$K8`aQ7#mdX5<&}6&C;l<#6_Pd$@6ATx`&$jA@v105iiW`PJR@3y!AHhSr`ZCsw9 zWEs#wOQpb1^6}l%7#7|4nzr6{n3W{()=g<3 z#Y^dS;1gPc(UJcJ^Xadm1^eu$Au-Ai-QGZKfK3xk9e~YwLZ5Yd0zDxv!&c$8*bC^b z@Vn4w-8y*&DiIV0a=g12aUmOO%@5?izIdn)ViX;(;WCC~As`ii19J(CozgANcQfjBji+N1foLwZ;c?T!Pm)BfPXbL3&=5yq_0!A zzZa=8Z6c8jZ*17f2)&P?7rn(l^x1`)+EYltZGPpXG&EL{W9D`rkZxw9{X>*wbQ}?M8y)9gwkYXOoNQk;x4Y{SVf0^4kW-wYmHk)wPxH$V}cH z3GvL~G&Bd|v`jO zq9kwJE=i=^TAZ#<)6MkQRMW4_r_fM-Ph7lLO7fqX(VcH5@4mj#fUyxz)_2#%J$QId 
z37ekEL-T$n3zbfk9Ks#dbjB2AaMQ-Y9^?v^K1lGH$w;ErGG#h7lc#+_@SlhJspyuI znH+@}3$wQv88av%M&F~*WCA8i;L#~;q;mqKrrPn)^hdzv(vtrLFFbwv?x!;`XPVLR z6)3WW3r_4OaR`Y}Qn06xS~eI}m_~B7QHgB*lw&(KUizleY@-!=WGkq@ZaN8Gb3Cg8fBK5%2-u206NZ7={Zl zL~Jzhu3~ksKnaeh66i^_|5OkFXq6e*pA}w#zA9n<08qDEv+FYEN{bsXE?t!Y)J;vF)19ed#cLfTfb@=drxi# zlJWLU(U#T^E3ZV=~gCtMv@;&fJJh8&WSkZQ7O%O z4zbuq_reb#T9Y&JLfH`i5bnLEB0D6cgZG-b?EDo#oiguFWdPVz;njj4!T3p z^Ic$u=zzX=u+KK>Bj9$b*p$gC^{NG8xRw0Jwm~K*M>3ZvB7V}g>A@04KQ#In{1D``m_|} zkC@&&aDz=;ABx116)k8MWs%rLSULH>eUfQxo2{H{coQ?kXr4F3KYnaFXX3TP(XRj2 zxVdEimuMP7bY#=Y3%%kz(xLFEc}z1m%Ef+gKrA3%e$?B|(ixIZe(j3|=jZidKDP{xu0A z_9xfvF&+m65uzwYG+gn}BjWaFjmaJU8Zu?#$BT_IR?&xBB|L-y0Ok%}u|I1)XwC}0 z0@;Bh@a*N`PaGJ+_pKxEVZNoroP8*I%tO6M*&c}x!xL^-dI90E4OT^Jo%2|k$yeqY zN*Gaw7eQzuI_x@rBJ*MS;r=eqsU2+2dnt<+ zT>2hI0_Y!-Z%!W9r;%8o-HSDPR_wtw#wLR##N$ff;bG#Vs-+kN>>&U9S9C%9@#mK? zh3~cFeAasU4__MfRCX*G(A|6aq?L%(l_KB}KL`YbslX|`vfA_)Zb5Awj19jZ3WmS1 z6W-KMxwf!_vc>%o`195~ui+DoGw&R1q1Ui9+?}Xk@ypPhY}>(H!!dVm!!rK6A3}`7 zt)8oo+>mg0X>h&ynu%}_x|ocDh&hEKs;?Kc^Y$lRfo@K9P)};Nb3g>xRCs>vzJ4++ z8lpxKbnK)Cbe9={bPqgg*GID*5^9c*u+PHDx8^#6fnx1FeciZ8g0OnzD)`L1v3?9M z<0sNwbwjxn3(xMoWm?vYARxqF9}R?l6-3{+-eoJsUPse6!rlu6^e;QHn8+sDNXo_m*%8R8nQtm2{A;pTJJmNp`#9lG~<=(>PBjm%^f@cWP%obgP7O1;S@k?bGj># z{ZN<@CDIlnF}qE@=~<`M`|Z)^T!AlHHqY-oQ!+~=w#77LgMVpKBU&GzEY&UA6Kt6U zq6#;pBBD!-YHEu;rMI~xUPt~>c9^y<3ZLQ#RMf)a(cN-%tdfVz|Ha8 z%)v8|K5xGV8K1BNXE0j_pLmxiLGrT%3V*~Ehynlyt}6q9_YffTY>5wrje!ed4G#)- zgAUs}2UfBoGD#2hOH0M7AH80*5pKnVtJ1!#01BJcA1U}BTn8ca7p~*)*)A})(0Lgz zJ0{ba9a4U#Sh>qu;zm_#(<&X$TGU6f9f{}10>RfD8Z~j3BrgFiaUHppizjK$Mc1{X zDzn0xper(5=knvC+ut|j^RQqism?$9+S+oAg!Lh=kLk_Hjtx3=m$t_Y8?Pj?eOPZ zfj0fb(181zsQC(%*n3cb^RfdpW5s&`eTY^)#Ogp35MONO00?f?rEGAo6PjziokDy? 
zW~vJs%3PEa2S2czd%i!)8!zd_Iiyvor|mK*{CFxgpksli20HWktZtOwPs6lpbnDWI z1pQ7j;0d!y-(8rqmlj2zA_vU~q0bZ${;Du^!S`Dd@XZ7O0KE!@HwsG!Tg>%)gv(Lv zU=x_{{OW~Yz&!IQyxI7X=-%x5_lGkN0j1$10q!yE7$BnScIXjEoUG&~5RWWc=r;!j z%?g_kNfK+LNH`-E&vPRzSP@>4IA=5oMt>vV4ldH6MF#!llzqExmhbFx^Y!(?ynMxq zU?5kdqyaZy)U26yJ%AG5y#RJc%_7fQ#!eA*3RBK(+6a1d(4gTpdBOAwYOi)REjmqI zB9Cma5w2X86cPx8&qIC)-u&HmcT#ZAz0}%Z$^%NF7&Ba5aPz^&X8EDM7(lA zhbD%?03gS(5I1*{&r>0(7&K*AsuU7tlb)mbr1EC5V$}!^H0a0SY>^om~<*kp4g5OlMc;zOCVoHhMLlidCT|K#W5Eu+(zRf=<~ zx`z1NcyJX5l^jG2(aIL^$N@ozy*9DF1cX%Oj7$v5zM^O8nUYb~q6uNU*05)P&S5%v zvOr@5lmH$0Nd6@=oS+(@#C{}S7(C&%8_^7rPD#M0Pjz&f)aV!jq@Mhen0U%n%6 zD9-WkL}l#-(VB&oL~X+X>Dj6V*yktG08g8o#Ni$p4HKVeVi>MKy~Tba!0C`f28u6$ zU3!;}=v_xI;OlJwW^$}UZup)F0{o?q214FnF+xAXj@tpcl>mMOfzUSe17<~yDBfaoE%fXf4fsDJ~bYVt^UlZVVv$3b6 z)yBg4dxwepj~D~vxfC{80+k3kmjj_ZS0Kwc^gTU%jX)t1KjGtkmPk=j(V4%I>{%ZC z_9mCIb=t45R!@FBS5V;QOm~6H;YT3{s+xhyK*%yX2U^A+v4o_u@p13Uv1e+@Jexe2 z?XtA7QcX@8?#&6WCHVWdF_Mf{{C6rFCXv_H2`Xb4 z*YOSUX)8^N7$ZchinLDl){B>$Qfrx?Wxj5DoZ;!c^!R5VQL~Mq;KcO75F2o=as58Qp zE6_6){0lrWqWARy6c^3xiH}pkBccEG0T5nMniz@x-Y&~G=8hmRF)>01S=#%{8b4`V zRQHH+3a{r>YI}Ms?$g8T&OexO2WY@HA__;_62Ipc6O3anD=tK5UfvO#NKOSJNj)@g z4}FAKD2qm9rJfV;iN~bWw{-duy*5fgnF6+l9BDGh7?JB*VqI=MQjS<_jx);|zUzvsa!+jwt=l z_kA>{b$7FIdlu~}I54fWH}Tn_UiD>YDs8l8e34Vt5hnV8FfT&3}V0kP1jibkY^*_GkEChcMxk<4gQ8VVLk0$Rl;7r^c4P zIL}Z8V_DsBFd!2}+N8VW^(`?^bGFb?bsD>k-g$Tk)rIZ=xN~;ok<^)HT>K8Ypy)cL zMGv_CTCx?Og<>ny0F)7T{s6wDxGXvGc%=x470m)Ne!n@U>#A$BC$TV}pUz+0SJkf4 z71R-vo4NZ5Sr($qJN%b7Z)4&v_W|k8?~ToxY$j}l9xk~6$u?J@Gx;(T_`x8S_%MEE zM4oOn6a5#a`OhQt!+G9Gei|sIh*(U7x$I9MZ^JhWmOsx=rN`z+tdC8CHzxVpbf;utYX}Ht>PR0FTH!IPpk8aQli=a6^($?jYcw_5h`*eIGvnuF zIi;X&jBhP(cAJdv?#^*j7T(x?*C#SGKwY?d?9aPXg#Cs?VYLOM?U#@@bKLO*V|gu! 
zS@AJDpJ}3TYxKKwfxnJi4l>ym1FAY0-=0PNL7Z(hV&4QLE_Yd+$})6{o~FHkpB)3b zTd$+vqblp1;%nx-IcU|T(8Gm}EJ`m`s!BVl_xFYSent)}`1PhdAiHL+K*-j!>dspG zZDei>=5Y{H(di<5dpiVn8{7jsegAOS{WqJRGWU@~-h~L-JYPp&!e)8)+=`oSoAxu7 zv*l=cQ1dQI74gb*A;+sbGo|()^y9l~?CR9!{tjsA1k7YnvzvmCN@(HkWLgO0y#y`* z6~&F;q*`ms*vCg%_;`*Jt}h8*=3dJQ@`|aET_th?Kj`l$y3*1FP19&kz4!Av)bZj~ zKR`tg_QcPkV|z5$UVS(2VY{c5K>RaW(Fp&t-&v%~+#0D zIYsTAX&UF?f@rt_^FbyTtuqodwZ~}#rQp@cwC8FVPmnVxG*?%Gb;V&;B>X2!W?!3h z+^=#i&gdt5Pj0g^B2z}qzHl_ogiI$ks>Xi z(nRS^T2w?jh;#@Epj0UWB1#KLuR*Hx-cdRTC>?162{n-7d4GHFIkWeknQLawH~YKJ zxvu?RLIQbt-Y3sm>t6SLFXDZo!*UjjMTJnOri=CnAb$f{FQ#d2E3CxCKQt0+>SzDP zvvw?4d0Z@VX4!W_mDG}uMZ8i8VQ3W#a+$URQm>Dw9FYy^%Q$58BJ<2d_ciGG9j0I= z+tvW$F`zt{0NPhP{>swjp#W=n8_1t$<9ujh3qSKz^73(TeQadgm|y=eL&&wgr*bgI zR#i}(Q+m&d^-LzH+TTGqYKAeL`WXaaES= z+4UAz_EO(pWgPqJqq#IM*ujRtyA|7(vQ@95CBofQ$zYz7kZRGJCqARHv!^%d@9?@` zOC`HtXc~R61`&Q)TbWIdGsieAGXr}pj!4yvrs&QT7PHcBJ_~Cb61^&PT_8;TP(%3X zuFnS9OJM5I#{Oy~o>yUzZTj|oJS{(@s9&d8;f=GS+D zuQGeA#7r&Fs@UJQ1HU#OMN%S`fT7|e3HwvcE^ZD>DngQn;yyQ$ey+bgJ}7saeP4N$ z@{s~JNpb+xHN|M^ZYYoz#a(T;yz44z^t5%_;}XA^HO+-h1cLJ8Bo`rET;b1{z^4&e)O?ibo)j`3*pPKa8M(Jd+?tg|~ud_nR6n%@url zK?-%25yFjs1`CVp^`~A6Ieju5)8aL(Th%LrR61>UgT>f5Uk;RVX(*o)* zs*djE>$ac1P*Z4fZMU-#J0Q=Hb6v}RyX;G1GSIQwAa=+VcK_b;oFT%;M$u;S!q)W>5 z&_YPfBAx-c;vT&+w9iQVGDO?3Owe4>nYiQf{@1>D`n|AYou=>}{J-BK0P%l|FF0W1W;Jd5c`rd6rrb;WmBl0$e zwnrWjev4PzoEJe(b)!vlj*78xp4Jp~5gImiX8P14y62y?jsv%q|Hyt836;u^%LI^+ zutm+Ne%`d-AoHe(g-els5T(Iw$X42|BPQG~j+NNk%1B6t@p`dYS5&VMJfA2|mob(- z)ku1GG8mxEE2^hI{L4>H!4!m@AyTuWa}8&RTv+*)y$>|W65$f=H+S#yB=Ocvk=Z|k zc~dJ3Wi8ml2*9VmBjmzZ(WmAZNXI^x^P8<3D>}QV8mrg9WbWEY=dmG@#LA+B3uWn1 zH0q=oq&UI~ih7ev4!w}S3}&3wDB5Ccsd?vDSABWx3r&gX+l1!`Dq;}A(F4ne>Y=0F z@l%}ng}Z5c1C6g@j?`^x&~-EVqOT-BY{WViD6;?X(o!)*3Pg+%N4frT?fe9i~ZJ?`{HxhjGDX@1sXH;b;j(RQPYaNREJX|~_ zS>N2Ik!HRN5qiTRMOI->M!XOC^$_=bTT_DOyFYLDi}~&jyO^MqYf<_fN?+DO0hcYp zMZ#O4$V;A!PpoiiRVCzAxN*ALTSYbBc-Fbjmq=^ATTcb6%89sd=wZ%j{UVR_gm-B_ zp%xqirD;~e&;iXk^!EksD&6%7!3=91GUKN`jT(2!nuDNwvE-<;9Fp$V`8 
z9)W&;CE4S_pngHrvp0wS%0*LvwL;Pk@@B%LTo%y}!1Icy-E1kx^`Y1O-6h-w;@awt za^5DH*(=CDihVg6J4$h^;_@5BRS7gHWr??2fl4dv9bzB@ySionN4-1q!*_dt%<(&u z2F3P8SRu{>CW)!K7fJ1UNb+ z5Nijp8^1xeifzQi0caKS@7V}0%lqH7PB;k#tyIMERxmakBiQM=WqCQSAeZq}x5Vj` zpKE1yH8$Y6wa)dSD+DBwoYWA6)A)9$T)d zP4S@?={)4?8QiVQGJ~dOytxP;8X=Q6&~fo%8=~nS>yIZD;iX~XMyqoGMSYK4945Ip zjjj{S0J~cGnR;l+CS6N4JHx=n0>18#G|M zt#mUX)#HUDDdv<&t@9hiQ{Opq4#f6ZeO zPGUx3r1t2ScOJOAG3*<^JF`8R9vReqRZpMsL-&eXuNJI;gXoL0}v$ zkOrJ9&m;qH0{*H!T>`j9KVF%>41XQH?c$pzmlEU_#Je~FYLlf9cTg8sw>xGt9#b5mL*~yL1)z81VTNJU8Mt2X7!p!s_qrfajZ6yAJ z7Ipw$aS5K$3Ld+jAn>qtOwL43{|oOY?LM6F$F{Pzx%#CHXbp3_2NO)@?wz7V%Ybv^ z1}ff3x2Sisuen|h8?YxHBvArH17hzYT2CT=feA}k9$^Er$ZkcgNM*&5Vif+qs|O6X zsfJU2rsTBlN}>8}1$6Nu2tG<2hF%g=CBsvpBfFG*pb*`OF;t#mg53G)X}vV~G~ZVi z4{Wi&hgD{dqoE^(yL~unIFN6#BAk%Su)A2FWtQUyq^1uPEN$dU3+Gucx5Qo$%6fDG9n#Psr=b1lLL81o7w z5(>i8Eg_iaUW++sjz6hwsb}buj5ie&X%+BxZ!wKR0Zf_s^b=M>AzX$yp?U)wjo#=q z+)g9Ur%kJyc~IhX((|PyLrUk5D|qvGG~UF4mBWDI8x*ZK5(lPGzK-Mdmr%nvBq+;# z-1glsVJ)k@{N(HOPww;ZE^%yrijCh58fkKMvgaDX4FDp}iBkp+QUkODk&z8R$ib+e zxkS!5oHJtdB^{CW2uZvTJb(Sy&$H>aN;R+>X#MWA^1ATl#VFNlQ?ep`2fJmzYD-04 zM*4o7kqsU{MqvtXcc!) zP&$NU#3PA0)N&+{mq5VS`HoLc=Uman>FGNuQ#G^0h5^z7u+Q$cnrH86gp;9#(0ch_ z8U^sF^UD#{+0Z*5v8!tF+DEvda630Y_YKe5hS_(fi-&xn7C9qiQj8>PP_&K+~cuyMG1Lzjv9m*Zw9l%;2MgZWN>3RrNRYy_!) 
z0@LuC``FbqUA)LbdceNqhkxfm3M6(Ng^0}~)b`=YEP%!tVu8})(JjQ$O;jf|907bU z=Bc}&SCvIp#2nf&R$J%KKH2+u2~B&~&(cVjD!J*_J!_{3p_C|YA_sPHSp~Xs8set^^7`j26{lxYH|LAy#$B8~J{r?_ zaiGB8erA$5Q|HtMMl1oj1PAQuiWLy=Lmw+YUQYkO_wJI$q*-7}#?$pD9|RJw1!srT zU1h-Xee}+z`-E63g4QBKYert3A}+w_$}C+}xiQMo*Gfbk9*%eRQaog}_aC4ZEn^gR zxUS9z(nKi!0Z4;s_M80xn4ZXEk+IcES=~4-zD|1jwS8>94;Oq5D ztIJ3(w`7~P-f|ixz5DKDQwW;!0`}RCLyJX_lrL$xJ0utoRsSu6Yp!hJiKQnV|8vJm>v>YRZX?WzDa?5)qwN4_t@A|r0Uf(c2k-gwu+!y#1 z39viC0MJ~#jX3Td0d$C~C^nQm_BSBn8fMC$K_92L*!s|6aE+^a%Z9?_8jCS~D$1V147U27)l@7RVe zA+czH@=);`gyWf!lfb&}5B&zk#(xEv7~H3SFZLoU0N)EJ0Hd=nrIo5!IG6K-Oc2hg z`agX~P7y^#Pq|VAG7#hJKeP*Y-D8CEWX`^0dcJ3br&=vJLNw?46Ph5BgD(}BY+W3hAS3BX@BS5IL zTrh0l!zOsg(%oh^d_#|&c?~ldbyudwh;+AJ({AnbE;swB2U>Mj<0g!t@WP`{X?Pk!VY9WeIcB4mx&Poy z&b;q_H2)J{ghNsN^ISpao1dCOAT3#WGrtNWRrxLT(c&X3lP5|1RsIgR7rHjfYqLzB zJ|x3IqPJHSNM-SR-?z<~>yvGN3?5orYaa}WMtU_hhz@rj=m_$i%ATQK&AJw^N-3nh zw&15r_e1yzL(DKuH{)DsuTtu4$!#3m8^fI;zr#a_9bAkyUMs%ED<8sY0?CoJqy_tn zHg!|>?x87$E$AtXZ?$K=Mb@RzosSQguesYufPi4nu;vIIA=%}uhb zdiNP|!NaG421N99whqS+U|Z#~wQfYpXvD10^y+MZ9heS&wyk;{!bSUugii{YN$yhRxmHiA+p&Kv2_mY5)B<_aw+6On<-=|6F_meFb$j+ zk0XgONvQ6@)aC7=uaI*iDfCOap&pBX0R7(}?Zk5pC=YQ?bO4?9<4wxrQmZRkPnggy zJ0;kSlZE#k=&&D=!Is|;t5xPp-!B8*SG*v0j3Bo1)N0u}5PG<$)q>~1Xk${lz_k7$ zdxwIb;&1s?IsonALQyT>r()O{UYvnH2VR4Kgas@sj0ZR%wbHhyMIJXktSFQ6^I(YO zI7_}ArMC96$6xE$^;PgLl-y;3{%tQ?36{9_*jToBo=)8x zBhC|rCxZUx&@@PReEj8MGp{ax=VMu~!XXKh277D)g7akNGFiyW-T>6r)ssu;F@Uo9 zYc_kKDKa=PsN4V(_EGB~EbCVLYe*cN6mNxvEl^;;tRN$%M+aQ4Wum@so#@^YT^TFn znfjKT+)A!Bu=X;In2*1>jKbbgrqA}*724385EErxv5DIEVCenIT=&xm!A77121UvY z6&V#A6`i%}AWr(v;1>tayx2ZRmp^KN19e5a^3IBR)b#ucw2O}FqNdv&v$mFJKMu32 zD>E70ju6U+#F6Qi!PX0y)EH(nLPIk+M>vIzKYc#y1!qB3!)H+ncRnh3%IPlL6S-u@ zOc!QjuM0Gs<|opj;R%Jyuz^*nhPREk>v;ea%&7x{Puutc*m2Oph44>LFqhu|$m^xj zEvmE9rCd^sJ}|f&!%8+%x6Edk^rU*7=X`TQ?VSw`%I`zUxjAk)Q3U?0f?4EitE5w7L^V3K zcTP3gga60zX{4-+i;I86mOz?|j?qoSA0MJXfoVsULZoNPSP~o$5P`X{>*#sg891~5 zyFR@M<%ih|wl@0D%!I^t_p_58a%_a(H5v{gflhP&e~bqI-RSY3y&s4!B1CnwKhrva 
zzKG(V`yzi5$^m^5t3Uc8GQcz|WX|f2nNg{jpH-jp6Mf?9AI2#XQEh7uQ2c$smf{~9 zTL3Q`#>4wJNvKVogojKWvZ<1cMc<2!JG5&3yfJ>kE3LR`DDf#67^G4hwOy1)cIDD$ zKz^PKn(lToQRcnSE=GI}v95P-p3n@F0Y?Ci8|=Hc7jBq|1@_JuIKWZ@(2w6-Ke`~> z&q-OKtDX+7hXY$sL-9|)b6!ZK^7#0EBm{4O0$kHBt0aSu2MuQNmr$FItC<`)QrhIa z)pKIx;r*%}Xr$*sU1eE5AYLFAQZDbmZhJxqIY!X+it{u*2qMyd{uzJ&8nd1(`B%xi z&lX{PURjxxOlYRU4?%vH^Y{W%8ThpIAAG$XyPENI+8fS2byYi6UALG#Q>@dbrEdR6 z4W^W*U+Y1KMM&`FU@(r$U$z;6Q}cf^u;N!U{?pBJ7w|rr5`EkMnp>GHN$u0x>Er3v zq!g-Ks?5xS^k7>Ps=eq=r-tZCaxFQoAx@ zLH(*yuLJdm54!jsTccr_cslj*wsf~A!Hr*mt{Bu9;3xE=wlC4pm>+y88;NrPNWQ2c z0P;>`Yg0vM)oh#hnxs!1)b9jl!Ku-XZ8v}| zC`wR3MMFFuHYbFgjO>)nCE#D&!@0N80hw#W$Qq2c#>3{tBw{sbW*NpR;_%*qu!gvU z*;R#e^>j6}9!UIB!cY+D0r=Yx>$|c#REv?#t4k*f8cvlY73Jk-2q6*BQ`@9-CX}Go z7dzl`$PQcSF=ZRFq*BN8@Rrp8u{zCoLGrs~;ME61d$gAk`e{B#idjgvqWL#Z8p8!{ zU*dI*VR@mT%mx4wOlai~;mS{aGh7LFfUi~V3XHi&Hf6G12gugVZIgftMlQg$VUAdS zLMZ_EaG+&>c_Om0Y3vW+>-)yRHHVIlJ6;|v^I`B2H(Ha8%iMe+8t|5r3L{)cAZ_ld zm`I=B8%iIyYrQ9(gIZLbLbVj31@`^?)`U#+=e+Bp%@Gyys~fSrXmCdY3$_@KpWTwD zWVezVFtQ3=)7+kA5k4UAzs0hehK+ruT48@|ExaIQU|a8t7j4zrvVe&xw<^Ty&eqFs zxwQ3etdDf;vnFuD=gqSCV~2SreabDhjg}&k6g0|5?<$CrPk}&`AaB&1A`#YRN8KPm z$j3ANyyfeJiUyd&uHr9rR30GIYmXO%#^XfcT(NR-Liq&2E79a^aJ;X z5DSl<|3vInA1Q}A51QJS>Oh_ARKYp+fhL3!vMuA6z<7m?HUN)26AmmSyr>4cRQ@}? 
zrL%Ad5Q>ebAUi2YVsf9YEVKgPbX^uX(OR}vN9+op5}zRe+m?Uq!M=ARGGBS(0qFLW zO^>B>#Y%|K=Mp12=TT3SJj*-**R8{Vx+SjS17ugeZH1p+#vRY~`X9*&eYM^u?|IMW zNBJS>MG_#U$==%mOQiFb zEZw)RrSiTW75P=#RPx+B|A;=z;6N`Y|8c8DTjfq)nZ6eSb4>@n?N7i`Cw(n29dsq8 z9r*& z{ZT9pD!ga%SZ8&h(0l zzb&+`;O5Sf?54%yb1;YyWgql54kyv4cr7>Q`|d(%u$x%h2bL#cccq2*H0`< zS66Sy{EYulm9*=7yuQLot)0iS^qHnHxEns9c<1c?^hR$(F=A%Dj~MGrEZaGgnsKK% z*~@?V8zhkmK9?%pXcq9z5*!EPJa@58x>;Gw(+HfCr@I0RY6dXI6S$j>mt9?Sy8~qQ zV@5Npe1079{-|%jEBtF&4iUCHAs#@2 zR!ZO#Lfv=`wDL85rlG#$0`$cnY=#fLOf6Z455?9R;HRi%uO5Jixl@~Zl8u)3lW_*0 zdo-dp2iww+-!zx%cONdXL~o2tB97*LUCzx=h#q;^*Hl6f^xPyN6FnGNHMHG_+dt46 zY$LIPu&650>0jLfFl)Dn))Le^)eCbrJs6kO%h)d2FA{U^ zY_{D?!4&thuGOtcZboEKMvI!Xb068zZR#)Ax53lqD%pwIH{@Rb|Qku+vQZ zY)XdA2K>ToWzeZBs?`{Vjqa;9^Hz5Xw`PzI)&8bkgAE=ZhGZ`IhK|zKEJh<4WD}?W z;l3&}`naXGXQAOaZ|bh_?t1UnsIm*vr=ogZr;em^|K*s3x{0k`Rncv?y^QP*G?CQz zWE|~fSdxm3W~aFRP%rpa1*+4swzqBBjn}%#M5mUTJ?O(V$=f1d*czyl^hlJSrhzv9 z^jWAI*q+rE#2&xcJr8N zi!*cbHJ*^`C=9F$gvkBJ7yNl2jLpW%bj%e-8yc=xg{CnkD{&cjiadH5)=m0SjWdp^ z6EBCMtzH=+-y-tNJPup86tZvNsh_{rwR*XZ{j6Yd4@k z;Jo_31Ly8g+y4%n!Or8R0F&|0c-xUE3aEzEGKJw&+Hc`OFaUE;Dl9?#T|s30R|V0Y zK5_le1(5_H-}X0Xcm%MA`o~62l^b-2g~ma_LKmckDM|%$9;-2aH@$VOhQrj{@{xW2 zrCSs%74f~B)}53PMReCWQ8toiQu6*uHKlP@cVJ9V@~4?G!ye%WEJ-pcviv%lw$(YC zt-KTYV|ew)F2&#U>%Y`Chl_J3yO7(A^|M;hkg#&lr?t^Y&ff;0(N7u-ukNZbnVLKT zQ2@Q%3;7?BGWB9d%))K5wNyk?I6q7po3ix4!JrY78h-4?_G8aJ(V@8}NkL9m#8uT@ zxh=i&M>2HE)U2thdXn443d_?F{;BcS8|_F=>9R|x(q-ElS-M6qOwJ~EgWYOR78?#Z z^t&X#-sqG@ip1^apVps&`vbpxek#w^+vWgSO4);U0uH7LhJ@S#pg8&9vNaOjR@wIp zgSjDq#DV<1j8eZo4*n=V3;vS%@@B#na#kx9`5aQg=^BVVY8Fr!U%^)E)z8E?*rOKa zzgq^pTVc(P|G_CY*b&p;=li1MoDwEw)s7wN#aHx^{sXnILW;(79SA5_=Xz{G+*G=4W+5*`k1r(1`*ip8S$NwbSK{F_*4N! 
zFXbOAimr7k$V>B>vN&VoxEh%s70<)O=IMlbV;+Ooz|t=vqy0RBS1$Q|-M-Rbu;F|} zsuCV#%7lJzZR4s;$fr~oYE*5bkI1*>NU`&VB>*7Si5ONEz0sGZ)m=`--aS)eC!@9_ z+~?Zs|1`AzcJau7h~3Yx&s}z6gvzBFA_Q4wjjzfJq*UOphIbfrX>`6aG5*DrSYqP_ zt-gMiz8^$KY_V=`w6dt@vOwof&PY;guYX@lPI5`)&8gC!M)lT`xtP;C)lpvME)^D* zWs;6bOd`?WJuZkrAOK@WgDJ!sa{$o62iQHE0;s9W=0tTrig-|GbkS?zWT}#jwmysF5T;(T{VBMf)*NB00tF)EaC{~_cuIhqAvVaFZJ#W zo*psXz83p3w7m#n|3LMJHLo8os&ffpFe*5-7%9|^+>R({5(dIo1s_5N{ z%B-vCCYNGQU#{g>v03-u<1%iHFadnBE+;N%1e-|z3q-Yv!?zoxM@%b=Rc-i}_y<6^ z2WSwZj--C8BhfHBx8L=EIMeD4$rle}C@3W4nw&5_SBdAv?YIx(GmXVJ&3~x(|FpP8uj@1r_%O zduK%w9bfLk82Dr^J=)`b+)vWAnwZyI-x-E4gl6efJUd-E?P(x&Zqy19hg3y=%?d!M ztnPd+Nk~mlb9sk>r?{2FC+1Y*R!Y07SU*7D^e#}OXe#0RMSi7$4j~c9t2KydMc4Xk zo-O&iA1M8tZk(;!WzcWuy&S2d6$VE`8NEj$E3M`M^nd$w!!7d_1b$m@R0s4X53RtL z1(IFOVhe|<;O=jq`$RCz8&^B6#YlzIm|DG2gta5FOx_5#X6lulMfyzyy)yHKMW|yA zQ#q}?WZay_BbJaZQFT^|0-0~Xybxwp3CFrV3k-88tM}B|x@7-?t5ke`#KdUeO3Pv= zm*kkTTlSq5F=$^d<;BLKwfAz*CWU|fPzOHhRO(S2#%?4VB2kD=*(FXdbF*1Kvm{Zi!u_Z0SKy)=?v6x8^kU>*1gnuujdLcxc=eX(Y#r|n$X+RW|~dwd@NHIQ%Xn~B}> zW|`&rAhYX_#(zz*Mn0$%6g>I@+ri0O4V8?njBwh-4wY;R@Z0?`mX_k`9!Tg6Q3+as zbbp~+?oH>M{>b+8VDoKl^=mHIGDW04D=P#L7#w1PR;7CY#Q`W82Vsx-sW0zfz3Lu_ zsD8+XcUghVMLwADxx%&hpg!)S7OQp7RXs^M3UP$K^Z9v*^5ODKcP90U>f6&f+f8w| ze%(?X&!3%r1)fv-PZX9;nL7Mmuf}p+!^Zh6U$z}rINGr?e(v5K`^@hE))QwpA2$5W zlyxm_2?oA`-q@9@aAE_PZN!C#d~phDHb#&8-0~84lTD6Qgl8-WCD-elP5oEmMMBD? 
zsny+D{Pzz@)HqcDyA1Ga&*G_F7|HXrd+U7}&=D!0?;R$)tKViS@wc!EA7R>K@%j0C z5i8{MKP>yTK2T?U=rdJGfCs&_4To~Wl%o8DPGT`~(T~Hco5BL_8^-cvj`cVD&NoD^ zs&NfrL{@q~ERokw02$VlPtzjyr1E7dcLq7^f|_7xX!2JIrk@OWCv0K1&=kyS$?4Jf z3!19CP}seoj9A2?1xi6xqH#i11)I<g~{NHA+Rs2{8txE_z*YC>Wo=`UaXzBk_I~mAX zVznbb9|KpYr_LEw<`Gp_xqzjE)_*qW_r^tZY`neQb--=@9zHxiAlNzY?Ul&+V-)(x z`V&DXn>I+3kaNuWWYM&?27T5vUA|{VeOYx}E3puYgjM)nj!M_mSFU6dPsPl4H;T8o zXyTD0h(|8DpMp*@}X5-{t7|;4eR(%7E@e_%)McX$xvjz z3w&x!bAfWAP?#pscA5P%O(Z1iBFNFXjTPoqGISF_1(mORmO2?qnYO)^)Qu}7>&|}B z8hPuaVrW^vr^zEK+NTg|$thzwad`4tlMkQ0E7|};f0yLtp2@{N8(-+^YP8QRxwITH z&ZNN!x+L-wL819>v1TKNC&bmP{u#HQ1c&%yI>#tQj{nRm2b>af92s8AZV{2`C>pD@{JeqVKx*F%K-bEszTl8%C3V+-7ej zy^F{M0b8O^y~k%=BsYimh-4Cr9p_}^(Mm}-RT@ST&`B5;~s z=#NXtGi)vJ-qHs2(?(Y?Rn4v1chRoKcWPW-$|-+h3H5UyVd=939o&N7!uEB~$RIjD zq2HcYAgSNtyysI`BdtM_GJCTp6_M?WY$#a|22@zH*T;cn!js48&)3dgWH@*XlUxi@ z3Va#sAe+qGH$puhG0{x2oT+af!F;{7%h^aYYl){}seTcVGZSJByY3TS8_b^1Pi-_m{*n{pQCl*4tm;a5@lcBke)L18}_4=7h}t+Ym> z-nKc%sR|uEpzAhBmB-F`X9hQemx2ZK>30k<(zo?f=sDGeDfxqrKxjRec^{X;s_}h^C#*q10rrM7 zZ;iALP^oS9=`0nZ%33QAG5ot6+3n*JXEtijeyLFpTHt0JsIKsJy-Q*KP@t@`tnw#)hfw$Ly(6I+-<^cM?d%QncpWjJ9v*?(0ClcAsZ?V$CUW22Q@G z5W1_Tk3O#IwS<2}tacX;J)&EoJU5`I!A^8DV68eEQ=_uO-MKnRrMrhst;5JEL5VZt z%{*PJebF%8W!w!I=kmKL*$=Pcf@nl{J)Y*w+R+J&rTVakVM2m?eAtfh=Y_v;VuS&K zhu#)L@3yr8B*gkzI!^wf)jUk3@^kow0UwpgqTUcoV!CZgFQTHysUIRcp~D)5>r1JNouXDaX561 zaBV^9m6f=f3P{<>rL5KZ#ApUdgo8CNai5YmN1kL%uBkGQH;cIHq+3qf8NDpkqWjW* z@**V>Ckkuyi~h(Vye$&?4fdIsA{dKUsUUpBD`h8SEvnTs8^z~5{rQA$Qq&HUO_yII zCauD(Q8~`jJFn!J&9c8&R_plh*uDOH7O;Q(><|?VJ)i_m6co@r%T@^#3mEA} zosj{|#+lJSJgb{t?4Q2pxP`=(5$L$}cg&JX$^pR|(3mAq>-^D}<=2zCulq+|_Kzm) zUttcvkbkBDA^(L2#EzPI;Q+fVaTm_ieYttC2?2l*6aQEc;zDxU^t(5sqz^PZ^HpoQ z)R^9z`dNPN%(l;_IlETWUt6+c$|?LNjk~5f=$2BA!@k5Z)1-!|7k83O!-E$_f+|Aw zjcEg?T7$^umN*)nZ@(N%-py3nh#M?76>kgw(D`@yR;CpdA8Qj}ky%6E)5`sd@3>MG z?(-c1c03JzNmkJ`)f~y^K~%n1pC*0bpDcJtH|l#KKh7yJ-{x)sUt@wssbejh*h7tC zp}?3A8d4g7>ZaW&!BT%mFZ)9EvB%lP1;k8q+c{-9m5XrW+vltnk6!k3pWGxl*#A}H zwd{r 
zQ5YL7I-RiPS3h+29sM!iaPec7Y?7St3jpU(;N^|4A?d)2>dP;`d&%Y5+F=f{jyVIY zrn)cFEt?gtKxnZWUnkby9R{`~9b6trvA<3#`AfZnymhs%uC6*%%ymWkdX$^?szFrl zs8BoU*2A=f2)t71pdiqt5Sdj2sq{3zs3grEx}lIwYq#PS-WR3HzEcF##CD5_#a$EIjj|8=)KSB>pn=e{7*EYu&b17DQNc_M8HkfY`@AWz6e8AN7b`gjq z1i*VN+R$V^&T@=X!JbR$`-&8}%eA^UdSKj#`Sp!OUzMHs zDKYVpU9J2dm-r>AHOUR)D#2CvR8OR47Fr{k(GIVC*t?3Ln{~|0e0xszb#a*qr3`IS zfW73`HnO8#^Ilcu8MyfNG|K7XsH80lj6Z(z8If&TsEn0+EIULQl(g`&5R-FqY6 zEag@6An1~Bn&7RA`bsYp9?>nZlBS(}Ta6c*7JAFWO;wT$A{hlKQrY*^wya6G`7!9z zyrkoZ;baNC;(wFmuq~D-kLRAm%lJ7Tir36YR`uDRI8iGlhGuSGcyySg&e;G^DV{CT z?PPJG8IyJl2dEalv%mJY=3G(9cAt-pBdG1zwzw@J$RGX)uA8|t@iF7d-DnpUVJ*)C zH2i%=BMScp%}I34 zT{_#vL)^vlId2aQcY{wuV&Sq!>jkL=7P>c2V)#Qw+CTXULAidqb`DvqQRs&ar%BLQ}E+C8yO*OPAc$*I%V zamaq=*;MJ%&8<;X?1Vk-IKM!#Z0m>9>OoXUL@0CGF;)o_7QMU` z#~SuAMXftTt@j`Rl`keXK0eKE5kg7h!WKF~B`HL@j*LB*t6rw#assp5)HQ3auc5q_ z!{8u%mDjWPc}1+b3JK|SlK>%)$b4C5x76?X47=BfM76fSB4K#<;Bdc*>v;dKjGe5D z(1KwaBlF}h3!A-FuT1_Av8+_a@OfG!?rp`2ERotaZV}{E&1qdVs&y~_}8K7+8Sdw)J_@#%BD-eyQUR(0PLMO3=jJ)@gq(rSX0*( zzV2a0?bgt$j~XMF`&XGZ6_LAw=<%5cE}7EX!MA_qNLgxm2;YJ|XU#NK8!=AqyFb%1 zLdI;+(lYFH)97+Vl92tQpez174MX~5{&JmbwzH0bu&ZILPjwxq-o}S(L*Ejl)L0NS zc=eTiMt^@7xY$I_1lRhIWyh80y)-mR`%l#EGH7TegJz;&0uT%wA8_;;=n+Nj>?nSz z8>@M(G*HW~#O^4*C$pi?A(S+boZQk*Gpl@{f0HC`we5_-XCFuY1F>0v;vu|mz(Sb` zaBBePGdFQg+}x<9Ds zO@W?{QVb!h&W2m1fIz$n$(YK)nPmXhv&1%_r#^KAN9H=Q>8E*@vb#tqN2LBj=UMfC zrFJ9bdwP6a=&Y!hQg&PHI7lRD^X(p`7sFuodB{@iDCte;8L0&j$7>+KgoyVrVJ*BY zMRX)VF@ZpTG1xXLvFI*|sTy!*f;K^A*dsR{&f5ED2lAKLVo(sg0 zHA!*^i3MPmh!pgDfbRwnY_cCf@3|}mR?GBLU9+Sdwsnj_cfFpY zV#zm)f+s{->m+t{8DL5uV}K4yj%hIB?o{&~oT9~68qdvokB?j9qE4rI7US~Xqz(3D zb}d0uCz*xYzP%$xwGn&Y-`qFutj&S0~au>vUh~)-K+#Eunr+k&UW$$8#==;M(dJ9qX=sU;Fe<*Ku}0Wk4he zUYqOn(Qzhu{)f#s%MsZeV-LPJxy}4~n!Vg99jq+9mqGsWyJ~7!r{syrSIDDsMJkft|IKb-hwZ#-)JDue~hsV5*!rR5GU?M1Cj0 zkG`xpadh{QriSsP@~vfp<+O;H{)3vvu#59FFFNBtRe(A#sM>-!o`k4EFm2J0WdycK z3my3~=flx!*84N&6|pZ}9w{vy#TC`IwH_^%sm zc8!)^xCF#Y&L#b8bNIXHST=NZki}kC4<7^PH{SI}?4&QU`YyCSMrQ#kj{7c}xrlhzR7WUq1L?@hzoa7u&TP-pORdr$ 
z7)`VHQx3%YPv7m|k05&%qUwW5*rm()QV5*N_?u2X!M^A+vktMmS$260Jx|5zW;R#g zl6g=WR;%0gCe~OmvRPkRwZYGs&yT7o`Z?{fwK%Bnhv2qL<9IXwaxkr(XYQ7quNOyx zGz&eLqUbxh=Wg1jBk39>9LZ9Hys%liBoQ^Bni!NPbB7~=x=}e`FpTkE5G=;x?c-wnvTnjy||hSx-I@fm!uzLK7C#v(X4`cv}0+Xk1-5) zi89hO+++>qMVGSKzRulzlOD|0yU;6l%f3GP35}Ll4Gpp!6wudgBYpYm(q3tvRYs3} zei@ZMEB&rDvO?w|v>Js^dH^QiPy{f4kj9tt+TEr(+dCZSdveV$io73hnyk?u7yWV- z^&YaXo^Z>lslUG)LuRpRosl6NLasBg5~BJL+lg2?`NK%N0)yB*MwKq^$&XG4PbcI- zZ8{J^c5a?tu3j|boe6w*IqRq<@CUWD2MESynWZ ze0@8bCOudsx5h_58Z%ip&6hy=LOx-iwv7;2Ha6uOUzBM$-Qnl)9OAu%)QdbQW$I+) z#+W^slz5D*Vo+gEdqDU8S5bwN>_Fsse@g4C1WYNfO(b^3^N9P8@uIYw-#>mr)Qh!k zWFQN%lWrA#czUVxH;~@f7spyOyy(im*VFeF9}hsri-!FCAPoK8TwS4ORH7VxhMq;1 z^XQR=w#MeP&W6d2hQU5u%@L>mtfH7`Zx;IhmOT-=Oy| zrK*WxGAlIAGhN9y)V_Eji^5r5cOnL4ICHb+-wq+U{4KqHm%YXFi{f8Rbt@?0q2USV z>SEj$BFU$ZdXk+unmC^M7rzu^Ux8hm=DxL;-hZPe|Bq^Q;OF!y7H*F&sl2Wa?LYg_kc+~8Yn_A9f9QUhLs$Z>X@3JPvxO%t-y4nRhhzp zp^7Gi;%F895sw5caJT-JbMTja{I4wRz5C^%`I`xypFj?S}lH zo0J3mZ2&qX@wyE zMB9w}6Kzx8b^vHJ6YpgKb*8l|U;&n;h-oDF3HY^-9|25GwTa=xW5Y6pd3J|QV-!2?{0d;r;x;9##?mqlEMB&ADi{J2zL0h z1Qfy#_>d54VdxhuqC>etdvUyJ|6;Kfb?hk;2;!Cve24K#5W#Ef@>%E`%- z)25!*nDtL%k?pt}hlkb;z(~^T3;*FX(o998F}2I*#-93p`GT8{7o=~5J3qN`DVwUt zDcpH_?y)7P$mqRM1yjLpHdT<)xNDzfeO=u9Y^rn(!PM`jUux2YE>!9}```Oi_f`ya z`&u5Q(Z#~>>`R@Kp-%Ho-+(=nshiN5kPqTi98WM-LgY7Yd#3B~lwZ4#d+C^|j;p?^sKoL_vU~JoGE-@8^;*^ zmSndN{#qs?UyKK6fF5mUr@%?n&aixp2c-D;s|OUT>nUqUMj!cjO4j>wu#zOm>H_2* zX7Lve;1{2!60yKY>;<(x&IjW9#i)u7Y|^GdIJk5%#R}lM?&?PjD~rU7Tvfi^J^x|h zJCg7TP$eGzMU_}|p~<^s#LOz+ED%&EcwGKNiKucF*ma|Ek97zxetN$_33u^`BVIM; zqW^+{*nRr(GLXWkP4Xl9z6bCFp)2V8s6a)Hq!xEkj_{wD6Fl z>*gi#xqypC0v54(J47;A`D5IgzS;3Bkg7BO<^H)ubU z$QT?kMD*AC4T903OHKay(mz+@pL^qXz}O>1*Kg2mMv`#wacQ*4DiH33E(`)e&U7Zh>J3o)}h9fCJ(*B=4 zg8yF&UH{eV{XY$H6aQS_|B_na|8#vP|GB>Z(Dj}Ahp+FaW{pyT&}D+YO`@N8x`U^0 z*Q7f8EiiT$1argb!{>n$s^nDWzJ=@am^=J;# zvl&l7KYWo;c+6@)<-JiE)_YArduigD1usRm6Nikr%Dn6@^|T+6{F;U`i#ugBD*;^Ka0Q zS`P{Dbxm!;ZB(Z|RtVY-*u{xs>Rzu(kk1!nI;<@F&RmmvbcboZIJO@n1me-IDxOmS 
zI);}>kB}z<*vw#z)(K+Fqc$LI)o64r`EJvPd9VKI(EEDb2Nvy^Jwd)o!4?qYo^If-;9iI{R-so0`oS^*&**RxJ+=3`s z^hiPpfs}oJ&z*B;pR@OwIWud`z3a~G#mdT`(3J@50B18RGINI#6_lspy*^bF77`P6e_J2%ekRmJjv{2mRs z9BQ@)pTPiJM%-|EyrF*msO*el=MxPo=hD#GqerUxJjwpRP?=$nWLjIKTB%ze;^DAc za`z*aOiO+Ga>kXT?~$*=MRqHTDc|y@o~#r)%U^5Fb}QLT$?wlx)l)gA?JJn1ODQh; z<1}9k5ejrOy!7L43ueQoFU`4hnByiaqMu!sEEsn~zr3^_raK`zM+44h+%hT zqD)}!E9)a%^;6tdK057S>3PYQ+xF^lkbjR=AW(Fw{DVXWMcL{xZsBkkb#SWG%p7_6 z*W){XLr(oyqH0(=(Hh^fhziSM`hZQ3b0@wM_+mLzK!48z<;`=99!;{pla4X&M1RjFctF3pM6y zEGo={dhJcXX{xwtO|cb(Ng+lzVkvt04Ar@L*k@>U_D=tMa{tiwNzi(XDSY%Wtsz74 z?09+SIn57O>HGMU$3qk;3xgNnMl@d+>X6;L%tQ*jm5TRLDYwM+oh|u^V4wx#^>^1d zK88Q$xCMATV~7(#0w#xeA8>IjJD{mRc^6VKO@e6+AI`w5TzurGBU=~AW@CD?fzg_E zZJZb4JhL(7YCJ$BTAqtsq=FFLL!iK;r41+tbi*7o_#Gg6>e(I+W*{Nc(44((mp#2b!2C^drjLH@TW9_U%$8;LKK>@c*6T|B0nnX7 zY-uNix0S6hK}Fxrp>SS~G+vnUK#+MSw6gFh7M-$?fqz4$1&G&pkxbC-R-m?+qiHh? ze2x9F0}?mRE*&=hp1-!bdR$KFhvcRENnNaNA2VP56z$9a5$t+`6)}??NLpt#S*-W+ z-OYf9+V}F^3#;ETmi_oV1gZiB>yOfg04W0at-fw_AinCtCFby%U2q8xt&HyyV49l*{YDhYt5W z4xN%8>w`~v373QW8>M#k7qhEd`v%WB_wh7#VK}MB&Q@w|vfkrE`h8Eh4^qplXQ!CI z=wn>?t<^S`?K=LrK1`2N6)Z>?8~c6cn$7h%Vx1hI56qW3i;{ZemLbU-s7+`(*Zp0x z9F|427(2A)UW$&XbECb-H|8e$qPP6vvqv{x^FN5xX1pse{S)sjH4VX)0+PXMo81eh zDFj$&I!jvZaQR#R)@)t(LA*@bo{L4*)yolL8Mg>@rY5K@bT1$un&{HXQB0ZHD8 zpWVjwiH(&Goy?C{?+;;u5A9T&%;7uRgfsZkWvvs$ZPL3-Eny?r3AESLce|pk%d_c? 
ziJ6$(0Ee%(bEnSWo0j@h-@hRk;$a3=v082}D=jh{xm1Z&1^HY>y}gT!2VjyS(31;J zeeaT?6Xg{+$n0|TRp3qh?gg#!AJxq=5uKX{aMb19E#o9F; z?p$A}eWz~?goIy-t7Hte*h78u#F@3?Zx?Q5$>8&Iy4+UIxMxgv>I=f)kE&RHkz#p1 z$M)UF-tH)@K#~3_80(V)dA4Bt3uIA*#lmx&Io4vPLDee7`{EU*nP&913H z3oH&!3wM}0XV9u;mpXNwGaV_sKQdFZ?ovwcpQOe(`Dv9|o&O!F1OQGzdlsmgjj}G) zHOM$wxJ~5Rd2K(Jc<;6%Sr&gqd#_1%Lt~X!d=ElCGgY!x53dc1%zvTSFy{3JUtw(c z<-v`!#kQl*+0<2QcGV$;C_p6@vOG97*fD=$oG4o0H2t!%>!7~+eU6Y=OcEy;E;_U> z(CSw{IK{oP@Eql62-@PkjMdJzE3}&0l6sW%?F^MYn^`qt(laaQ30w-m4Cv1BB39z0%XgV%BkN{89eKN{{m^yVBaRpV5XRx=h2v2j(-Mc8Wvc5O|`jw<-$lbTzaz6h~9Q#|7{GW_z zC;p3lRvhMSDi4S2?TlPMtY0-{EEJO>sP4-qdz2s zomNsfA5ycVV*IEL7+eurkZK(?A5QHYR;?w+Sn)FaMBjt<+rYpEB9Bc$c!4Q zrC)#s8vLc`Pj4*uJp+qfjwV)PEqc9q9Hq&##(8gl1+}O{vH{-7?iOEM@)*3E3>jIk z$FN68n(Ebld_y~8y+D%^82;JcgnoxptfZ;ZqIDLSp>Ta=gd7%enJGDA>{fKuqY!rz z#bgHH_-gcokU0?Y7>7?4;JX%^K4`y_vHG+vN-+&?4W4S=vXRP1)kcu z517H7J@4%8nn#T7ug2^-#ewQVS0fYoH&4xrrvVa%!d>QkWdR$7^A%~t{ za!Fi5Dz0}x^alejCp?%wKmTTEA>_;(yWF4*=?AV~-XQrz&U{C&I9`y3zXXVhO%tN$ znMS=TXb~EtM_`p77($ZbO~x24!zu1&U~?bOHe z(p;?R%=UING!v5y-b~9LnAyJd0z}F-zW)JEgv|jUxN!s2QUGdGb>}*>zkT?3Htb)_ zA5s>2{t&IUfGfq)JR@rIRNwuEZ*iAgiKP^1bGmlkj3&0aC-kslGXRxQF>|%uH;b9{ z4&mk`CZkhi#c*@{SvUTDF2e`pjQY>cmx6o@3~fhnnVa>SpT{;UEEWr;%DxDEkzQ9z zP;?c!`+&4c^iFM~TKC|A4Wd}o=z;+}f3-ob!}8TY&w^=v`Mw~ycB7~32x(W+jHCNU zA3xe7fE=dA-5MzjB&QfqQ7XB4_34hT#904ez2`pY9#`0l;Hl{A@Tu$`2b(=2YG>m7 z{3GbfLG=r_k7OwYchwEbBvLHO$ufJIvJK{wC3JTMYZ!gY*!h&u)P=7mQG11ER2v#79cWZ+p9oFW2@9?i5JGC z$}XL$a0#uQ6Vm87V-KIdrMxVt^m(Jpk#esBH;-@g#iCZMcxf=Y&c$8@GFkZ-MI!M*VDH+6>7@tZgREH!(F?Sa5 z`*@(W$NCUTqTnL#8pgFsL#QQMsWxIJ8r54H#|}C9E|1JdsojtJ1i36tE!mOyBqgNo z|sgB~UjJmFgq_efUM-Qlv;x7%x1aodIuy|W@cluzvmRl@XUud?z`tBoQr`qhlB$RkRd^|*q zNzr5W8u=;886J9VVP_QkKzWd%8MC3VpQ+O^`PpEDOP=1X?^n>EE;om8+I1#jAE+Jb z;ovlHA5ewuDBAVl8L8Bzr?dl7+1TW!f5o1Ja(}?6zHU8^XNcxmY6Ie+}kxDX62DuYZ`wN_wE9_>90kMtoxtFP9CygUGB0?17eA19McnXvOa^_TT zCsePj_0S&kP<#ZPECYBlihuT2v<1a({G`#35IZymXz_rMh>3LC=oIVE>aS_S$#`cD zzyMh$0R!s5jugftX^CF>IZXd$b@|_BoQao6<;aI99-;(x?x+hIb>3GYn5(JDO%HBJ 
zKgp%qCy%2q-9H_kJp5VF7zn_rOEwlT#z30`o$m0t8?B=Uss<+_olZz!hikL-igWy0 zDo<(%k;Yqqp6sL$Gz3BCjqp~^W^|I7KWm9*vZK^b8QT?Y65dvagk>|Rok4hzs?-D< zVw;_WRs-x;e>sFQTWfxAuhI!!oi1SGY$p~ylh65Hd5sKs6~5{t2+^EPGUt%n#Hhgt z`dw{Z=rtXBF*Y3(WwO6;ip8)Nk966UcRgI7k`j;^We(^usBnm88HSD~T=ZLLx4G7g zZ^EQ?uMu>|+}{{~j5c7vl}jEy_DN!6dS&PPHFWZ&>pBmNatVbOfuRMKZVSn(-n<}w zFYR=vsCUV?eq3_}s7X^+3>}}JF1V++4sG=wgyVJE1Il}*4{x&@G-m_y3@`5HBTjzU z3j-b)dJBpffz}(w-*3zs*th%;>o4v4Y{n_T>vmk{xo68XpqjJy2Ef9h@Q=TOY6{yf zC@exc#357`UmMc9GPX^c&8CH$(z(B^XU2f;4BEb>(5-N(3jq$uPeh-3BlMNIw{cW!*zNoIcI{)trvBBm z`_Q-e9UX0gm+xM}f_oWq`1g@;zI9NDVX%-i?hV7@ZOh|2|oOiGL1_8Md3q8+C)2Xo1A$IbIHc zc<$>p6|&)V2+l-ZEKLtI{>&1bZfeeY(}p*jz^;GeIc|2ok3VmOGDOWuUENT<<*~~V zIR=FFm^xaL%*L25X-6;WtCxGaoql7KsZD=WgV`w*L`8$O@Ac+Y!A0+7k6SW#>DRDC|DbGGh0sIVrb9`vJ{P#@n@P4T< zuYNtGa{|-NiF7bcLK#QZb6hAXpL5_zY~bWUofgR7utyoi^$o#?^GWee4yDI3cLJ7C7XbJ*gq7%oBIWSa>7#83H$TQK5wAB?-@dFtQ9mWPuSWq_ z02|ILc?6K=OEA+GmMYbj9Xy809;rzb=^o~MPlNhF2v33!Q45Re$PVq^{$#4>f-HAV zbh>J#9y{9QqH?wO;Vv|hEaK#$hZZOpgzN|io`CG{djQte5!m`-aPX;JTqhuugv92yXE9*CJDGB~ z8p;{ISxa2JB+VMv8L9z)ts_T}1Fp70wi&s(zi(z+jI4{&i`hD&dN%ZtatA#LnzRH` zal`c=>hW^r`Cm=0_gK9Q<$tNTH(Eu3(k{wjlEn97vOADmtq|v6FjnS)c;|xKQ&!ou zbVsICmEa9S0fDD!rfqUWqMr?68MTv+ZP!cdUUUwIG`d)Bd+E3Y5)|6R6R!;Sb27$_ z*;|GwlG8+T7!{U|;%lXz+c1p+dVxKUs>T-YnLRW~zA?wE#CqW6T71Ow(9=5gKfvSs zueD{ifPb~DB9yeyZP?l39suxTM4bwtd`c^BR@Q<=HuZI{nGDhVUjbD8*0TPi<5K7e z!Q*6O;e--!9e;EDPZiPr(VPDD`QQ4YW`DD?ivCNmmjB88$Ug_WvTXDv1{4Aw`Y&9d zmVb|V`?s&#cwjA;kUtie8(ZG@FggkVI_2#8$wGA8o7v>8+`tY{vhL=c9gRJ zh(X%k_w0uqd~scdihO6U9+FFor#2d`*Q$z8=}xvD5?^=2W*$W9fBdw%q@l>Rn9Xjw zSlG2dW90|8z&?s(?KqlXFD|w`Q}!bfk=#B1NSn!cD4xL|=)`|!b9J1|j<}9@z;ey& zOcDih<|=S`_gm_7?N!TA4w?3x=8~m}8p@MdIYTs6_@)(xxT8;*F3Xc0mbH!XbfN`S zS$=mfCUdIP&vCmwprlZS-NV>*S)k(8&ilD^HDcrH7)*K{K3pkG=X|Pr3=^hIu)vRg zg6}1uHOXx!>EtxkAVUHW$n?@+V5IPK0PMpVw16MQo}XEQcUZf0n>1d=RdVc%A2az% z^_^X5Xnc09LU=IEw1GVa_qwwXrcC=(bu3D-!r-SgMfk3$0UlR-xj#%)~(c zsz!CcA)py^yHo{{U;Y+;O+0k!j{jFCdv!IO2<0T|5%M|j73@(*((>-J$@(*@Bp~X# 
zAJ11=SsS}U-R7D6kdcNmgel;Jo8X}PD=2&0gFwt)fxk=&9^hqoflG&n>lJNvXj*f? zcP*1^j`KNNF&uPBqihE~#48Q$6(@9l;lF|?699H^b;zSOPdBe9eJ2xp8tUBp9cr)H zQ~_3PWCWn91zkEODlPTB=zA0RqM*s4C4C@Kxuh6edF6v~S3u7!{f_yl+SU!-kVc zR1yRidp*$P{TIQr+M4dv3g8|qF5)_tEddQ)=|xy$DE9ARFr$d8c{8(cI(@b<>gP9} z_E2dt%T!tgFrLm~s%$+UJRHo5ZHbGF+zH5hpL;&OVLEbdEmJQ4nWLSi^5q}zRP&#U zhkxF*ImeDpQ5{cD_(AUSk>HC*s|fi!<E+_IXx+CNhJEvfgT!%{2UvIlGVr4oW4c&CZ~Lv1BYnphytWC+1noN0n`n#65gc=%r;D7 zYk{|Um&&GLsi`+V$Cx@=f8<-_nVeL+IF78WWze_WBLb*wr60}BE65aXwOQPNG(01l z(gWpaifP>PVDKyLXq1W~Lj3&;Zl0v(*m2a&$X3g~Vd^E!uI@?0gZ-lR37Sjtr2+1X zD5qxVQ?$2pTh8nLQg^>E7~^=|9%s-oyaQT?>@wus6n&3}Ey>}E3qS92S!wv>bLKuM zJ9URA?21bc(AFA>-~API0|sP*KIQ!iqFqKgsJbl6hZDq|d>7~rl#$Ha@EL_(obqdP z);+Fe>)nre&+K(=6nri0_I^?=awF9Vf`_pG3c3UX6U>~mfI2IMAJhmR zo!9ZsWdxrAsk=M#Stwqj_~|)%UUgqSnqA&n@QJh*W&Yo)Gyh8g{6FSD+8ATGS3EjA zB96}cv?`AyYw}P!$>7!AOW!rap2mD@Q{5ilUaCmI)lVm&r7=jukCBpgNk48Se*R@V zWeqBmb$aC>he;Fo|G$zlQ5%jBJRJZC;C`>!gQ|CEXIY%H!b3VY0A7))Y=WxA!~=Sw z8_iQ{V!vq<7Ses%eLZFd7hILWnlz!LHg#ViM^OBJy|#{Gp1?9qY%QF{k@qrQ$zjPw z=|Dm&cg#FcY0BN?tgU+0c|?sbWvmg+t1Wna^*thFNUApTY4>{16V<8-*Y~cgbeA6{ z$#i!(3y?+_b-Vor#D921q1X`Tur4}pEHSFs*82LF8cNd5kCbjkf71jfgV9|GI3~So zo(bcy6VR$WRvUM1-)uPTN4M<9Fa;;6$POKfj{=@dukS^9R&o(Mmm2$o!@S}EjRC#d zVW~rIMJcJvJEMK^qWS7wPbS3}8J}tZl(Q<6D@cF@nH2$P9G2_E%w{x1WX{PoZE$-{ z*v34Yvg1h_OZHQhVZ1s82iW#q3jJs&m)!d z=`DzCoaEZuDsFu*qYt#I)F9|mAv$$Q8O7;0vfV=#37gcg+?DD`TOR)!=UlelwW;%I?h;ZW@|IjkbybNPLzcelQT@X;Fuj zYVTLp7|E-9cDuO4-%T*_{XvJAK$_rNMw);_V*tm%WQjpW9~y%3Yoj-Y5~2(5Rf zzvpyI_g^4SZmRQE5u_0}u{vGQ=muT^tV^5FPC)thcW?Zf^Y(tZ&;Gs7Grmynz&h9~ zxFdt9(_`nO+-OAyz|oy-T+i3ZRJO5uTz;Ew=--}T3+g=3k z{|G>{SjZh{PQh+vCy*mtXm8Gq{t8+dheAlFho}LzGfB*zd;#4t0}QWvWe%e$u8@=!%t%zj3zz#9w)0-Nt$e30qmBQA z<(t>EWs}C*E3PTm-sZA=-t7vywGobcoYWrDzZh(wq&WJax8hB(FhZI1UhVScB%Fz; zJtyP$Qd&~Hhfx0EUbF}Uz@KL;H1xR(d>)a^pJMxpa5{lG|3evhU{R^g+3 zVd9%iQ-x$*Xc%gTBm3l>5ls&hCR_tAM^GoSV7g;(HJIuY^%pHDob;8ZoC{l5I)=TI zvi)HYT#T0>Kszvlomqld)H82^Wyo2Vh7`6jAuAOhS3;`NrodZH=^n5SUCJ7g3$=<4 
zrbAAu0$*LLCFr9%Bm>T>qOc;xMxJ8I!=LM<>98pE3ud>(WZANmsIB%8G-D1Scw=N8 z4e9(nD28M56~bU#jSn!l^|#Vd^apQ(Tr7AYk>(R?#`KT)699!rCCWm%t!zvs%60cd z7|?^H2YktRBbtd;mQn9aNc)L->hcBkt;Wp!yeo4HEVZ*=V<>y$XAQZ8T~-7rDV;UZN0ZoiyOd zd08!wg&3#z$JH$$MbCpwa3eZ-FG*-KMW_3u+3-1{h6-9^dnsDApVC~1DeKXLyYIvT z)wX31FnJ&m04o*7N;Ye}_gm@0A>AhG=H%xdJ(v{dFHCJUbJa>y;yW)4;S1OZUe^p#;GX(GI#q@eu*PYx{IOa zO#^tyHF1@fojw8$I{U2x)vWXG8{TtCt#pYbeRJBW74oqzM7q!a@pQGMA*Uiw)>`XB zyxE7b7!0JOpk&GLH0J8XK?AwZ4OW`B9_Q?akBiSn#QUu^RZMa6 z!l#^8p6fT&*O!#Y=*fHmfu8DB2$C{e4*?!beJsEpa0cR`-@_Vf&bWSQke*xvehF(D15Og(Ul2cUdub9(|9Q&uc@+5O;#BjrN% z&0}&>4PEJv_8BoZth@xlJr>(p$W!R5P5zMZl*Y@>8M4C|#^|VIlWc=yF?5Xv3Bm+VY7|PwHxUr11>@wZSnk{yQ*0nXCrRHVz zi;2W0-Np4{!9*qi-r(vH0ikae#~{tHuxNymoo%T;l1_D2?3K}?GC9L>LojZbBlncVB&(64{LIc4_p0g<#ew)b24GhWzJPzN;XhFO~+FXWqX!M(@WZG|8&) zzqS&ZkCf<`kAwiq1eJBx0cMdoXLN2_XpE$VKmn4!-!!q9Ih5Wu%gzUxCO3Ei+1o?I z9v~qi7cJ!O^jC+;kple(aEpJ~=#Btm*>B)mFe|v%D16W2!fE~GmaDK4{z_2~{^DDL zVvGA9@5+M0Yl}^{M|V#|D3_y2zn~l`^0Y886Gojc7ON~-Q^z#>h2%k-v-s>^oB@AB z8lgFKyWr!px#bIM2|>7wR$bheZra!K?#jlVDYFUiI*}Zv0RT|Oqjs67i#b)ZXekwL zBRKlQyNJ5zL+Zn1#c-K4!Jw=wepK|8oJ#1yZ%Ar{REeL|l1^l^RC!K<(dfZ9yN|KV z^HzE84?i!a3OfnJ#*2?`f;5{IcWvE1<1FAhYLb z@_P-{7c6MF&9FM;bm0r6GcI-rB8woSSUvqh?I^_JgoHjp;~J2~Zt_e;d>tuZ4zMxM!Dt zo81I7m5Z)w_|mnBn16Vhmlt}MLW$?8_*My!vN!BeyExV?yd+`R8DNm4*5Yf+^oBRn z^=AK*Fz}srrMWO&u%2R9us6aI+tVdAtu4jq$8;MHufY2YOz<^l>>EYdM^Z%Hs%d87 z7b~ri7EFl+XQ7H@|Im1Wf19}d?J->3k8*K07P3`|Z_1|WGnjDukqWOe6cL05XWJEd9;5!zx0f<2VRg;n2nly1=aMHMsOj72T zV6*!^i5mPBbHu0k!Jt5w5^KcimJD2{u7pFH`Yuxj$Sv0H%a2b3r-2Z<5m-hQQyoi3 zopgYeA}l9#wXk^=H`~}Xytw_cG@&MLP5m<``8sV5)Bc4*0V-HNf)y`ROytJ0$GjW$ z4poToc1is>Dw`b9@-{AP{i`qsk#R8JX-EeNhI@p!$4uI!v7MhtJ9&G4M9=ODQqtk=*Q!(p z*TnCyY->_NV~*f#2+c_{J)W+e%*snY(X8)(Df~ihT%P)o^vvu`d(z97-3cLgF@YgZ z;Ksm=lnDOJ%ZwEcCo^k#ESH|~>YLLvZf-rFPHW=0@|t{m%ilzOfQgS+T?IB)`+GF> zKiuAaZiOae(3SDOz#`11^NWx@rDVtna}K@8*0kB(t!b|uIy&?7ITo3}K}U>{4vH)a zEYsE^uP_tt&SAU56bgl8s^C%!ZzbFWBR5=o9btc z)e9GmXaFvoLse(_ZIe3fF|c*XDK@7__4W`^#ghKE&!z9hO3OhWQ?qo>G$GB3wm^1q 
zvec|kL)lWlCBREy45mlld(A^L8*Z$A7663*&I|pQ?A?FE=e1;eDLgzA3{BNpY^><~ z^qHFdjCZZJ_MSTJ4;;ztx@Uq8Sow#jK8uaSL{i@ahDXAn#aWI`Y={!jz>{+t0|PuU zN2+31KNmZzy#%wc(a~0Fi}5QPQR}c-M_SvOO*vA$kYKUWcb&*u9b+VlII$K1!Xy%0P>uWnFjcN|6k58iTMtQ+M(7nf>A3v zIP@fO-*A%XNLljkBOEW2mk+X6&~omiXXzjmbUcUopc7)WV?rk=M3(bBpW_~o=iT%p^v?C7~=sp{A%W1 z5BKa$+rB3feeQ{5Hf!@jtsH8zQZdnzx@+YiRqDIOw~S4}(;>Xnx{9nrR}Ht-nE+yh zcR*A*Nx~O2lmi4x&tk_OKFZ&`Ezkmj*&0=kzf&H2Xhbb<5t>#Z^ONjNBl(>u?8}&z zBSq*<+ncQbNmqyn$ELj5$V-5dD?ME&9p+==?*%sO%k3-8vgWl0neJ`uZLI?@sIw9q zRT@8jAO1c>4V6rgeDCk|=ZXKrh9`dtCU0CFB6fOnN$m4Yznh@4v-xLCBYsl1rHa{G zBud;YRcA0OPIWSKHWC3Dn}o)wE02Zv=1^0z9v&7P7XO8&jS>J@;epmE@acY;ygUI} zz;of`O;xz*1iuLiiHR-0Cou8KJVY$D=*xI@?MUkw_*TAbxuvLHb%^Lr-jw+l6IXyo zPPJ$XePUMr(hZkG+oX!RDIXY|>f%wy9aKc|e$ZniE^8s*Vg{|f;0!9QnK{UHXxKEb zCm1(OhluUqNFsl^wsq{!i~k(3|K5|N*vBmMWs?X+{g=lUm1i@nL&Sl_S{!EDPB5a< za8|Fy0Dqt%(Zzt#kb^?%!yFK;_XVfL2O%J7{lA1_l0ZiqKz(*1&?28#kfhG=zn86S ztm#J1>COBgF)Lbk2G9+i@=L=eBlnG{gq-9FcV--C)S2B_-#7;q#MOdNP0Npc!{zdh zff@fi`g08a&pgNnCJCCgDX7G{Al(gg<9ueVDQ;xZh!XU39u}G`4pEqdX82G`fcQA_ z|H2^omKp@|Ed*MukN^=B@#jR$4-);*rKbhb(SkbWO*?{XO%_LtzMr~H2bcZfcA@&PiY z9Q{zo{G3yfhd+PBpBXhIPz?F`L;g*%J(mLbWuwc5M7RHOVg29C5B^8WCI64lrM)(a Q5e+X_`$G&2^w+8X1u%?H6#xJL literal 0 HcmV?d00001 diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..8ca9118 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,44 @@ +fileio +------- +.. automodule:: mmcv.fileio + :members: + +image +------ +.. automodule:: mmcv.image + :members: + +video +------ +.. automodule:: mmcv.video + :members: + +arraymisc +--------- +.. automodule:: mmcv.arraymisc + :members: + +visualization +-------------- +.. automodule:: mmcv.visualization + :members: + +utils +----- +.. automodule:: mmcv.utils + :members: + +cnn +---- +.. automodule:: mmcv.cnn + :members: + +runner +------ +.. automodule:: mmcv.runner + :members: + +ops +------ +.. 
automodule:: mmcv.ops + :members: diff --git a/docs/community/contributing.md b/docs/community/contributing.md new file mode 120000 index 0000000..f939e75 --- /dev/null +++ b/docs/community/contributing.md @@ -0,0 +1 @@ +../../CONTRIBUTING.md \ No newline at end of file diff --git a/docs/community/pr.md b/docs/community/pr.md new file mode 100644 index 0000000..77bdbf7 --- /dev/null +++ b/docs/community/pr.md @@ -0,0 +1,94 @@ +## Pull Request (PR) + +### What is PR + +`PR` is the abbreviation of `Pull Request`. Here's the definition of `PR` in the [official document](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) of Github. + +> Pull requests let you tell others about changes you've pushed to a branch in a repository on GitHub. Once a pull request is opened, you can discuss and review the potential changes with collaborators and add follow-up commits before your changes are merged into the base branch. + +### Basic Workflow + +1. Get the most recent codebase +2. Checkout a new branch from the master branch +3. Commit your changes +4. Push your changes and create a PR +5. Discuss and review your code +6. Merge your branch to the master branch + +### Procedures in detail + +1. Get the most recent codebase + + When you work on your first PR + - Fork the OpenMMLab repository: click the **fork** button at the top right corner of Github page + ![avatar](../_static/community/1.png) + + - Clone forked repository to local + ```bash + git clone git@github.com:XXX/mmcv.git + ``` + + - Add source repository to upstream + ```bash + git remote add upstream git@github.com:open-mmlab/mmcv + ``` + + + After your first PR + - Checkout master branch of the local repository and pull the latest master branch of the source repository + ```bash + git checkout master + git pull upstream master + ``` + +2. 
Checkout a new branch from the master branch + ```bash + git checkout -b branchname + ``` + +```{tip} +To make commit history clear, we strongly recommend you checkout the master branch before create a new branch. +``` + +3. Commit your changes + ```bash + # coding + git add [files] + git commit -m 'messages' + ``` + +4. Push your changes to the forked repository and create a PR + + Push the branch to your forked remote repository + ```bash + git push origin branchname + ``` + + + Create a PR + ![avatar](../_static/community/2.png) + + + Revise PR message template to describe your motivation and modifications made in this PR. You can also link the related issue to the PR manually in the PR message (For more information, checkout the [official guidance](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)). + +5. Discuss and review your code + + After creating a pull request, you can ask a specific person to review the changes you've proposed + ![avatar](../_static/community/3.png) + + + Modify your codes according to reviewers' suggestions and then push your changes + +6. Merge your branch to the master branch and delete the branch + ```bash + git branch -d branchname # delete local branch + git push origin --delete branchname # delete remote branch + ``` + +### PR Specs + +1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style +2. One short-time branch should be matched with only one PR +3. Accomplish a detailed change in one PR. Avoid large PR + >- Bad: Support Faster R-CNN + >- Acceptable: Add a box head to Faster R-CNN + >- Good: Add a parameter to box head to support custom conv-layer number +4. Provide clear and significant commit message +5. Provide clear and meaningful PR description + >- Task name should be clarified in title. 
The general format is: [Prefix] Short description of the PR (Suffix) + >- Prefix: add new feature [Feature], fix bug [Fix], related to documents [Docs], in developing [WIP] (which will not be reviewed temporarily) + >- Introduce main changes, results and influences on other modules in short description + >- Associate related issues and pull requests with a milestone diff --git a/docs/en/compatibility.md b/docs/compatibility.md similarity index 100% rename from docs/en/compatibility.md rename to docs/compatibility.md diff --git a/docs/zh_cn/conf.py b/docs/conf.py similarity index 62% rename from docs/zh_cn/conf.py rename to docs/conf.py index 7bfb9c2..bea4706 100644 --- a/docs/zh_cn/conf.py +++ b/docs/conf.py @@ -15,19 +15,21 @@ import os import sys import pytorch_sphinx_theme +from m2r import MdInclude +from recommonmark.transform import AutoStructify from sphinx.builders.html import StandaloneHTMLBuilder -sys.path.insert(0, os.path.abspath('../..')) +sys.path.insert(0, os.path.abspath('..')) -version_file = '../../mmcv/version.py' -with open(version_file) as f: +version_file = '../mmcv/version.py' +with open(version_file, 'r') as f: exec(compile(f.read(), version_file, 'exec')) __version__ = locals()['__version__'] # -- Project information ----------------------------------------------------- project = 'mmcv' -copyright = '2018-2022, OpenMMLab' +copyright = '2018-2021, OpenMMLab' author = 'MMCV Authors' # The short X.Y version @@ -47,8 +49,6 @@ release = __version__ extensions = [ 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', 'sphinx.ext.autosectionlabel', @@ -57,18 +57,6 @@ extensions = [ 'sphinx_copybutton', ] # yapf: disable -myst_heading_anchors = 4 - -myst_enable_extensions = ['colon_fence'] - -# Configuration for intersphinx -intersphinx_mapping = { - 'python': ('https://docs.python.org/3', None), - 'numpy': ('https://numpy.org/doc/stable', None), - 'torch': 
('https://pytorch.org/docs/stable/', None), - 'mmengine': ('https://mmengine.readthedocs.io/en/latest', None), -} - autodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision'] autosectionlabel_prefix_document = True @@ -91,7 +79,7 @@ master_doc = 'index' # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = 'zh_CN' +language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -120,9 +108,92 @@ html_theme_options = { 'name': 'GitHub', 'url': 'https://github.com/open-mmlab/mmcv' }, - ], - # Specify the language of shared menu - 'menu_lang': 'cn', + { + 'name': + 'Docs', + 'children': [ + { + 'name': 'MMCV', + 'url': 'https://mmcv.readthedocs.io/en/latest/', + }, + { + 'name': 'MIM', + 'url': 'https://openmim.readthedocs.io/en/latest/' + }, + { + 'name': 'MMAction2', + 'url': 'https://mmaction2.readthedocs.io/en/latest/', + }, + { + 'name': 'MMClassification', + 'url': + 'https://mmclassification.readthedocs.io/en/latest/', + }, + { + 'name': 'MMDetection', + 'url': 'https://mmdetection.readthedocs.io/en/latest/', + }, + { + 'name': 'MMDetection3D', + 'url': 'https://mmdetection3d.readthedocs.io/en/latest/', + }, + { + 'name': 'MMEditing', + 'url': 'https://mmediting.readthedocs.io/en/latest/', + }, + { + 'name': 'MMGeneration', + 'url': 'https://mmgeneration.readthedocs.io/en/latest/', + }, + { + 'name': 'MMOCR', + 'url': 'https://mmocr.readthedocs.io/en/latest/', + }, + { + 'name': 'MMPose', + 'url': 'https://mmpose.readthedocs.io/en/latest/', + }, + { + 'name': 'MMSegmentation', + 'url': 'https://mmsegmentation.readthedocs.io/en/latest/', + }, + { + 'name': 'MMTracking', + 'url': 'https://mmtracking.readthedocs.io/en/latest/', + }, + { + 'name': 'MMFlow', + 'url': 'https://mmflow.readthedocs.io/en/latest/', + }, + { + 'name': 'MMFewShot', + 'url': 
'https://mmfewshot.readthedocs.io/en/latest/', + }, + ] + }, + { + 'name': + 'OpenMMLab', + 'children': [ + { + 'name': 'Homepage', + 'url': 'https://openmmlab.com/' + }, + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/' + }, + { + 'name': 'Twitter', + 'url': 'https://twitter.com/OpenMMLab' + }, + { + 'name': 'Zhihu', + 'url': 'https://zhihu.com/people/openmmlab' + }, + ] + }, + ] } # Add any paths that contain custom static files (such as style sheets) here, @@ -215,3 +286,16 @@ StandaloneHTMLBuilder.supported_image_types = [ # Ignore >>> when copying code copybutton_prompt_text = r'>>> |\.\.\. ' copybutton_prompt_is_regexp = True + + +def setup(app): + app.add_config_value('no_underscore_emphasis', False, 'env') + app.add_config_value('m2r_parse_relative_links', False, 'env') + app.add_config_value('m2r_anonymous_references', False, 'env') + app.add_config_value('m2r_disable_inline_math', False, 'env') + app.add_directive('mdinclude', MdInclude) + app.add_config_value('recommonmark_config', { + 'auto_toc_tree_section': 'Contents', + 'enable_eval_rst': True, + }, True) + app.add_transform(AutoStructify) diff --git a/docs/en/deployment/mmcv_ops_definition.md b/docs/deployment/mmcv_ops_definition.md similarity index 80% rename from docs/en/deployment/mmcv_ops_definition.md rename to docs/deployment/mmcv_ops_definition.md index d7eabb3..5696316 100644 --- a/docs/en/deployment/mmcv_ops_definition.md +++ b/docs/deployment/mmcv_ops_definition.md @@ -1,10 +1,7 @@ -# MMCV Operators - -To make custom operators in MMCV more standard, precise definitions of each operator are listed in this document. 
+# Definition of custom operators in MMCV - -- [MMCV Operators](#mmcv-operators) +- [Definition of custom operators in MMCV](#definition-of-custom-operators-in-mmcv) - [MMCVBorderAlign](#mmcvborderalign) - [Description](#description) - [Parameters](#parameters) @@ -83,26 +80,25 @@ To make custom operators in MMCV more standard, precise definitions of each oper - [Inputs](#inputs-12) - [Outputs](#outputs-12) - [Type Constraints](#type-constraints-12) - - [grid_sampler\*](#grid_sampler) +- [torch](#torch) + - [grid_sampler](#grid_sampler) - [Description](#description-13) - [Parameters](#parameters-13) - [Inputs](#inputs-13) - [Outputs](#outputs-13) - [Type Constraints](#type-constraints-13) - - [cummax\*](#cummax) + - [cummax](#cummax) - [Description](#description-14) - [Parameters](#parameters-14) - [Inputs](#inputs-14) - [Outputs](#outputs-14) - [Type Constraints](#type-constraints-14) - - [cummin\*](#cummin) + - [cummin](#cummin) - [Description](#description-15) - [Parameters](#parameters-15) - [Inputs](#inputs-15) - [Outputs](#outputs-15) - [Type Constraints](#type-constraints-15) - - [Reminders](#reminders) - ## MMCVBorderAlign @@ -122,9 +118,9 @@ Read [BorderDet: Border Feature for Dense Object Detection](ttps://arxiv.org/abs ### Parameters -| Type | Parameter | Description | -| ----- | ----------- | ----------------------------------------------------------------------------------- | -| `int` | `pool_size` | number of positions sampled over the boxes' borders(e.g. top, bottom, left, right). | +| Type | Parameter | Description | +| ------- | --------------- | -------------------------------------------------------------- | +| `int` | `pool_size` | number of positions sampled over the boxes' borders(e.g. top, bottom, left, right). 
| ### Inputs @@ -156,11 +152,11 @@ Read [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.0 ### Parameters -| Type | Parameter | Description | -| ------- | -------------- | --------------------------------------------- | -| `int` | `kernel_size` | reassemble kernel size, should be odd integer | -| `int` | `group_size` | reassemble group size | -| `float` | `scale_factor` | upsample ratio(>=1) | +| Type | Parameter | Description | +| ------- | --------------- | -------------------------------------------------------------- | +| `int` | `kernel_size` | reassemble kernel size, should be odd integer| +| `int` | `group_size` | reassemble group size | +| `float` | `scale_factor` | upsample ratio(>=1) | ### Inputs @@ -191,7 +187,8 @@ Read [CCNet: Criss-Cross Attention for SemanticSegmentation](https://arxiv.org/p ### Parameters -None +| Type | Parameter | Description | +| ------- | --------------- | -------------------------------------------------------------- | ### Inputs @@ -222,7 +219,8 @@ Read [CCNet: Criss-Cross Attention for SemanticSegmentation](https://arxiv.org/p ### Parameters -None +| Type | Parameter | Description | +| ------- | --------------- | -------------------------------------------------------------- | ### Inputs @@ -244,6 +242,7 @@ None - T:tensor(float32) + ## MMCVCornerPool ### Description @@ -252,9 +251,9 @@ Perform CornerPool on `input` features. 
Read [CornerNet -- Detecting Objects as ### Parameters -| Type | Parameter | Description | -| ----- | --------- | ---------------------------------------------------------------- | -| `int` | `mode` | corner pool mode, (0: `top`, 1: `bottom`, 2: `left`, 3: `right`) | +| Type | Parameter | Description | +| ------- | --------------- | ---------------------------------------------------------------- | +| `int` | `mode` | corner pool mode, (0: `top`, 1: `bottom`, 2: `left`, 3: `right`) | ### Inputs @@ -284,15 +283,15 @@ Read [Deformable Convolutional Networks](https://arxiv.org/pdf/1703.06211.pdf) f ### Parameters -| Type | Parameter | Description | -| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------- | -| `list of ints` | `stride` | The stride of the convolving kernel, (sH, sW). Defaults to `(1, 1)`. | -| `list of ints` | `padding` | Paddings on both sides of the input, (padH, padW). Defaults to `(0, 0)`. | -| `list of ints` | `dilation` | The spacing between kernel elements (dH, dW). Defaults to `(1, 1)`. | -| `int` | `groups` | Split input into groups. `input_channel` should be divisible by the number of groups. Defaults to `1`. | -| `int` | `deformable_groups` | Groups of deformable offset. Defaults to `1`. | -| `int` | `bias` | Whether to add a learnable bias to the output. `0` stands for `False` and `1` stands for `True`. Defaults to `0`. | -| `int` | `im2col_step` | Groups of deformable offset. Defaults to `32`. | +| Type | Parameter | Description | +| -------------- | ------------------ | ------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel, (sH, sW). Defaults to `(1, 1)`. | +| `list of ints` | `padding` | Paddings on both sides of the input, (padH, padW). Defaults to `(0, 0)`. | +| `list of ints` | `dilation` | The spacing between kernel elements (dH, dW). 
Defaults to `(1, 1)`. | +| `int` | `groups` | Split input into groups. `input_channel` should be divisible by the number of groups. Defaults to `1`.| +| `int` | `deformable_groups` | Groups of deformable offset. Defaults to `1`. | +| `int` | `bias` | Whether to add a learnable bias to the output. `0` stands for `False` and `1` stands for `True`. Defaults to `0`. | +| `int` | `im2col_step` | Groups of deformable offset. Defaults to `32`. | ### Inputs @@ -324,11 +323,11 @@ Perform Modulated Deformable Convolution on input feature, read [Deformable Conv ### Parameters -| Type | Parameter | Description | -| -------------- | ------------------- | ------------------------------------------------------------------------------------- | -| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | -| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | -| `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | +| Type | Parameter | Description | +| -------------- | ------------------ | ------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | +| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | +| `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | | `int` | `deformable_groups` | Groups of deformable offset. | | `int` | `groups` | Split input into groups. `input_channel` should be divisible by the number of groups. 
| @@ -366,13 +365,13 @@ Deformable roi pooling layer ### Parameters -| Type | Parameter | Description | -| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | +| Type | Parameter | Description | +| ------- | --------------- | -------------------------------------------------------------- | | `int` | `output_height` | height of output roi | | `int` | `output_width` | width of output roi | | `float` | `spatial_scale` | used to scale the input boxes | | `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | -| `float` | `gamma` | gamma | +| `float` | `gamma` | gamma | ### Inputs @@ -405,10 +404,10 @@ Read [Pixel Recurrent Neural Networks](https://arxiv.org/abs/1601.06759) for mor ### Parameters -| Type | Parameter | Description | -| -------------- | --------- | -------------------------------------------------------------------------------- | -| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW). **Only support stride=1 in mmcv** | -| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW). Defaults to `(0, 0)`. | +| Type | Parameter | Description | +| ------- | --------------- | -------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW). **Only support stride=1 in mmcv** | +| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW). Defaults to `(0, 0)`. 
| ### Inputs @@ -444,10 +443,10 @@ Read [PSANet: Point-wise Spatial Attention Network for Scene Parsing](https://hs ### Parameters -| Type | Parameter | Description | -| -------------- | ----------- | -------------------------------------------- | -| `int` | `psa_type` | `0` means collect and `1` means `distribute` | -| `list of ints` | `mask_size` | The size of mask | +| Type | Parameter | Description | +| ------- | --------------- | -------------------------------------------------------------- | +| `int` | `psa_type` | `0` means collect and `1` means `distribute` | +| `list of ints` | `mask_size` | The size of mask | ### Inputs @@ -479,9 +478,9 @@ Note this definition is slightly different with [onnx: NonMaxSuppression](https: | Type | Parameter | Description | | ------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | -| `int` | `center_point_box` | 0 - the box data is supplied as \[y1, x1, y2, x2\], 1-the box data is supplied as \[x_center, y_center, width, height\]. | +| `int` | `center_point_box` | 0 - the box data is supplied as [y1, x1, y2, x2], 1-the box data is supplied as [x_center, y_center, width, height]. | | `int` | `max_output_boxes_per_class` | The maximum number of boxes to be selected per batch per class. Default to 0, number of output boxes equal to number of input boxes. | -| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range \[0, 1\]. Default to 0. | +| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range [0, 1]. Default to 0. | | `float` | `score_threshold` | The threshold for deciding when to remove boxes based on score. | | `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). 
| @@ -544,6 +543,7 @@ Perform RoIAlign on output feature, used in bbox_head of most two-stage detector - T:tensor(float32) + ## MMCVRoIAlignRotated ### Description @@ -552,15 +552,15 @@ Perform RoI align pooling for rotated proposals ### Parameters -| Type | Parameter | Description | -| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | +| Type | Parameter | Description | +| ------- | --------------- | -------------------------------------------------------------- | | `int` | `output_height` | height of output roi | | `int` | `output_width` | width of output roi | | `float` | `spatial_scale` | used to scale the input boxes | | `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | | `str` | `mode` | pooling mode in each bin. `avg` or `max` | | `int` | `aligned` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | -| `int` | `clockwise` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | +| `int` | `clockwise` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. 
| ### Inputs @@ -581,7 +581,9 @@ Perform RoI align pooling for rotated proposals - T:tensor(float32) -## grid_sampler\* +# torch + +## grid_sampler ### Description @@ -617,7 +619,7 @@ Check [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generate - T:tensor(float32, Linear) -## cummax\* +## cummax ### Description @@ -625,9 +627,9 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative maximum e ### Parameters -| Type | Parameter | Description | -| ----- | --------- | -------------------------------------- | -| `int` | `dim` | the dimension to do the operation over | +| Type | Parameter | Description | +| ------- | --------------- | ---------------------------------------------------------------- | +| `int` | `dim` | the dimension to do the operation over | ### Inputs @@ -649,7 +651,7 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative maximum e - T:tensor(float32) -## cummin\* +## cummin ### Description @@ -657,9 +659,9 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative minimum e ### Parameters -| Type | Parameter | Description | -| ----- | --------- | -------------------------------------- | -| `int` | `dim` | the dimension to do the operation over | +| Type | Parameter | Description | +| ------- | --------------- | ---------------------------------------------------------------- | +| `int` | `dim` | the dimension to do the operation over | ### Inputs @@ -680,7 +682,3 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative minimum e ### Type Constraints - T:tensor(float32) - -## Reminders - -- Operators endwith `*` are defined in Torch and are included here for the conversion to ONNX. 
diff --git a/docs/deployment/onnx.md b/docs/deployment/onnx.md new file mode 100644 index 0000000..be6c59c --- /dev/null +++ b/docs/deployment/onnx.md @@ -0,0 +1,19 @@ +## Introduction of onnx module in MMCV (Experimental) + +### register_extra_symbolics + +Some extra symbolic functions need to be registered before exporting PyTorch model to ONNX. + +#### Example + +```python +import mmcv +from mmcv.onnx import register_extra_symbolics + +opset_version = 11 +register_extra_symbolics(opset_version) +``` + +#### FAQs + +- None diff --git a/docs/deployment/onnxruntime_custom_ops.md b/docs/deployment/onnxruntime_custom_ops.md new file mode 100644 index 0000000..baaa576 --- /dev/null +++ b/docs/deployment/onnxruntime_custom_ops.md @@ -0,0 +1,378 @@ +## Onnxruntime Custom Ops + + + +- [Onnxruntime Custom Ops](#onnxruntime-custom-ops) + - [SoftNMS](#softnms) + - [Description](#description) + - [Parameters](#parameters) + - [Inputs](#inputs) + - [Outputs](#outputs) + - [Type Constraints](#type-constraints) + - [RoIAlign](#roialign) + - [Description](#description-1) + - [Parameters](#parameters-1) + - [Inputs](#inputs-1) + - [Outputs](#outputs-1) + - [Type Constraints](#type-constraints-1) + - [NMS](#nms) + - [Description](#description-2) + - [Parameters](#parameters-2) + - [Inputs](#inputs-2) + - [Outputs](#outputs-2) + - [Type Constraints](#type-constraints-2) + - [grid_sampler](#grid_sampler) + - [Description](#description-3) + - [Parameters](#parameters-3) + - [Inputs](#inputs-3) + - [Outputs](#outputs-3) + - [Type Constraints](#type-constraints-3) + - [CornerPool](#cornerpool) + - [Description](#description-4) + - [Parameters](#parameters-4) + - [Inputs](#inputs-4) + - [Outputs](#outputs-4) + - [Type Constraints](#type-constraints-4) + - [cummax](#cummax) + - [Description](#description-5) + - [Parameters](#parameters-5) + - [Inputs](#inputs-5) + - [Outputs](#outputs-5) + - [Type Constraints](#type-constraints-5) + - [cummin](#cummin) + - [Description](#description-6) + 
- [Parameters](#parameters-6) + - [Inputs](#inputs-6) + - [Outputs](#outputs-6) + - [Type Constraints](#type-constraints-6) + - [MMCVModulatedDeformConv2d](#mmcvmodulateddeformconv2d) + - [Description](#description-7) + - [Parameters](#parameters-7) + - [Inputs](#inputs-7) + - [Outputs](#outputs-7) + - [Type Constraints](#type-constraints-7) + - [MMCVDeformConv2d](#mmcvdeformconv2d) + - [Description](#description-8) + - [Parameters](#parameters-8) + - [Inputs](#inputs-8) + - [Outputs](#outputs-8) + - [Type Constraints](#type-constraints-8) + + + +### SoftNMS + +#### Description + +Perform soft NMS on `boxes` with `scores`. Read [Soft-NMS -- Improving Object Detection With One Line of Code](https://arxiv.org/abs/1704.04503) for detail. + +#### Parameters + +| Type | Parameter | Description | +| ------- | --------------- | -------------------------------------------------------------- | +| `float` | `iou_threshold` | IoU threshold for NMS | +| `float` | `sigma` | hyperparameter for gaussian method | +| `float` | `min_score` | score filter threshold | +| `int` | `method` | method to do the nms, (0: `naive`, 1: `linear`, 2: `gaussian`) | +| `int` | `offset` | `boxes` width or height is (x2 - x1 + offset). (0 or 1) | + +#### Inputs + +

+
boxes: T
+
Input boxes. 2-D tensor of shape (N, 4). N is the number of boxes.
+
scores: T
+
Input scores. 1-D tensor of shape (N, ).
+
+ +#### Outputs + +
+
dets: T
+
Output boxes and scores. 2-D tensor of shape (num_valid_boxes, 5), [[x1, y1, x2, y2, score], ...]. num_valid_boxes is the number of valid boxes.
+
indices: tensor(int64)
+
Output indices. 1-D tensor of shape (num_valid_boxes, ).
+
+ +#### Type Constraints + +- T:tensor(float32) + +### RoIAlign + +#### Description + +Perform RoIAlign on output feature, used in bbox_head of most two-stage detectors. + +#### Parameters + +| Type | Parameter | Description | +| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | +| `int` | `output_height` | height of output roi | +| `int` | `output_width` | width of output roi | +| `float` | `spatial_scale` | used to scale the input boxes | +| `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | +| `str` | `mode` | pooling mode in each bin. `avg` or `max` | +| `int` | `aligned` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | + +#### Inputs + +
+
input: T
+
Input feature map; 4-D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.
+
rois: T
+
RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. The RoIs' coordinates are in the coordinate system of the input.
+
+ +#### Outputs + +
+
feat: T
+
RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].
+
+ +#### Type Constraints + +- T:tensor(float32) + +### NMS + +#### Description + +Filter out boxes has high IoU overlap with previously selected boxes. + +#### Parameters + +| Type | Parameter | Description | +| ------- | --------------- | ---------------------------------------------------------------------------------------------------------------- | +| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range [0, 1]. Default to 0. | +| `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | + +#### Inputs + +
+
bboxes: T
+
Input boxes. 2-D tensor of shape (num_boxes, 4). num_boxes is the number of input boxes.
+
scores: T
+
Input scores. 1-D tensor of shape (num_boxes, ).
+
+ +#### Outputs + +
+
indices: tensor(int32, Linear)
+
Selected indices. 1-D tensor of shape (num_valid_boxes, ). num_valid_boxes is the number of valid boxes.
+
+ +#### Type Constraints + +- T:tensor(float32) + +### grid_sampler + +#### Description + +Perform sample from `input` with pixel locations from `grid`. + +#### Parameters + +| Type | Parameter | Description | +| ----- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `int` | `interpolation_mode` | Interpolation mode to calculate output values. (0: `bilinear` , 1: `nearest`) | +| `int` | `padding_mode` | Padding mode for outside grid values. (0: `zeros`, 1: `border`, 2: `reflection`) | +| `int` | `align_corners` | If `align_corners=1`, the extrema (`-1` and `1`) are considered as referring to the center points of the input's corner pixels. If `align_corners=0`, they are instead considered as referring to the corner points of the input's corner pixels, making the sampling more resolution agnostic. | + +#### Inputs + +
+
input: T
+
Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.
+
grid: T
+
Input grid; 4-D tensor of shape (N, outH, outW, 2), where outH and outW are the height and width of the grid and the output.
+
+ +#### Outputs + +
+
output: T
+
Output feature; 4-D tensor of shape (N, C, outH, outW).
+
+ +#### Type Constraints + +- T:tensor(float32, Linear) + +### CornerPool + +#### Description + +Perform CornerPool on `input` features. Read [CornerNet -- Detecting Objects as Paired Keypoints](https://arxiv.org/abs/1808.01244) for more details. + +#### Parameters + +| Type | Parameter | Description | +| ----- | --------- | ---------------------------------------------------------------- | +| `int` | `mode` | corner pool mode, (0: `top`, 1: `bottom`, 2: `left`, 3: `right`) | + +#### Inputs + +
+
input: T
+
Input features. 4-D tensor of shape (N, C, H, W). N is the batch size.
+
+ +#### Outputs + +
+
output: T
+
Output the pooled features. 4-D tensor of shape (N, C, H, W).
+
+ +#### Type Constraints + +- T:tensor(float32) + +### cummax + +#### Description + +Returns a tuple (`values`, `indices`) where `values` is the cumulative maximum elements of `input` in the dimension `dim`. And `indices` is the index location of each maximum value found in the dimension `dim`. Read [torch.cummax](https://pytorch.org/docs/stable/generated/torch.cummax.html) for more details. + +#### Parameters + +| Type | Parameter | Description | +| ----- | --------- | -------------------------------------- | +| `int` | `dim` | the dimension to do the operation over | + +#### Inputs + +
+
input: T
+
The input tensor with various shapes. Tensor with empty element is also supported.
+
+ +#### Outputs + +
+
output: T
+
Output the cumulative maximum elements of `input` in the dimension `dim`, with the same shape and dtype as `input`.
+
indices: tensor(int64)
+
Output the index location of each cumulative maximum value found in the dimension `dim`, with the same shape as `input`.
+
+ +#### Type Constraints + +- T:tensor(float32) + +### cummin + +#### Description + +Returns a tuple (`values`, `indices`) where `values` is the cumulative minimum elements of `input` in the dimension `dim`. And `indices` is the index location of each minimum value found in the dimension `dim`. Read [torch.cummin](https://pytorch.org/docs/stable/generated/torch.cummin.html) for more details. + +#### Parameters + +| Type | Parameter | Description | +| ----- | --------- | -------------------------------------- | +| `int` | `dim` | the dimension to do the operation over | + +#### Inputs + +
+
input: T
+
The input tensor with various shapes. Tensor with empty element is also supported.
+
+ +#### Outputs + +
+
output: T
+
Output the cumulative minimum elements of `input` in the dimension `dim`, with the same shape and dtype as `input`.
+
indices: tensor(int64)
+
Output the index location of each cumulative minimum value found in the dimension `dim`, with the same shape as `input`.
+
+ +#### Type Constraints + +- T:tensor(float32) + +### MMCVModulatedDeformConv2d + +#### Description + +Perform Modulated Deformable Convolution on input feature, read [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/abs/1811.11168?from=timeline) for detail. + +#### Parameters + +| Type | Parameter | Description | +| -------------- | ------------------- | ------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | +| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | +| `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | +| `int` | `deformable_groups` | Groups of deformable offset. | +| `int` | `groups` | Split input into groups. `input_channel` should be divisible by the number of groups. | + +#### Inputs + +
+
inputs[0]: T
+
Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.
+
inputs[1]: T
+
Input offset; 4-D tensor of shape (N, deformable_group* 2* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.
+
inputs[2]: T
+
Input mask; 4-D tensor of shape (N, deformable_group* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.
+
inputs[3]: T
+
Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).
+
inputs[4]: T, optional
+
Input bias; 1-D tensor of shape (output_channel).
+
+ +#### Outputs + +
+
outputs[0]: T
+
Output feature; 4-D tensor of shape (N, output_channel, outH, outW).
+
+ +#### Type Constraints + +- T:tensor(float32, Linear) + +## MMCVDeformConv2d + +### Description + +Perform Deformable Convolution on input feature, read [Deformable Convolutional Network](https://arxiv.org/abs/1703.06211) for detail. + +### Parameters + +| Type | Parameter | Description | +| -------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | +| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | +| `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | +| `int` | `deformable_group` | Groups of deformable offset. | +| `int` | `group` | Split input into groups. `input_channel` should be divisible by the number of groups. | +| `int` | `im2col_step` | DeformableConv2d use im2col to compute convolution. im2col_step is used to split input and offset, reduce memory usage of column. | + +### Inputs + +
+
inputs[0]: T
+
Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.
+
inputs[1]: T
+
Input offset; 4-D tensor of shape (N, deformable_group* 2* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.
+
inputs[2]: T
+
Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).
+
+ +### Outputs + +
+
outputs[0]: T
+
Output feature; 4-D tensor of shape (N, output_channel, outH, outW).
+
+ +### Type Constraints + +- T:tensor(float32, Linear) diff --git a/docs/deployment/onnxruntime_op.md b/docs/deployment/onnxruntime_op.md new file mode 100644 index 0000000..f17b32a --- /dev/null +++ b/docs/deployment/onnxruntime_op.md @@ -0,0 +1,126 @@ +## Custom operators for ONNX Runtime in MMCV + +### Introduction of ONNX Runtime + +**ONNX Runtime** is a cross-platform inferencing and training accelerator compatible with many popular ML/DNN frameworks. Check its [github](https://github.com/microsoft/onnxruntime) for more information. + +### Introduction of ONNX + +**ONNX** stands for **Open Neural Network Exchange**, which acts as *Intermediate Representation(IR)* for ML/DNN models from many frameworks. Check its [github](https://github.com/onnx/onnx) for more information. + +### Why include custom operators for ONNX Runtime in MMCV + +- To verify the correctness of exported ONNX models in ONNX Runtime. +- To ease the deployment of ONNX models with custom operators from `mmcv.ops` in ONNX Runtime. 
+ +### List of operators for ONNX Runtime supported in MMCV + +| Operator | CPU | GPU | MMCV Releases | +| :----------------------------------------------------: | :---: | :---: | :-----------: | +| [SoftNMS](onnxruntime_custom_ops.md#softnms) | Y | N | 1.2.3 | +| [RoIAlign](onnxruntime_custom_ops.md#roialign) | Y | N | 1.2.5 | +| [NMS](onnxruntime_custom_ops.md#nms) | Y | N | 1.2.7 | +| [grid_sampler](onnxruntime_custom_ops.md#grid_sampler) | Y | N | 1.3.1 | +| [CornerPool](onnxruntime_custom_ops.md#cornerpool) | Y | N | 1.3.4 | +| [cummax](onnxruntime_custom_ops.md#cummax) | Y | N | master | +| [cummin](onnxruntime_custom_ops.md#cummin) | Y | N | master | + +### How to build custom operators for ONNX Runtime + +*Please be noted that only **onnxruntime>=1.8.1** of CPU version on Linux platform is tested by now.* + +#### Prerequisite + +- Clone repository + +```bash +git clone https://github.com/open-mmlab/mmcv.git +``` + +- Download `onnxruntime-linux` from ONNX Runtime [releases](https://github.com/microsoft/onnxruntime/releases/tag/v1.8.1), extract it, expose `ONNXRUNTIME_DIR` and finally add the lib path to `LD_LIBRARY_PATH` as below: + +```bash +wget https://github.com/microsoft/onnxruntime/releases/download/v1.8.1/onnxruntime-linux-x64-1.8.1.tgz + +tar -zxvf onnxruntime-linux-x64-1.8.1.tgz +cd onnxruntime-linux-x64-1.8.1 +export ONNXRUNTIME_DIR=$(pwd) +export LD_LIBRARY_PATH=$ONNXRUNTIME_DIR/lib:$LD_LIBRARY_PATH +``` + +#### Build on Linux + +```bash +cd mmcv ## to MMCV root directory +MMCV_WITH_OPS=1 MMCV_WITH_ORT=1 python setup.py develop +``` + +### How to do inference using exported ONNX models with custom operators in ONNX Runtime in python + +Install ONNX Runtime with `pip` + +```bash +pip install onnxruntime==1.8.1 +``` + +Inference Demo + +```python +import os + +import numpy as np +import onnxruntime as ort + +from mmcv.ops import get_onnxruntime_op_path + +ort_custom_op_path = get_onnxruntime_op_path() +assert os.path.exists(ort_custom_op_path) 
+session_options = ort.SessionOptions() +session_options.register_custom_ops_library(ort_custom_op_path) +## exported ONNX model with custom operators +onnx_file = 'sample.onnx' +input_data = np.random.randn(1, 3, 224, 224).astype(np.float32) +sess = ort.InferenceSession(onnx_file, session_options) +onnx_results = sess.run(None, {'input' : input_data}) +``` + +### How to add a new custom operator for ONNX Runtime in MMCV + +#### Reminder + +- The custom operator is not included in [supported operator list](https://github.com/microsoft/onnxruntime/blob/master/docs/OperatorKernels.md) in ONNX Runtime. +- The custom operator should be able to be exported to ONNX. + +#### Main procedures + +Take custom operator `soft_nms` for example. + +1. Add header `soft_nms.h` to ONNX Runtime include directory `mmcv/ops/csrc/onnxruntime/` +2. Add source `soft_nms.cpp` to ONNX Runtime source directory `mmcv/ops/csrc/onnxruntime/cpu/` +3. Register `soft_nms` operator in [onnxruntime_register.cpp](../../mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp) + + ```c++ + #include "soft_nms.h" + + SoftNmsOp c_SoftNmsOp; + + if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) { + return status; + } + ``` + +4. Add unit test into `tests/test_ops/test_onnx.py` + Check [here](../../tests/test_ops/test_onnx.py) for examples. + +**Finally, welcome to send us PR of adding custom operators for ONNX Runtime in MMCV.** :nerd_face: + +### Known Issues + +- "RuntimeError: tuple appears in op that does not forward tuples, unsupported kind: `prim::PythonOp`." + 1. Note generally `cummax` or `cummin` is exportable to ONNX as long as the torch version >= 1.5.0, since `torch.cummax` is only supported with torch >= 1.5.0. But when `cummax` or `cummin` serves as an intermediate component whose outputs is used as inputs for another modules, it's expected that torch version must be >= 1.7.0. Otherwise the above error might arise, when running exported ONNX model with onnxruntime. + 2. 
Solution: update the torch version to 1.7.0 or higher. + +### References + +- [How to export Pytorch model with custom op to ONNX and run it in ONNX Runtime](https://github.com/onnx/tutorials/blob/master/PyTorchCustomOperator/README.md) +- [How to add a custom operator/kernel in ONNX Runtime](https://github.com/microsoft/onnxruntime/blob/master/docs/AddingCustomOp.md) diff --git a/docs/deployment/tensorrt_custom_ops.md b/docs/deployment/tensorrt_custom_ops.md new file mode 100644 index 0000000..be47e35 --- /dev/null +++ b/docs/deployment/tensorrt_custom_ops.md @@ -0,0 +1,395 @@ +## TensorRT Custom Ops + + + +- [TensorRT Custom Ops](#tensorrt-custom-ops) + - [MMCVRoIAlign](#mmcvroialign) + - [Description](#description) + - [Parameters](#parameters) + - [Inputs](#inputs) + - [Outputs](#outputs) + - [Type Constraints](#type-constraints) + - [ScatterND](#scatternd) + - [Description](#description-1) + - [Parameters](#parameters-1) + - [Inputs](#inputs-1) + - [Outputs](#outputs-1) + - [Type Constraints](#type-constraints-1) + - [NonMaxSuppression](#nonmaxsuppression) + - [Description](#description-2) + - [Parameters](#parameters-2) + - [Inputs](#inputs-2) + - [Outputs](#outputs-2) + - [Type Constraints](#type-constraints-2) + - [MMCVDeformConv2d](#mmcvdeformconv2d) + - [Description](#description-3) + - [Parameters](#parameters-3) + - [Inputs](#inputs-3) + - [Outputs](#outputs-3) + - [Type Constraints](#type-constraints-3) + - [grid_sampler](#grid_sampler) + - [Description](#description-4) + - [Parameters](#parameters-4) + - [Inputs](#inputs-4) + - [Outputs](#outputs-4) + - [Type Constraints](#type-constraints-4) + - [cummax](#cummax) + - [Description](#description-5) + - [Parameters](#parameters-5) + - [Inputs](#inputs-5) + - [Outputs](#outputs-5) + - [Type Constraints](#type-constraints-5) + - [cummin](#cummin) + - [Description](#description-6) + - [Parameters](#parameters-6) + - [Inputs](#inputs-6) + - [Outputs](#outputs-6) + - [Type Constraints](#type-constraints-6) + 
- [MMCVInstanceNormalization](#mmcvinstancenormalization) + - [Description](#description-7) + - [Parameters](#parameters-7) + - [Inputs](#inputs-7) + - [Outputs](#outputs-7) + - [Type Constraints](#type-constraints-7) + - [MMCVModulatedDeformConv2d](#mmcvmodulateddeformconv2d) + - [Description](#description-8) + - [Parameters](#parameters-8) + - [Inputs](#inputs-8) + - [Outputs](#outputs-8) + - [Type Constraints](#type-constraints-8) + + + +### MMCVRoIAlign + +#### Description + +Perform RoIAlign on output feature, used in bbox_head of most two stage +detectors. + +#### Parameters + +| Type | Parameter | Description | +| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | +| `int` | `output_height` | height of output roi | +| `int` | `output_width` | width of output roi | +| `float` | `spatial_scale` | used to scale the input boxes | +| `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | +| `str` | `mode` | pooling mode in each bin. `avg` or `max` | +| `int` | `aligned` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | + +#### Inputs + +
+
inputs[0]: T
+
Input feature map; 4-D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.
+
inputs[1]: T
+
RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. The RoIs' coordinates are in the coordinate system of inputs[0].
+
+ +#### Outputs + +
+
outputs[0]: T
+
RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element output[0][r-1] is a pooled feature map corresponding to the r-th RoI inputs[1][r-1].
+
+ +#### Type Constraints + +- T:tensor(float32, Linear) + +### ScatterND + +#### Description + +ScatterND takes three inputs `data` tensor of rank r >= 1, `indices` tensor of rank q >= 1, and `updates` tensor of rank q + r - indices.shape[-1] - 1. The output of the operation is produced by creating a copy of the input `data`, and then updating its value to values specified by updates at specific index positions specified by `indices`. Its output shape is the same as the shape of `data`. Note that `indices` should not have duplicate entries. That is, two or more updates for the same index-location is not supported. + +The `output` is calculated via the following equation: + +```python + output = np.copy(data) + update_indices = indices.shape[:-1] + for idx in np.ndindex(update_indices): + output[indices[idx]] = updates[idx] +``` + +#### Parameters + +None + +#### Inputs + +
+
inputs[0]: T
+
Tensor of rank r>=1.
+ +
inputs[1]: tensor(int32, Linear)
+
Tensor of rank q>=1.
+ +
inputs[2]: T
+
Tensor of rank q + r - indices_shape[-1] - 1.
+
+ +#### Outputs + +
+
outputs[0]: T
+
Tensor of rank r >= 1.
+
+ +#### Type Constraints + +- T:tensor(float32, Linear), tensor(int32, Linear) + +### NonMaxSuppression + +#### Description + +Filter out boxes has high IoU overlap with previously selected boxes or low score. Output the indices of valid boxes. Indices of invalid boxes will be filled with -1. + +#### Parameters + +| Type | Parameter | Description | +| ------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | +| `int` | `center_point_box` | 0 - the box data is supplied as [y1, x1, y2, x2], 1-the box data is supplied as [x_center, y_center, width, height]. | +| `int` | `max_output_boxes_per_class` | The maximum number of boxes to be selected per batch per class. Default to 0, number of output boxes equal to number of input boxes. | +| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range [0, 1]. Default to 0. | +| `float` | `score_threshold` | The threshold for deciding when to remove boxes based on score. | +| `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | + +#### Inputs + +
+
inputs[0]: T
+
Input boxes. 3-D tensor of shape (num_batches, spatial_dimension, 4).
+
inputs[1]: T
+
Input scores. 3-D tensor of shape (num_batches, num_classes, spatial_dimension).
+
+ +#### Outputs + +
+
outputs[0]: tensor(int32, Linear)
+
Selected indices. 2-D tensor of shape (num_selected_indices, 3) as [[batch_index, class_index, box_index], ...].
+
num_selected_indices=num_batches* num_classes* min(max_output_boxes_per_class, spatial_dimension).
+
All invalid indices will be filled with -1.
+
+ +#### Type Constraints + +- T:tensor(float32, Linear) + +### MMCVDeformConv2d + +#### Description + +Perform Deformable Convolution on input feature, read [Deformable Convolutional Network](https://arxiv.org/abs/1703.06211) for detail. + +#### Parameters + +| Type | Parameter | Description | +| -------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | +| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | +| `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | +| `int` | `deformable_group` | Groups of deformable offset. | +| `int` | `group` | Split input into groups. `input_channel` should be divisible by the number of groups. | +| `int` | `im2col_step` | DeformableConv2d use im2col to compute convolution. im2col_step is used to split input and offset, reduce memory usage of column. | + +#### Inputs + +
+
inputs[0]: T
+
Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.
+
inputs[1]: T
+
Input offset; 4-D tensor of shape (N, deformable_group * 2 * kH * kW, outH, outW), where kH and kW are the height and width of the weight, and outH and outW are the height and width of the offset and output.
+
inputs[2]: T
+
Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).
+
+ +#### Outputs + +
+
outputs[0]: T
+
Output feature; 4-D tensor of shape (N, output_channel, outH, outW).
+
+ +#### Type Constraints + +- T:tensor(float32, Linear) + +### grid_sampler + +#### Description + +Perform sample from `input` with pixel locations from `grid`. + +#### Parameters + +| Type | Parameter | Description | +| ----- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `int` | `interpolation_mode` | Interpolation mode to calculate output values. (0: `bilinear` , 1: `nearest`) | +| `int` | `padding_mode` | Padding mode for outside grid values. (0: `zeros`, 1: `border`, 2: `reflection`) | +| `int` | `align_corners` | If `align_corners=1`, the extrema (`-1` and `1`) are considered as referring to the center points of the input's corner pixels. If `align_corners=0`, they are instead considered as referring to the corner points of the input's corner pixels, making the sampling more resolution agnostic. | + +#### Inputs + +
+
inputs[0]: T
+
Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.
+
inputs[1]: T
+
Input grid; 4-D tensor of shape (N, outH, outW, 2), where outH and outW are the height and width of the grid and output.
+
+ +#### Outputs + +
+
outputs[0]: T
+
Output feature; 4-D tensor of shape (N, C, outH, outW).
+
+ +#### Type Constraints + +- T:tensor(float32, Linear) + +### cummax + +#### Description + +Returns a namedtuple (`values`, `indices`) where `values` is the cumulative maximum of elements of `input` in the dimension `dim`. And `indices` is the index location of each maximum value found in the dimension `dim`. + +#### Parameters + +| Type | Parameter | Description | +| ----- | --------- | --------------------------------------- | +| `int` | `dim` | The dimension to do the operation over. | + +#### Inputs + +
+
inputs[0]: T
+
The input tensor.
+
+ +#### Outputs + +
+
outputs[0]: T
+
Output values.
+
outputs[1]: tensor(int32, Linear)
+
Output indices.
+
+ +#### Type Constraints + +- T:tensor(float32, Linear) + +### cummin + +#### Description + +Returns a namedtuple (`values`, `indices`) where `values` is the cumulative minimum of elements of `input` in the dimension `dim`. And `indices` is the index location of each minimum value found in the dimension `dim`. + +#### Parameters + +| Type | Parameter | Description | +| ----- | --------- | --------------------------------------- | +| `int` | `dim` | The dimension to do the operation over. | + +#### Inputs + +
+
inputs[0]: T
+
The input tensor.
+
+ +#### Outputs + +
+
outputs[0]: T
+
Output values.
+
outputs[1]: tensor(int32, Linear)
+
Output indices.
+
+ +#### Type Constraints + +- T:tensor(float32, Linear) + +### MMCVInstanceNormalization + +#### Description + +Carries out instance normalization as described in the paper https://arxiv.org/abs/1607.08022. + +y = scale * (x - mean) / sqrt(variance + epsilon) + B, where mean and variance are computed per instance per channel. + +#### Parameters + +| Type | Parameter | Description | +| ------- | --------- | -------------------------------------------------------------------- | +| `float` | `epsilon` | The epsilon value to use to avoid division by zero. Default is 1e-05 | + +#### Inputs + +
+
input: T
+
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+
scale: T
+
The input 1-dimensional scale tensor of size C.
+
B: T
+
The input 1-dimensional bias tensor of size C.
+
+ +#### Outputs + +
+
output: T
+
The output tensor of the same shape as input.
+
+ +#### Type Constraints + +- T:tensor(float32, Linear) + +### MMCVModulatedDeformConv2d + +#### Description + +Perform Modulated Deformable Convolution on input feature, read [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/abs/1811.11168?from=timeline) for detail. + +#### Parameters + +| Type | Parameter | Description | +| -------------- | ------------------ | ------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | +| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | +| `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | +| `int` | `deformable_group` | Groups of deformable offset. | +| `int` | `group` | Split input into groups. `input_channel` should be divisible by the number of groups. | + +#### Inputs + +
+
inputs[0]: T
+
Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.
+
inputs[1]: T
+
Input offset; 4-D tensor of shape (N, deformable_group * 2 * kH * kW, outH, outW), where kH and kW are the height and width of the weight, and outH and outW are the height and width of the offset and output.
+
inputs[2]: T
+
Input mask; 4-D tensor of shape (N, deformable_group * kH * kW, outH, outW), where kH and kW are the height and width of the weight, and outH and outW are the height and width of the offset and output.
+
inputs[3]: T
+
Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).
+
inputs[4]: T, optional
+
Input bias; 1-D tensor of shape (output_channel).
+
+ +#### Outputs + +
+
outputs[0]: T
+
Output feature; 4-D tensor of shape (N, output_channel, outH, outW).
+
+ +#### Type Constraints + +- T:tensor(float32, Linear) diff --git a/docs/deployment/tensorrt_plugin.md b/docs/deployment/tensorrt_plugin.md new file mode 100644 index 0000000..cd8924e --- /dev/null +++ b/docs/deployment/tensorrt_plugin.md @@ -0,0 +1,178 @@ +## TensorRT Plugins for custom operators in MMCV (Experimental) + + + +- [TensorRT Plugins for custom operators in MMCV (Experimental)](#tensorrt-plugins-for-custom-operators-in-mmcv-experimental) + - [Introduction](#introduction) + - [List of TensorRT plugins supported in MMCV](#list-of-tensorrt-plugins-supported-in-mmcv) + - [How to build TensorRT plugins in MMCV](#how-to-build-tensorrt-plugins-in-mmcv) + - [Prerequisite](#prerequisite) + - [Build on Linux](#build-on-linux) + - [Create TensorRT engine and run inference in python](#create-tensorrt-engine-and-run-inference-in-python) + - [How to add a TensorRT plugin for custom op in MMCV](#how-to-add-a-tensorrt-plugin-for-custom-op-in-mmcv) + - [Main procedures](#main-procedures) + - [Reminders](#reminders) + - [Known Issues](#known-issues) + - [References](#references) + + + +### Introduction + +**NVIDIA TensorRT** is a software development kit(SDK) for high-performance inference of deep learning models. It includes a deep learning inference optimizer and runtime that delivers low latency and high-throughput for deep learning inference applications. Please check its [developer's website](https://developer.nvidia.com/tensorrt) for more information. +To ease the deployment of trained models with custom operators from `mmcv.ops` using TensorRT, a series of TensorRT plugins are included in MMCV. 
+ +### List of TensorRT plugins supported in MMCV + +| ONNX Operator | TensorRT Plugin | MMCV Releases | +| :-----------------------: | :-----------------------------------------------------------------------------: | :-----------: | +| MMCVRoiAlign | [MMCVRoiAlign](./tensorrt_custom_ops.md#mmcvroialign) | 1.2.6 | +| ScatterND | [ScatterND](./tensorrt_custom_ops.md#scatternd) | 1.2.6 | +| NonMaxSuppression | [NonMaxSuppression](./tensorrt_custom_ops.md#nonmaxsuppression) | 1.3.0 | +| MMCVDeformConv2d | [MMCVDeformConv2d](./tensorrt_custom_ops.md#mmcvdeformconv2d) | 1.3.0 | +| grid_sampler | [grid_sampler](./tensorrt_custom_ops.md#grid-sampler) | 1.3.1 | +| cummax | [cummax](./tensorrt_custom_ops.md#cummax) | 1.3.5 | +| cummin | [cummin](./tensorrt_custom_ops.md#cummin) | 1.3.5 | +| MMCVInstanceNormalization | [MMCVInstanceNormalization](./tensorrt_custom_ops.md#mmcvinstancenormalization) | 1.3.5 | +| MMCVModulatedDeformConv2d | [MMCVModulatedDeformConv2d](./tensorrt_custom_ops.md#mmcvmodulateddeformconv2d) | master | + +Notes + +- All plugins listed above are developed on TensorRT-7.2.1.6.Ubuntu-16.04.x86_64-gnu.cuda-10.2.cudnn8.0 + +### How to build TensorRT plugins in MMCV + +#### Prerequisite + +- Clone repository + +```bash +git clone https://github.com/open-mmlab/mmcv.git +``` + +- Install TensorRT + +Download the corresponding TensorRT build from [NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-tensorrt-download). + +For example, for Ubuntu 16.04 on x86-64 with cuda-10.2, the downloaded file is `TensorRT-7.2.1.6.Ubuntu-16.04.x86_64-gnu.cuda-10.2.cudnn8.0.tar.gz`. 
+ +Then, install as below: + +```bash +cd ~/Downloads +tar -xvzf TensorRT-7.2.1.6.Ubuntu-16.04.x86_64-gnu.cuda-10.2.cudnn8.0.tar.gz +export TENSORRT_DIR=`pwd`/TensorRT-7.2.1.6 +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TENSORRT_DIR/lib +``` + +Install python packages: tensorrt, graphsurgeon, onnx-graphsurgeon + +```bash +pip install $TENSORRT_DIR/python/tensorrt-7.2.1.6-cp37-none-linux_x86_64.whl +pip install $TENSORRT_DIR/onnx_graphsurgeon/onnx_graphsurgeon-0.2.6-py2.py3-none-any.whl +pip install $TENSORRT_DIR/graphsurgeon/graphsurgeon-0.4.5-py2.py3-none-any.whl +``` + +For more detailed information of installing TensorRT using tar, please refer to [Nvidia' website](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-721/install-guide/index.html#installing-tar). + +#### Build on Linux + +```bash +cd mmcv ## to MMCV root directory +MMCV_WITH_OPS=1 MMCV_WITH_TRT=1 pip install -e . +``` + +### Create TensorRT engine and run inference in python + +Here is an example. + +```python +import torch +import onnx + +from mmcv.tensorrt import (TRTWrapper, onnx2trt, save_trt_engine, + is_tensorrt_plugin_loaded) + +assert is_tensorrt_plugin_loaded(), 'Requires to complie TensorRT plugins in mmcv' + +onnx_file = 'sample.onnx' +trt_file = 'sample.trt' +onnx_model = onnx.load(onnx_file) + +## Model input +inputs = torch.rand(1, 3, 224, 224).cuda() +## Model input shape info +opt_shape_dict = { + 'input': [list(inputs.shape), + list(inputs.shape), + list(inputs.shape)] +} + +## Create TensorRT engine +max_workspace_size = 1 << 30 +trt_engine = onnx2trt( + onnx_model, + opt_shape_dict, + max_workspace_size=max_workspace_size) + +## Save TensorRT engine +save_trt_engine(trt_engine, trt_file) + +## Run inference with TensorRT +trt_model = TRTWrapper(trt_file, ['input'], ['output']) + +with torch.no_grad(): + trt_outputs = trt_model({'input': inputs}) + output = trt_outputs['output'] + +``` + +### How to add a TensorRT plugin for custom op in MMCV + +#### Main procedures + 
+Below are the main steps: + +1. Add c++ header file +2. Add c++ source file +3. Add cuda kernel file +4. Register plugin in `trt_plugin.cpp` +5. Add unit test in `tests/test_ops/test_tensorrt.py` + +**Take RoIAlign plugin `roi_align` for example.** + +1. Add header `trt_roi_align.hpp` to TensorRT include directory `mmcv/ops/csrc/tensorrt/` +2. Add source `trt_roi_align.cpp` to TensorRT source directory `mmcv/ops/csrc/tensorrt/plugins/` +3. Add cuda kernel `trt_roi_align_kernel.cu` to TensorRT source directory `mmcv/ops/csrc/tensorrt/plugins/` +4. Register `roi_align` plugin in [trt_plugin.cpp](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/tensorrt/plugins/trt_plugin.cpp) + + ```c++ + #include "trt_plugin.hpp" + + #include "trt_roi_align.hpp" + + REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); + + extern "C" { + bool initLibMMCVInferPlugins() { return true; } + } // extern "C" + ``` + +5. Add unit test into `tests/test_ops/test_tensorrt.py` + Check [here](https://github.com/open-mmlab/mmcv/blob/master/tests/test_ops/test_tensorrt.py) for examples. + +#### Reminders + +- Some of the [custom ops](https://mmcv.readthedocs.io/en/latest/ops.html) in `mmcv` have their cuda implementations, which could be referred. 
+ +### Known Issues + +- None + +### References + +- [Developer guide of Nvidia TensorRT](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html) +- [TensorRT Open Source Software](https://github.com/NVIDIA/TensorRT) +- [onnx-tensorrt](https://github.com/onnx/onnx-tensorrt) +- [TensorRT python API](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/index.html) +- [TensorRT c++ plugin API](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_plugin.html) diff --git a/docs/en/_static/version.json b/docs/en/_static/version.json deleted file mode 100644 index 7ee4965..0000000 --- a/docs/en/_static/version.json +++ /dev/null @@ -1,575 +0,0 @@ -{ - "Linux": [ - { - "cuda": "11.7", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "11.6", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "11.6", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.5", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.0", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - 
"2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.1", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.1", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.1", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "9.2", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "9.2", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "cpu", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - } - ], - "Windows": [ - { - "cuda": "11.7", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "11.6", - "torch": "1.13.x", - 
"mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "11.6", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.5", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "10.2", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.1", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.1", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "10.1", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "cpu", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", 
- "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - } - ], - "macOS": [ - { - "cuda": "cpu", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "mps", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "cpu", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - } - ] -} diff --git a/docs/en/_templates/classtemplate.rst b/docs/en/_templates/classtemplate.rst deleted file mode 100644 index 4f74842..0000000 --- a/docs/en/_templates/classtemplate.rst +++ /dev/null @@ -1,14 +0,0 @@ -.. role:: hidden - :class: hidden-section -.. currentmodule:: {{ module }} - - -{{ name | underline}} - -.. autoclass:: {{ name }} - :members: - - -.. - autogenerated from source/_templates/classtemplate.rst - note it does not have :inherited-members: diff --git a/docs/en/api/arraymisc.rst b/docs/en/api/arraymisc.rst deleted file mode 100644 index 28975eb..0000000 --- a/docs/en/api/arraymisc.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.arraymisc -=================================== - -.. 
contents:: mmcv.arraymisc - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.arraymisc - -.. autosummary:: - :toctree: generated - :nosignatures: - - quantize - dequantize diff --git a/docs/en/api/cnn.rst b/docs/en/api/cnn.rst deleted file mode 100644 index 022191f..0000000 --- a/docs/en/api/cnn.rst +++ /dev/null @@ -1,71 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.cnn -=================================== - -.. contents:: mmcv.cnn - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.cnn - -Module ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - ContextBlock - Conv2d - Conv3d - ConvAWS2d - ConvModule - ConvTranspose2d - ConvTranspose3d - ConvWS2d - DepthwiseSeparableConvModule - GeneralizedAttention - HSigmoid - HSwish - LayerScale - Linear - MaxPool2d - MaxPool3d - NonLocal1d - NonLocal2d - NonLocal3d - Scale - Swish - Conv2dRFSearchOp - -Build Function ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - build_activation_layer - build_conv_layer - build_norm_layer - build_padding_layer - build_plugin_layer - build_upsample_layer - -Miscellaneous ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - fuse_conv_bn - conv_ws_2d - is_norm - make_res_layer - make_vgg_layer - get_model_complexity_info diff --git a/docs/en/api/image.rst b/docs/en/api/image.rst deleted file mode 100644 index 3b93484..0000000 --- a/docs/en/api/image.rst +++ /dev/null @@ -1,100 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.image -=================================== - -.. contents:: mmcv.image - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.image - -IO ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - imfrombytes - imread - imwrite - use_backend - -Color Space ----------------- - -.. 
autosummary:: - :toctree: generated - :nosignatures: - - bgr2gray - bgr2hls - bgr2hsv - bgr2rgb - bgr2ycbcr - gray2bgr - gray2rgb - hls2bgr - hsv2bgr - imconvert - rgb2bgr - rgb2gray - rgb2ycbcr - ycbcr2bgr - ycbcr2rgb - -Geometric ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - cutout - imcrop - imflip - impad - impad_to_multiple - imrescale - imresize - imresize_like - imresize_to_multiple - imrotate - imshear - imtranslate - rescale_size - -Photometric ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - adjust_brightness - adjust_color - adjust_contrast - adjust_hue - adjust_lighting - adjust_sharpness - auto_contrast - clahe - imdenormalize - imequalize - iminvert - imnormalize - lut_transform - posterize - solarize - -Miscellaneous ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - tensor2imgs diff --git a/docs/en/api/ops.rst b/docs/en/api/ops.rst deleted file mode 100644 index b029045..0000000 --- a/docs/en/api/ops.rst +++ /dev/null @@ -1,135 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.ops -=================================== - -.. contents:: mmcv.ops - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.ops - -.. 
autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - BorderAlign - CARAFE - CARAFENaive - CARAFEPack - Conv2d - ConvTranspose2d - CornerPool - Correlation - CrissCrossAttention - DeformConv2d - DeformConv2dPack - DeformRoIPool - DeformRoIPoolPack - DynamicScatter - FusedBiasLeakyReLU - GroupAll - Linear - MaskedConv2d - MaxPool2d - ModulatedDeformConv2d - ModulatedDeformConv2dPack - ModulatedDeformRoIPoolPack - MultiScaleDeformableAttention - PSAMask - PointsSampler - PrRoIPool - QueryAndGroup - RiRoIAlignRotated - RoIAlign - RoIAlignRotated - RoIAwarePool3d - RoIPointPool3d - RoIPool - SAConv2d - SigmoidFocalLoss - SimpleRoIAlign - SoftmaxFocalLoss - SparseConv2d - SparseConv3d - SparseConvTensor - SparseConvTranspose2d - SparseConvTranspose3d - SparseInverseConv2d - SparseInverseConv3d - SparseMaxPool2d - SparseMaxPool3d - SparseModule - SparseSequential - SubMConv2d - SubMConv3d - SyncBatchNorm - TINShift - Voxelization - -.. autosummary:: - :toctree: generated - :nosignatures: - - active_rotated_filter - assign_score_withk - ball_query - batched_nms - bbox_overlaps - border_align - box_iou_rotated - boxes_iou3d - boxes_iou_bev - boxes_overlap_bev - carafe - carafe_naive - chamfer_distance - contour_expand - convex_giou - convex_iou - deform_conv2d - deform_roi_pool - diff_iou_rotated_2d - diff_iou_rotated_3d - dynamic_scatter - furthest_point_sample - furthest_point_sample_with_dist - fused_bias_leakyrelu - gather_points - grouping_operation - knn - masked_conv2d - min_area_polygons - modulated_deform_conv2d - nms - nms3d - nms3d_normal - nms_bev - nms_match - nms_normal_bev - nms_rotated - pixel_group - point_sample - points_in_boxes_all - points_in_boxes_cpu - points_in_boxes_part - points_in_polygons - prroi_pool - rel_roi_point_to_rel_img_point - riroi_align_rotated - roi_align - roi_align_rotated - roi_pool - rotated_feature_align - scatter_nd - sigmoid_focal_loss - soft_nms - softmax_focal_loss - three_interpolate - 
three_nn - tin_shift - upfirdn2d - voxelization diff --git a/docs/en/api/transforms.rst b/docs/en/api/transforms.rst deleted file mode 100644 index b080133..0000000 --- a/docs/en/api/transforms.rst +++ /dev/null @@ -1,60 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.transforms -=================================== - -.. currentmodule:: mmcv.transforms - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - BaseTransform - TestTimeAug - -Loading ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - LoadAnnotations - LoadImageFromFile - -Processing ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - CenterCrop - MultiScaleFlipAug - Normalize - Pad - RandomChoiceResize - RandomFlip - RandomGrayscale - RandomResize - Resize - ToTensor - ImageToTensor - -Wrapper ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - Compose - KeyMapper - RandomApply - RandomChoice - TransformBroadcaster diff --git a/docs/en/api/utils.rst b/docs/en/api/utils.rst deleted file mode 100644 index f2ff4c2..0000000 --- a/docs/en/api/utils.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.utils -=================================== - -.. contents:: mmcv.utils - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.utils - -.. autosummary:: - :toctree: generated - :nosignatures: - - IS_CUDA_AVAILABLE - IS_MLU_AVAILABLE - IS_MPS_AVAILABLE - collect_env - jit - skip_no_elena diff --git a/docs/en/api/video.rst b/docs/en/api/video.rst deleted file mode 100644 index a6ebca0..0000000 --- a/docs/en/api/video.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.video -=================================== - -.. contents:: mmcv.video - :depth: 2 - :local: - :backlinks: top - -.. 
currentmodule:: mmcv.video - -IO ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - VideoReader - Cache - -.. autosummary:: - :toctree: generated - :nosignatures: - - frames2video - -Optical Flow ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - dequantize_flow - flow_from_bytes - flow_warp - flowread - flowwrite - quantize_flow - sparse_flow_from_bytes - -Video Processing ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - concat_video - convert_video - cut_video - resize_video diff --git a/docs/en/api/visualization.rst b/docs/en/api/visualization.rst deleted file mode 100644 index 8f43ef2..0000000 --- a/docs/en/api/visualization.rst +++ /dev/null @@ -1,50 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.visualization -=================================== - -.. contents:: mmcv.visualization - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.visualization - -Color ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - Color - -.. autosummary:: - :toctree: generated - :nosignatures: - - color_val - -Image ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - imshow - imshow_bboxes - imshow_det_bboxes - -Optical Flow ----------------- - -.. 
autosummary:: - :toctree: generated - :nosignatures: - - flow2rgb - flowshow - make_color_wheel diff --git a/docs/en/community/contributing.md b/docs/en/community/contributing.md deleted file mode 100644 index e339935..0000000 --- a/docs/en/community/contributing.md +++ /dev/null @@ -1,267 +0,0 @@ -## Contributing to OpenMMLab - -Welcome to the MMCV community, we are committed to building a cutting-edge computer vision foundational library and all kinds of contributions are welcomed, including but not limited to - -**Fix bug** - -You can directly post a Pull Request to fix typo in code or documents - -The steps to fix the bug of code implementation are as follows. - -1. If the modification involve significant changes, you should create an issue first and describe the error information and how to trigger the bug. Other developers will discuss with you and propose an proper solution. - -2. Posting a pull request after fixing the bug and adding corresponding unit test. - -**New Feature or Enhancement** - -1. If the modification involve significant changes, you should create an issue to discuss with our developers to propose an proper design. -2. Post a Pull Request after implementing the new feature or enhancement and add corresponding unit test. - -**Document** - -You can directly post a pull request to fix documents. If you want to add a document, you should first create an issue to check if it is reasonable. - -### Pull Request Workflow - -If you're not familiar with Pull Request, don't worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the develop mode of Pull Request, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) - -#### 1. 
Fork and clone - -If you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile. - - - -Then, you can clone the repositories to local: - -```shell -git clone git@github.com:{username}/mmcv.git -``` - -After that, you should ddd official repository as the upstream repository - -```bash -git remote add upstream git@github.com:open-mmlab/mmcv -``` - -Check whether remote repository has been added successfully by `git remote -v` - -```bash -origin git@github.com:{username}/mmcv.git (fetch) -origin git@github.com:{username}/mmcv.git (push) -upstream git@github.com:open-mmlab/mmcv (fetch) -upstream git@github.com:open-mmlab/mmcv (push) -``` - -```{note} -Here's a brief introduction to origin and upstream. When we use "git clone", we create an "origin" remote by default, which points to the repository cloned from. As for "upstream", we add it ourselves to point to the target repository. Of course, if you don't like the name "upstream", you could name it as you wish. Usually, we'll push the code to "origin". If the pushed code conflicts with the latest code in official("upstream"), we should pull the latest code from upstream to resolve the conflicts, and then push to "origin" again. The posted Pull Request will be updated automatically. -``` - -#### 2. Configure pre-commit - -You should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of OpenMMLab. **Note**: The following code should be executed under the MMCV directory. - -```shell -pip install -U pre-commit -pre-commit install -``` - -Check that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`. 
- -```shell -pre-commit run --all-files -``` - - - - - -```{note} -Chinese users may fail to download the pre-commit hooks due to the network issue. In this case, you could download these hooks from gitee by setting the .pre-commit-config-zh-cn.yaml - -pre-commit install -c .pre-commit-config-zh-cn.yaml -pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml -``` - -If the installation process is interrupted, you can repeatedly run `pre-commit run ... ` to continue the installation. - -If the code does not conform to the code style specification, pre-commit will raise a warning and fixes some of the errors automatically. - - - -If we want to commit our code bypassing the pre-commit hook, we can use the `--no-verify` option(**only for temporarily commit**. - -```shell -git commit -m "xxx" --no-verify -``` - -#### 3. Create a development branch - -After configuring the pre-commit, we should create a branch based on the master branch to develop the new feature or fix the bug. The proposed branch name is `username/pr_name` - -```shell -git checkout -b yhc/refactor_contributing_doc -``` - -In subsequent development, if the master branch of the local repository is behind the master branch of "upstream", we need to pull the upstream for synchronization, and then execute the above command: - -```shell -git pull upstream master -``` - -#### 4. Commit the code and pass the unit test - -- MMCV introduces mypy to do static type checking to increase the robustness of the code. Therefore, we need to add Type Hints to our code and pass the mypy check. If you are not familiar with Type Hints, you can refer to [this tutorial](https://docs.python.org/3/library/typing.html). 
- -- The committed code should pass through the unit test - - ```shell - # Pass all unit tests - pytest tests - - # Pass the unit test of runner - pytest tests/test_runner/test_runner.py - ``` - - If the unit test fails for lack of dependencies, you can install the dependencies referring to the [guidance](#unit-test) - -- If the documents are modified/added, we should check the rendering result referring to [guidance](#document-rendering) - -#### 5. Push the code to remote - -We could push the local commits to remote after passing through the check of unit test and pre-commit. You can associate the local branch with remote branch by adding `-u` option. - -```shell -git push -u origin {branch_name} -``` - -This will allow you to use the `git push` command to push code directly next time, without having to specify a branch or the remote repository. - -#### 6. Create a Pull Request - -(1) Create a pull request in GitHub's Pull request interface - - - -(2) Modify the PR description according to the guidelines so that other developers can better understand your changes - - - -Find more details about Pull Request description in [pull request guidelines](#pr-specs). - -**note** - -(a) The Pull Request description should contain the reason for the change, the content of the change, and the impact of the change, and be associated with the relevant Issue (see [documentation](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) - -(b) If it is your first contribution, please sign the CLA - - - -(c) Check whether the Pull Request pass through the CI - - - -MMCV will run unit test for the posted Pull Request on different platforms (Linux, Window, Mac), based on different versions of Python, PyTorch, CUDA to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code. 
- -(3) If the Pull Request passes the CI, then you can wait for the review from other developers. You'll modify the code based on the reviewer's comments, and repeat the steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. Then, we will merge it ASAP. - - - -#### 7. Resolve conflicts - -If your local branch conflicts with the latest master branch of "upstream", you'll need to resolove them. There are two ways to do this: - -```shell -git fetch --all --prune -git rebase upstream/master -``` - -or - -```shell -git fetch --all --prune -git merge upstream/master -``` - -If you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. If you are not familiar with `rebase`, then you can use `merge` to resolve conflicts. - -### Guidance - -#### Unit test - -If you cannot run the unit test of some modules for lacking of some dependencies, such as [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) module, you can try to install the following dependencies: - -```shell -# Linux -sudo apt-get update -y -sudo apt-get install -y libturbojpeg -sudo apt-get install -y ffmpeg - -# Windows -conda install ffmpeg -``` - -We should also make sure the committed code will not decrease the coverage of unit test, we could run the following command to check the coverage of unit test: - -```shell -python -m coverage run -m pytest /path/to/test_file -python -m coverage html -# check file in htmlcov/index.html -``` - -#### Document rendering - -If the documents are modified/added, we should check the rendering result. 
We could install the dependencies and run the following command to render the documents and check the results: - -```shell -pip install -r requirements/docs.txt -cd docs/zh_cn/ -# or docs/en -make html -# check file in ./docs/zh_cn/_build/html/index.html -``` - -### Code style - -#### Python - -We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. - -We use the following tools for linting and formatting: - -- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools. -- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports. -- [yapf](https://github.com/google/yapf): A formatter for Python files. -- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files. -- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. -- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. - -Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg). - -We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, -fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirments.txt` automatically on every commit. -The config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml). - -#### C++ and CUDA - -We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). - -### PR Specs - -1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style - -2. One short-time branch should be matched with only one PR - -3. Accomplish a detailed change in one PR. 
Avoid large PR - - - Bad: Support Faster R-CNN - - Acceptable: Add a box head to Faster R-CNN - - Good: Add a parameter to box head to support custom conv-layer number - -4. Provide clear and significant commit message - -5. Provide clear and meaningful PR description - - - Task name should be clarified in title. The general format is: \[Prefix\] Short description of the PR (Suffix) - - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily) - - Introduce main changes, results and influences on other modules in short description - - Associate related issues and pull requests with a milestone diff --git a/docs/en/community/pr.md b/docs/en/community/pr.md deleted file mode 100644 index 1bdd90f..0000000 --- a/docs/en/community/pr.md +++ /dev/null @@ -1,3 +0,0 @@ -## Pull Request (PR) - -Content has been migrated to [contributing guidance](contributing.md). diff --git a/docs/en/docutils.conf b/docs/en/docutils.conf deleted file mode 100644 index 0c00c84..0000000 --- a/docs/en/docutils.conf +++ /dev/null @@ -1,2 +0,0 @@ -[html writers] -table_style: colwidths-auto diff --git a/docs/en/faq.md b/docs/en/faq.md deleted file mode 100644 index 02d31c2..0000000 --- a/docs/en/faq.md +++ /dev/null @@ -1,93 +0,0 @@ -## Frequently Asked Questions - -We list some common troubles faced by many users and their corresponding solutions here. -Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. - -### Installation - -- KeyError: "xxx: 'yyy is not in the zzz registry'" - - The registry mechanism will be triggered only when the file of the module is imported. - So you need to import that file somewhere. More details can be found at [KeyError: "MaskRCNN: 'RefineRoIHead is not in the models registry'"](https://github.com/open-mmlab/mmdetection/issues/5974). - -- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'" - - 1. 
Uninstall existing mmcv in the environment using `pip uninstall mmcv` - 2. Install mmcv-full following the [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) or [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html) - -- "invalid device function" or "no kernel image is available for execution" - - 1. Check the CUDA compute capability of you GPU - 2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built for the correct GPU architecture. You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV. The compatibility issue could happen when using old GPUS, e.g., Tesla K80 (3.7) on colab. - 3. Check whether the running environment is the same as that when mmcv/mmdet is compiled. For example, you may compile mmcv using CUDA 10.0 bug run it on CUDA9.0 environments - -- "undefined symbol" or "cannot open xxx.so" - - 1. If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check - whether the CUDA/GCC runtimes are the same as those used for compiling mmcv - 2. If those symbols are Pytorch symbols (e.g., symbols containing caffe, aten, and TH), check whether the Pytorch version is the same as that used for compiling mmcv - 3. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built by and running on the same environment - -- "RuntimeError: CUDA error: invalid configuration argument" - - This error may be caused by the poor performance of GPU. Try to decrease the value of [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) - and recompile mmcv. - -- "RuntimeError: nms is not compiled with GPU support" - - This error is because your CUDA environment is not installed correctly. - You may try to re-install your CUDA environment and then delete the build/ folder before re-compile mmcv. 
- -- "Segmentation fault" - - 1. Check your GCC version and use GCC >= 5.4. This usually caused by the incompatibility between PyTorch and the environment (e.g., GCC \< 4.9 for PyTorch). We also recommend the users to avoid using GCC 5.5 because many feedbacks report that GCC 5.5 will cause "segmentation fault" and simply changing it to GCC 5.4 could solve the problem - 2. Check whether PyTorch is correctly installed and could use CUDA op, e.g. type the following command in your terminal and see whether they could correctly output results - ```shell - python -c 'import torch; print(torch.cuda.is_available())' - ``` - 3. If PyTorch is correctly installed, check whether MMCV is correctly installed. If MMCV is correctly installed, then there will be no issue of the command - ```shell - python -c 'import mmcv; import mmcv.ops' - ``` - 4. If MMCV and PyTorch are correctly installed, you can use `ipdb` to set breakpoints or directly add `print` to debug and see which part leads the `segmentation fault` - -- "libtorch_cuda_cu.so: cannot open shared object file" - - `mmcv-full` depends on the share object but it can not be found. We can check whether the object exists in `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` or try to re-install the PyTorch. - -- "fatal error C1189: #error: -- unsupported Microsoft Visual Studio version!" - - If you are building mmcv-full on Windows and the version of CUDA is 9.2, you will probably encounter the error `"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\crt/host_config.h(133): fatal error C1189: #error: -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!"`, in which case you can use a lower version of Microsoft Visual Studio like vs2017. 
- -- "error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized" - - If your version of PyTorch is 1.5.0 and you are building mmcv-full on Windows, you will probably encounter the error `- torch/csrc/jit/api/module.h(474): error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized`. The way to solve the error is to replace all the `static constexpr bool all_slots = false;` with `static bool all_slots = false;` at this file `https://github.com/pytorch/pytorch/blob/v1.5.0/torch/csrc/jit/api/module.h`. More details can be found at [member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized](https://github.com/pytorch/pytorch/issues/39394). - -- "error: a member with an in-class initializer must be const" - - If your version of PyTorch is 1.6.0 and you are building mmcv-full on Windows, you will probably encounter the error `"- torch/include\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const"`. The way to solve the error is to replace all the `CONSTEXPR_EXCEPT_WIN_CUDA ` with `const` at `torch/include\torch/csrc/jit/api/module.h`. More details can be found at [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575). - -- "error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized" - - If your version of PyTorch is 1.7.0 and you are building mmcv-full on Windows, you will probably encounter the error `torch/include\torch/csrc/jit/ir/ir.h(1347): error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized`. 
The way to solve the error needs to modify several local files of PyTorch: - - - delete `static constexpr Symbol Kind = ::c10::prim::profile;` and `tatic constexpr Symbol Kind = ::c10::prim::profile_optional;` at `torch/include\torch/csrc/jit/ir/ir.h` - - replace `explicit operator type&() { return *(this->value); }` with `explicit operator type&() { return *((type*)this->value); }` at `torch\include\pybind11\cast.h` - - replace all the `CONSTEXPR_EXCEPT_WIN_CUDA` with `const` at `torch/include\torch/csrc/jit/api/module.h` - - More details can be found at [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956). - -- Compatibility issue between MMCV and MMDetection; "ConvWS is already registered in conv layer" - - Please install the correct version of MMCV for the version of your MMDetection following the [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation). - -### Usage - -- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one" - - 1. This error indicates that your module has parameters that were not used in producing loss. This phenomenon may be caused by running different branches in your code in DDP mode. More datails at [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582). - 2. You can set ` find_unused_parameters = True` in the config to solve the above problems or find those unused parameters manually - -- "RuntimeError: Trying to backward through the graph a second time" - - `GradientCumulativeOptimizerHook` and `OptimizerHook` are both set which causes the `loss.backward()` to be called twice so `RuntimeError` was raised. We can only use one of these. More datails at [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379). 
diff --git a/docs/en/get_started/build.md b/docs/en/get_started/build.md deleted file mode 100644 index e3d48ec..0000000 --- a/docs/en/get_started/build.md +++ /dev/null @@ -1,292 +0,0 @@ -## Build MMCV from source - -### Build mmcv - -Before installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://pytorch.org/get-started/locally/#start-locally). This can be verified using the following command - -```bash -python -c 'import torch;print(torch.__version__)' -``` - -If version information is output, then PyTorch is installed. - -```{note} -If you would like to use `opencv-python-headless` instead of `opencv-python`, -e.g., in a minimum container environment or servers without GUI, -you can first install it before installing MMCV to skip the installation of `opencv-python`. -``` - -#### Build on Linux - -1. Clone the repo - - ```bash - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - ``` - -2. Install `ninja` and `psutil` to speed up the compilation - - ```bash - pip install -r requirements/optional.txt - ``` - -3. Check the nvcc version (requires 9.2+. Skip if no GPU available.) - - ```bash - nvcc --version - ``` - - If the above command outputs the following message, it means that the nvcc setting is OK, otherwise you need to set CUDA_HOME. - - ``` - nvcc: NVIDIA (R) Cuda compiler driver - Copyright (c) 2005-2020 NVIDIA Corporation - Built on Mon_Nov_30_19:08:53_PST_2020 - Cuda compilation tools, release 11.2, V11.2.67 - Build cuda_11.2.r11.2/compiler.29373293_0 - ``` - - :::{note} - If you want to support ROCm, you can refer to [AMD ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) to install ROCm. - ::: - -4. Check the gcc version (requires 5.4+) - - ```bash - gcc --version - ``` - -5. Start building (takes 10+ min) - - ```bash - pip install -e . -v - ``` - -6. 
Validate the installation - - ```bash - python .dev_scripts/check_installation.py - ``` - - If no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution. - - If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues). - -#### Build on macOS - -```{note} -If you are using a mac with apple silicon chip, install the PyTorch 1.13+, otherwise you will encounter the problem in [issues#2218](https://github.com/open-mmlab/mmcv/issues/2218). -``` - -1. Clone the repo - - ```bash - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - ``` - -2. Install `ninja` and `psutil` to speed up the compilation - - ```bash - pip install -r requirements/optional.txt - ``` - -3. Start building - - ```bash - MMCV_WITH_OPS=1 pip install -e . - ``` - -4. Validate the installation - - ```bash - python .dev_scripts/check_installation.py - ``` - - If no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution. - - If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues). - -#### Build on Windows - -Building MMCV on Windows is a bit more complicated than that on Linux. -The following instructions show how to get this accomplished. - -##### Prerequisite - -The following software is required for building MMCV on windows. -Install them first. - -- [Git](https://git-scm.com/download/win) - - During installation, tick **add git to Path**. -- [Visual Studio Community 2019](https://visualstudio.microsoft.com) - - A compiler for C++ and CUDA codes. -- [Miniconda](https://docs.conda.io/en/latest/miniconda.html) - - Official distributions of Python should work too. 
-- [CUDA 10.2](https://developer.nvidia.com/cuda-10.2-download-archive) - - Not required for building CPU version. - - Customize the installation if necessary. As a recommendation, skip the driver installation if a newer version is already installed. - -```{note} -You should know how to set up environment variables, especially `Path`, on Windows. The following instruction relies heavily on this skill. -``` - -##### Common steps - -1. Launch Anaconda prompt from Windows Start menu - - Do not use raw `cmd.exe` s instruction is based on PowerShell syntax. - -2. Create a new conda environment - - ```powershell - (base) PS C:\Users\xxx> conda create --name mmcv python=3.7 - (base) PS C:\Users\xxx> conda activate mmcv # make sure to activate environment before any operation - ``` - -3. Install PyTorch. Choose a version based on your need. - - ```powershell - # CUDA version - (mmcv) PS C:\Users\xxx> conda install pytorch torchvision cudatoolkit=10.2 -c pytorch - # CPU version - (mmcv) PS C:\Users\xxx> conda install install pytorch torchvision cpuonly -c pytorch - ``` - -4. Clone the repo - - ```powershell - (mmcv) PS C:\Users\xxx> git clone https://github.com/open-mmlab/mmcv.git - (mmcv) PS C:\Users\xxx\mmcv> cd mmcv - ``` - -5. Install `ninja` and `psutil` to speed up the compilation - - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> pip install -r requirements/optional.txt - ``` - -6. Set up MSVC compiler - - Set Environment variable, add `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` to `PATH`, so that `cl.exe` will be available in prompt, as shown below. - - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> cl - Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 - Copyright (C) Microsoft Corporation. All rights reserved. - - usage: cl [ option... ] filename... [ / link linkoption... ] - ``` - - For compatibility, we use the x86-hosted and x64-targeted compiler. note `Hostx86\x64` in the path. 
- - You may want to change the system language to English because pytorch will parse text output from `cl.exe` to check its version. However only utf-8 is recognized. Navigate to Control Panel -> Region -> Administrative -> Language for Non-Unicode programs and change it to English. - -##### Build and install MMCV - -mmcv can be built in two ways: - -1. Full version (CPU ops) - - Module `ops` will be compiled as a pytorch extension, but only x86 code will be compiled. The compiled ops can be executed on CPU only. - -2. Full version (CUDA ops) - - Both x86 and CUDA codes of `ops` module will be compiled. The compiled version can be run on both CPU and CUDA-enabled GPU (if implemented). - -###### CPU version - -Build and install - -```powershell -(mmcv) PS C:\Users\xxx\mmcv> python setup.py build_ext -(mmcv) PS C:\Users\xxx\mmcv> python setup.py develop -``` - -###### GPU version - -1. Make sure `CUDA_PATH` or `CUDA_HOME` is already set in `envs` via `ls env:`, desired output is shown as below: - - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> ls env: - - Name Value - ---- ----- - CUDA_PATH C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 - CUDA_PATH_V10_1 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 - CUDA_PATH_V10_2 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 - ``` - - This should already be done by CUDA installer. If not, or you have multiple version of CUDA toolkit installed, set it with - - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" - # OR - (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # if CUDA_PATH_V10_2 is in envs: - ``` - -2. Set CUDA target arch - - ```shell - # Here you need to change to the target architecture corresponding to your GPU - (mmcv) PS C:\Users\xxx\mmcv> $env:TORCH_CUDA_ARCH_LIST="7.5" - ``` - - :::{note} - Check your the compute capability of your GPU from [here](https://developer.nvidia.com/cuda-gpus). 
- - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> &"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\demo_suite\deviceQuery.exe" - Device 0: "NVIDIA GeForce GTX 1660 SUPER" - CUDA Driver Version / Runtime Version 11.7 / 11.1 - CUDA Capability Major/Minor version number: 7.5 - ``` - - The 7.5 above indicates the target architecture. Note: You need to replace v10.2 with your CUDA version in the above command. - ::: - -3. Build and install - - ```powershell - # build - python setup.py build_ext # if success, cl will be launched to compile ops - # install - python setup.py develop - ``` - - ```{note} - If you are compiling against PyTorch 1.6.0, you might meet some errors from PyTorch as described in [this issue](https://github.com/pytorch/pytorch/issues/42467). Follow [this pull request](https://github.com/pytorch/pytorch/pull/43380/files) to modify the source code in your local PyTorch installation. - ``` - -##### Validate installation - -```powershell -(mmcv) PS C:\Users\xxx\mmcv> python .dev_scripts/check_installation.py -``` - -If no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution. -If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues). - -### Build mmcv-lite - -If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation). - -1. Clone the repo - - ```bash - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - ``` - -2. Start building - - ```bash - MMCV_WITH_OPS=0 pip install -e . -v - ``` - -3. 
Validate installation - - ```bash - python -c 'import mmcv;print(mmcv.__version__)' - ``` diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md deleted file mode 100644 index 12bad00..0000000 --- a/docs/en/get_started/installation.md +++ /dev/null @@ -1,348 +0,0 @@ -## Installation - -There are two versions of MMCV: - -- **mmcv**: comprehensive, with full features and various CUDA ops out of box. It takes longer time to build. -- **mmcv-lite**: lite, without CUDA ops but all other features, similar to mmcv\<1.0.0. It is useful when you do not need those CUDA ops. - -```{warning} -Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is avaliable`. -``` - -### Install mmcv - -Before installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://pytorch.org/get-started/locally/#start-locally). This can be verified using the following command - -```bash -python -c 'import torch;print(torch.__version__)' -``` - -If version information is output, then PyTorch is installed. - -#### Install with mim (recommended) - -[mim](https://github.com/open-mmlab/mim) is the package management tool for the OpenMMLab projects, which makes it easy to install mmcv - -```bash -pip install -U openmim -mim install "mmcv>=2.0.0rc1" -``` - -If you find that the above installation command does not use a pre-built package ending with `.whl` but a source package ending with `.tar.gz`, you may not have a pre-build package corresponding to the PyTorch or CUDA or mmcv version, in which case you can [build mmcv from source](build.md). - -
-Installation log using pre-built packages - -Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
-Collecting mmcv
-Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0rc3-cp38-cp38-manylinux1_x86_64.whl - -
- -
-Installation log using source packages - -Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
-Collecting mmcv==2.0.0rc3
-Downloading mmcv-2.0.0rc3.tar.gz - -
- -To install a specific version of mmcv, for example, mmcv version 2.0.0rc3, you can use the following command - -```bash -mim install mmcv==2.0.0rc3 -``` - -:::{note} -If you would like to use `opencv-python-headless` instead of `opencv-python`, -e.g., in a minimum container environment or servers without GUI, -you can first install it before installing MMCV to skip the installation of `opencv-python`. - -Alternatively, if it takes too long to install a dependency library, you can specify the pypi source - -```bash -mim install "mmcv>=2.0.0rc3" -i https://pypi.tuna.tsinghua.edu.cn/simple -``` - -::: - -You can run [check_installation.py](https://github.com/open-mmlab/mmcv/blob/2.x/.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. - -#### Install with pip - -Use the following command to check the version of CUDA and PyTorch - -```bash -python -c 'import torch;print(torch.__version__);print(torch.version.cuda)' -``` - -Select the appropriate installation command depending on the type of system, CUDA version, PyTorch version, and MMCV version - - - - -
- - - - -
-

-
-
-
-
-If you do not find a corresponding version in the dropdown box above, you probably do not have a pre-built package corresponding to the PyTorch or CUDA or mmcv version, at which point you can [build mmcv from source](build.md).
-
-:::{note}
-mmcv is only compiled on PyTorch 1.x.0 because the compatibility
-usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you
-can install mmcv compiled with PyTorch 1.x.0 and it usually works well.
-For example, if your PyTorch version is 1.8.1, you can feel free to choose 1.8.x.
-:::
-
-:::{note}
-If you would like to use `opencv-python-headless` instead of `opencv-python`,
-e.g., in a minimum container environment or servers without GUI,
-you can first install it before installing MMCV to skip the installation of `opencv-python`.
-
-Alternatively, if it takes too long to install a dependency library, you can specify the pypi source
-
-```bash
-mim install "mmcv>=2.0.0rc1" -i https://pypi.tuna.tsinghua.edu.cn/simple
-```
-
-:::
-
-You can run [check_installation.py](https://github.com/open-mmlab/mmcv/blob/2.x/.dev_scripts/check_installation.py) to check the installation of mmcv after running the installation commands.
-
-#### Using mmcv with Docker
-
-Build with local repository
-
-```bash
-git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
-docker build -t mmcv -f docker/release/Dockerfile .
-```
-
-Or build with remote repository
-
-```bash
-docker build -t mmcv https://github.com/open-mmlab/mmcv.git#2.x:docker/release
-```
-
-The [Dockerfile](release/Dockerfile) installs the latest released version of mmcv by default, but you can specify an mmcv version to install the expected one.
-
-```bash
-docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0rc1 .
-```
-
-If you want to use other versions of PyTorch and CUDA, you can also specify them when building docker images.
-
-An example to build an image with PyTorch 1.11 and CUDA 11.3.
-
-```bash
-docker build -t mmcv -f docker/release/Dockerfile \
-    --build-arg PYTORCH=1.11.0 \
-    --build-arg CUDA=11.3 \
-    --build-arg CUDNN=8 \
-    --build-arg MMCV=2.0.0rc1 .
-```
-
-More available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags).
-
-### Install mmcv-lite
-
-If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation).
-
-```bash
-pip install mmcv-lite
-```
diff --git a/docs/en/get_started/introduction.md b/docs/en/get_started/introduction.md
deleted file mode 100644
index 461fcc7..0000000
--- a/docs/en/get_started/introduction.md
+++ /dev/null
@@ -1,36 +0,0 @@
-## Introduction
-
-MMCV is a foundational library for computer vision research and provides the following functionalities.
-
-- [Image/Video processing](../understand_mmcv/data_process.md)
-- [Image and annotation visualization](../understand_mmcv/visualization.md)
-- [Image transformation](../understand_mmcv/data_transform.md)
-- [Various CNN architectures](../understand_mmcv/cnn.md)
-- [High-quality implementation of common CUDA ops](../understand_mmcv/ops.md)
-
-It supports the following systems:
-
-- Linux
-- Windows
-- macOS
-
-It supports many research projects as below:
-
-- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
-- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
-- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
-- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
-- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.
-- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
-- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
-- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
-- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
-- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
-- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
-- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
-- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.
-- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
-- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.
-- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox.
-- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.
-- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.
diff --git a/docs/en/switch_language.md b/docs/en/switch_language.md
deleted file mode 100644
index 9dc7b34..0000000
--- a/docs/en/switch_language.md
+++ /dev/null
@@ -1,3 +0,0 @@
-## English
-
-## 简体中文
diff --git a/docs/en/understand_mmcv/cnn.md b/docs/en/understand_mmcv/cnn.md
deleted file mode 100644
index 2c42f25..0000000
--- a/docs/en/understand_mmcv/cnn.md
+++ /dev/null
@@ -1,120 +0,0 @@
-## CNN
-
-We provide some building bricks for CNNs, including layer building, module bundles and weight initialization.
-
-### Layer building
-
-We may need to try different layers of the same type when running experiments,
-but do not want to modify the code from time to time.
-Here we provide some layer building methods to construct layers from a dict,
-which can be written in configs or specified via command line arguments.
-
-#### Usage
-
-A simplest example is
-
-```python
-from mmcv.cnn import build_conv_layer
-
-cfg = dict(type='Conv3d')
-layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3)
-```
-
-- `build_conv_layer`: Supported types are Conv1d, Conv2d, Conv3d, Conv (alias for Conv2d).
-- `build_norm_layer`: Supported types are BN1d, BN2d, BN3d, BN (alias for BN2d), SyncBN, GN, LN, IN1d, IN2d, IN3d, IN (alias for IN2d).
-- `build_activation_layer`: Supported types are ReLU, LeakyReLU, PReLU, RReLU, ReLU6, ELU, Sigmoid, Tanh, GELU.
-- `build_upsample_layer`: Supported types are nearest, bilinear, deconv, pixel_shuffle.
-- `build_padding_layer`: Supported types are zero, reflect, replicate.
-
-#### Extension
-
-We also allow extending the building methods with custom layers and operators.
-
-1. Write and register your own module.
-
-   ```python
-   from mmengine.registry import MODELS
-
-   @MODELS.register_module()
-   class MyUpsample:
-
-       def __init__(self, scale_factor):
-           pass
-
-       def forward(self, x):
-           pass
-   ```
-
-2. Import `MyUpsample` somewhere (e.g., in `__init__.py`) and then use it.
-
-   ```python
-   from mmcv.cnn import build_upsample_layer
-
-   cfg = dict(type='MyUpsample', scale_factor=2)
-   layer = build_upsample_layer(cfg)
-   ```
-
-### Module bundles
-
-We also provide common module bundles to facilitate the network construction.
-`ConvModule` is a bundle of convolution, normalization and activation layers,
-please refer to the [api](api.html#mmcv.cnn.ConvModule) for details.
-
-```python
-from mmcv.cnn import ConvModule
-
-# conv + bn + relu
-conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))
-# conv + gn + relu
-conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2))
-# conv + relu
-conv = ConvModule(3, 8, 2)
-# conv
-conv = ConvModule(3, 8, 2, act_cfg=None)
-# conv + leaky relu
-conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU'))
-# bn + conv + relu
-conv = ConvModule(
-    3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act'))
-```
-
-### Model Zoo
-
-Besides torchvision pre-trained models, we also provide pre-trained models of following CNN:
-
-- VGG Caffe
-- ResNet Caffe
-- ResNeXt
-- ResNet with Group Normalization
-- ResNet with Group Normalization and Weight Standardization
-- HRNetV2
-- Res2Net
-- RegNet
-
-#### Model URLs in JSON
-
-The model zoo links in MMCV are managed by JSON files.
-The json file consists of key-value pair of model name and its url or path.
-An example json file could be like:
-
-```json
-{
-    "model_a": "https://example.com/models/model_a_9e5bac.pth",
-    "model_b": "pretrain/model_b_ab3ef2c.pth"
-}
-```
-
-The default links of the pre-trained models hosted on OpenMMLab AWS could be found [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json).
-
-You may override default links by putting `open-mmlab.json` under `MMCV_HOME`. If `MMCV_HOME` is not found in your environment, `~/.cache/mmcv` will be used by default. You may use your own path with `export MMCV_HOME=/your/path`.
-
-The external json files will be merged into default one. If the same key presents in both external json and default json, the external one will be used.
-
-#### Load Checkpoint
-
-The following types are supported for `filename` of `mmcv.load_checkpoint()`.
-
-- filepath: The filepath of the checkpoint.
-- `http://xxx` and `https://xxx`: The link to download the checkpoint. The `SHA256` postfix should be contained in the filename.
-- `torchvision://xxx`: The model links in `torchvision.models`. Please refer to [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) for details.
-- `open-mmlab://xxx`: The model links or filepath provided in default and additional json files.
diff --git a/docs/en/understand_mmcv/data_transform.md b/docs/en/understand_mmcv/data_transform.md
deleted file mode 100644
index 64c3af9..0000000
--- a/docs/en/understand_mmcv/data_transform.md
+++ /dev/null
@@ -1,341 +0,0 @@
-# Data Transformation
-
-In the OpenMMLab algorithm library, dataset construction and data preparation are decoupled. Usually, the construction of the dataset only parses the dataset and records the basic information of each sample, while the data preparation is a series of data transformations including data loading, preprocessing, formatting, and other operations performed according to the basic information of the sample.
-
-## Design of data transformation
-
-In MMCV, we use various callable data transformation classes to manipulate data. These data transformation classes can accept several configuration parameters for the instantiation and then process the input data dictionary by `__call__` method. All data transformation methods accept a dictionary as the input and produce the output as a dictionary as well. A simple example is as follows:
-
-```python
->>> import numpy as np
->>> from mmcv.transforms import Resize
->>>
->>> transform = Resize(scale=(224, 224))
->>> data_dict = {'img': np.random.rand(256, 256, 3)}
->>> data_dict = transform(data_dict)
->>> print(data_dict['img'].shape)
-(224, 224, 3)
-```
-
-The data transformation class reads some fields of the input dictionary and may add or update some fields. The keys of these fields are mostly fixed. For example, `Resize` will always read fields such as `"img"` in the input dictionary. More information about the conventions for input and output fields could be found in the documentation of the corresponding class.
-
-```{note}
-By convention, the order of image shape which is used as **initialization parameters** in data transformation (such as Resize, Pad) is (width, height). In the dictionary returned by the data transformation, the image related shape, such as `img_shape`, `ori_shape`, `pad_shape`, etc., is (height, width).
-```
-
-MMCV provides a unified base class called `BaseTransform` for all data transformation classes:
-
-```python
-class BaseTransform(metaclass=ABCMeta):
-
-    def __call__(self, results: dict) -> dict:
-
-        return self.transform(results)
-
-    @abstractmethod
-    def transform(self, results: dict) -> dict:
-        pass
-```
-
-All data transformation classes must inherit `BaseTransform` and implement the `transform` method. Both the input and output of the `transform` method are a dictionary. In the **Custom data transformation class** section, we will describe how to implement a data transformation class in more detail.
-
-## Data pipeline
-
-As mentioned above, the inputs and outputs of all data transformations are dictionaries. Moreover, according to the \[Convention on Datasets\] (TODO) in OpenMMLab, the basic information of each sample in the dataset is also a dictionary. This way, we can connect all data transformation operations end to end and combine them into a data pipeline. This pipeline inputs the information dictionary of the samples in the dataset and outputs the information dictionary after a series of processing.
-
-Taking the classification task as an example, we show a typical data pipeline in the figure below. For each sample, the information stored in the dataset is a dictionary, as shown on the far left in the figure. After each data transformation operation represented by the blue block, a new field (marked in green) will be added to the data dictionary or an existing field (marked in orange) will be updated.
-
-
- -
- -The data pipeline is a list of several data transformation configuration dictionaries in the configuration file. Each dataset needs to set the parameter `pipeline` to define the data preparation operations the dataset needs to perform. The configuration of the above data pipeline in the configuration file is as follows: - -```python -pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='Resize', size=256, keep_ratio=True), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), - dict(type='ClsFormatBundle') -] - -dataset = dict( - ... - pipeline=pipeline, - ... -) -``` - -## Common data transformation classes - -The commonly used data transformation classes can be roughly divided into data loading, data preprocessing and augmentation, and data formatting. In MMCV, we provide some commonly used classes as follows: - -### Data loading - -To support the loading of large-scale datasets, data is usually not loaded when `Dataset` is initialized. Only the corresponding path is loaded. Therefore, it is necessary to load specific data in the data pipeline. - -| Class | Feature | -| :-------------------------: | :--------------------------------------------: | -| [`LoadImageFromFile`](TODO) | Load from file path | -| [`LoadAnnotations`](TODO) | Load and organize the annotations (bbox, etc.) | - -### Data preprocessing and enhancement - -Data preprocessing and augmentation usually involve transforming the image itself, such as cropping, padding, scaling, etc. 
- -| Class | Feature | -| :------------------------------: | :----------------------------------------------------: | -| [`Pad`](TODO) | Padding | -| [`CenterCrop`](TODO) | Center crop | -| [`Normalize`](TODO) | Image normalization | -| [`Resize`](TODO) | Resize to the specified size or ratio | -| [`RandomResize`](TODO) | Scale the image randomly within the specified range | -| [`RandomMultiscaleResize`](TODO) | Scale the image to a random size from multiple options | -| [`RandomGrayscale`](TODO) | Random grayscale | -| [`RandomFlip`](TODO) | Random flip | -| [`MultiScaleFlipAug`](TODO) | Support scaling and flipping during the testing | - -### Data formatting - -Data formatting operations are type conversions performed on the data. - -| Class | Feature | -| :---------------------: | :------------------------------------------: | -| [`ToTensor`](TODO) | Convert the specified data to `torch.Tensor` | -| [`ImageToTensor`](TODO) | Convert the image to `torch.Tensor` | - -## Customize data transformation classes - -To implement a new data transformation class, you must inherit `BaseTransform` and implement the `transform` method. Here, we use a simple flip transform (`MyFlip`) as an example: - -```python -import random -import mmcv -from mmcv.transforms import BaseTransform, TRANSFORMS - -@TRANSFORMS.register_module() -class MyFlip(BaseTransform): - def __init__(self, direction: str): - super().__init__() - self.direction = direction - - def transform(self, results: dict) -> dict: - img = results['img'] - results['img'] = mmcv.imflip(img, direction=self.direction) - return results -``` - -Now, we can instantiate `MyFlip` as a callable object to handle our data dictionary. - -```python -import numpy as np - -transform = MyFlip(direction='horizontal') -data_dict = {'img': np.random.rand(224, 224, 3)} -data_dict = transform(data_dict) -processed_img = data_dict['img'] -``` - -Alternatively, use `MyFlip` transform in the `pipeline` of the config file. 
- -```python -pipeline = [ - ... - dict(type='MyFlip', direction='horizontal'), - ... -] -``` - -It should be noted that if you want to use it in the configuration file, you must ensure that the file where the `MyFlip` class is located can be imported at the runtime. - -## Transform wrapper - -Transform wrappers are a special class of data transformations. They do not operate on images, labels or other information in the data dictionary by themselves. Instead, they enhance the behavior of data transformations defined in them. - -### KeyMapper - -`KeyMapper` is used to map fields in the data dictionary. For example, image processing transforms usually get their values from the `"img"` field in the data dictionary. But sometimes we want these transforms to handle images in other fields in the data dictionary, such as the `"gt_img"` field. - -When used with registry and configuration file, the field map wrapper should be used as follows: - -```python -pipeline = [ - ... - dict(type='KeyMapper', - mapping={ - 'img': 'gt_img', # map "gt_img" to "img" - 'mask': ..., # The "mask" field in the raw data is not used. That is, for wrapped data transformations, the "mask" field is not included in the data - }, - auto_remap=True, # remap "img" back to "gt_img" after the transformation - transforms=[ - # only need to specify "img" in `RandomFlip` - dict(type='RandomFlip'), - ]) - ... -] -``` - -With `KeyMapper`, we don't need to consider various possible input field names in the `transform` method when we implement the data transformation class. We only need to deal with the default fields. - -### RandomChoice and RandomApply - -`RandomChoice` is used to randomly select a data transformation pipeline from the given choices. With this wrapper, we can easily implement some data augmentation functions, such as AutoAugment. - -In configuration file, you can use `RandomChoice` as follows: - -```python -pipeline = [ - ... 
- dict(type='RandomChoice', - transforms=[ - [ - dict(type='Posterize', bits=4), - dict(type='Rotate', angle=30.) - ], # the first combo option - [ - dict(type='Equalize'), - dict(type='Rotate', angle=30) - ], # the second combo option - ], - prob=[0.4, 0.6] # the prob of each combo - ) - ... -] -``` - -`RandomApply` is used to randomly perform a combination of data transformations with a specified probability. For example: - -```python -pipeline = [ - ... - dict(type='RandomApply', - transforms=[dict(type='Rotate', angle=30.)], - prob=0.3) # perform the transformation with prob as 0.3 - ... -] -``` - -### TransformBroadcaster - -Usually, a data transformation class only reads the target of an operation from one field. While we can also use `KeyMapper` to change the fields read, there is no way to apply transformations to the data of multiple fields at once. To achieve this, we need to use the multi-target extension wrapper `TransformBroadcaster`. - -`TransformBroadcaster` has two uses, one is to apply data transformation to multiple specified fields, and the other is to apply data transformation to a group of targets under a field. - -1. Apply to multiple fields - - Suppose we need to apply a data transformation to images in two fields `"lq"` (low-quality) and `"gt"` (ground-truth). 
- - ```python - pipeline = [ - dict(type='TransformBroadcaster', - # apply to the "lq" and "gt" fields respectively, and set the "img" field to both - mapping={'img': ['lq', 'gt']}, - # remap the "img" field back to the original field after the transformation - auto_remap=True, - # whether to share random variables in the transformation of each target - # more introduction will be referred in the following chapters (random variable sharing) - share_random_params=True, - transforms=[ - # only need to manipulate the "img" field in the `RandomFlip` class - dict(type='RandomFlip'), - ]) - ] - ``` - - In the `mapping` setting of the multi-target extension, we can also use `...` to ignore the specified original field. As shown in the following example, the wrapped `RandomCrop` will crop the image in the field `"img"` and update the size of the cropped image if the field `"img_shape"` exists. If we want to do the same random cropping for both image fields `"lq"` and `"gt"` at the same time but update the `"img_shape"` field only once, we can do it as in the example: - - ```python - pipeline = [ - dict(type='TransformBroadcaster', - mapping={ - 'img': ['lq', 'gt'], - 'img_shape': ['img_shape', ...], - }, - # remap the "img" and "img_shape" fields back to their original fields after the transformation - auto_remap=True, - # whether to share random variables in the transformation of each target - # more introduction will be referred in the following chapters (random variable sharing) - share_random_params=True, - transforms=[ - # "img" and "img_shape" fields are manipulated in the `RandomCrop` class - # if "img_shape" is missing, only operate on "img" - dict(type='RandomCrop'), - ]) - ] - ``` - -2. A set of targets applied to a field - - Suppose we need to apply a data transformation to the `"images"` field, which is a list of images. 
- - ```python - pipeline = [ - dict(type='TransformBroadcaster', - # map each image under the "images" field to the "img" field - mapping={'img': 'images'}, - # remap the images under the "img" field back to the list in the "images" field after the transformation - auto_remap=True, - # whether to share random variables in the transformation of each target - share_random_params=True, - transforms=[ - # in the `RandomFlip` transformation class, we only need to manipulate the "img" field - dict(type='RandomFlip'), - ]) - ] - ``` - -#### Decorator `cache_randomness` - -In `TransformBroadcaster`, we provide the `share_random_params` option to support sharing random states across multiple data transformations. For example, in a super-resolution task, we want to apply **the same** random transformations **simultaneously** to the low-resolution image and the original image. If we use this function in a custom data transformation class, we need to mark which random variables support sharing in the class. This can be achieved with the decorator `cache_randomness`. - -Taking `MyFlip` from the above example, we want to perform flipping randomly with a certain probability: - -```python -from mmcv.transforms.utils import cache_randomness - -@TRANSFORMS.register_module() -class MyRandomFlip(BaseTransform): - def __init__(self, prob: float, direction: str): - super().__init__() - self.prob = prob - self.direction = direction - - @cache_randomness # label the output of the method as a shareable random variable - def do_flip(self): - flip = True if random.random() > self.prob else False - return flip - - def transform(self, results: dict) -> dict: - img = results['img'] - if self.do_flip(): - results['img'] = mmcv.imflip(img, direction=self.direction) - return results -``` - -In the above example, we decorate the `do_flip` method with `cache_randomness`, marking the method return value `flip` as a random variable that supports sharing. 
Therefore, in the transformation of `TransformBroadcaster` to multiple targets, the value of this variable will remain the same. - -#### Decorator `avoid_cache_randomness` - -In some cases, we cannot separate the process of generating random variables in data transformation into a class method. For example, modules from third-party libraries used in data transformation encapsulate the relevant parts of random variables inside, making them impossible to be extracted as class methods for data transformation. Such data transformations cannot support shared random variables through the decorator `cache_randomness` annotation, and thus cannot share random variables during multi-objective expansion. - -To avoid misuse of such data transformations in multi-object extensions, we provide another decorator, `avoid_cache_randomness`, to mark such data transformations: - -```python -from mmcv.transforms.utils import avoid_cache_randomness - -@TRANSFORMS.register_module() -@avoid_cache_randomness -class MyRandomTransform(BaseTransform): - - def transform(self, results: dict) -> dict: - ... -``` - -Data transformation classes marked with `avoid_cache_randomness` will throw an exception when their instance is wrapped by `TransformBroadcaster` and the parameter `share_random_params` is set to True. This reminds the user not to use it in this way. - -There are a few things to keep in mind when using `avoid_cache_randomness`: - -1. `avoid_cache_randomness` is only used to decorate data transformation classes (subclasses of `BaseTransfrom`) and cannot be used to decorate other general classes, class methods, or functions -2. When a data transformation decorated with `avoid_cache_randomness` is used as a base class, its subclasses **will not inherit** its feature. If the subclass is still unable to share random variables, `avoid_cache_randomness` should be used again. -3. 
A data transformation needs to be modified with `avoid_cache_randomness` only when a data transformation is random and cannot share its random parameters. Data transformations without randomness require no decoration diff --git a/docs/en/understand_mmcv/ops.md b/docs/en/understand_mmcv/ops.md deleted file mode 100644 index e60f77c..0000000 --- a/docs/en/understand_mmcv/ops.md +++ /dev/null @@ -1,66 +0,0 @@ -## ops - -We implement common ops used in detection, segmentation, etc. - -| Device | CPU | CUDA | MLU | MPS | Ascend | -| ---------------------------- | --- | ---- | --- | --- | ------ | -| ActiveRotatedFilter | √ | √ | | | | -| AssignScoreWithK | | √ | | | | -| BallQuery | | √ | | | | -| BBoxOverlaps | | √ | √ | √ | √ | -| BorderAlign | | √ | | | | -| BoxIouRotated | √ | √ | | | | -| BoxIouQuadri | √ | √ | | | | -| CARAFE | | √ | √ | | | -| ChamferDistance | | √ | | | | -| CrissCrossAttention | | √ | | | | -| ContourExpand | √ | | | | | -| ConvexIoU | | √ | | | | -| CornerPool | | √ | | | | -| Correlation | | √ | | | | -| Deformable Convolution v1/v2 | √ | √ | | | √ | -| Deformable RoIPool | | √ | √ | | √ | -| DiffIoURotated | | √ | | | | -| DynamicScatter | | √ | | | | -| FurthestPointSample | | √ | | | | -| FurthestPointSampleWithDist | | √ | | | | -| FusedBiasLeakyrelu | | √ | | | √ | -| GatherPoints | | √ | | | √ | -| GroupPoints | | √ | | | | -| Iou3d | | √ | √ | | | -| KNN | | √ | | | | -| MaskedConv | | √ | √ | | √ | -| MergeCells | | √ | | | | -| MinAreaPolygon | | √ | | | | -| ModulatedDeformConv2d | √ | √ | | | √ | -| MultiScaleDeformableAttn | | √ | √ | | | -| NMS | √ | √ | √ | | √ | -| NMSRotated | √ | √ | | | √ | -| NMSQuadri | √ | √ | | | | -| PixelGroup | √ | | | | | -| PointsInBoxes | √ | √ | | | | -| PointsInPolygons | | √ | | | | -| PSAMask | √ | √ | √ | | √ | -| RotatedFeatureAlign | √ | √ | | | | -| RoIPointPool3d | | √ | √ | | | -| RoIPool | | √ | √ | | √ | -| RoIAlignRotated | √ | √ | √ | | | -| RiRoIAlignRotated | | √ | | | | -| RoIAlign 
| √ | √ | √ | | |
-| RoIAwarePool3d | | √ | √ | | |
-| SAConv2d | | √ | | | |
-| SigmoidFocalLoss | | √ | √ | | √ |
-| SoftmaxFocalLoss | | √ | | | √ |
-| SoftNMS | | √ | | | |
-| Sparse Convolution | | √ | | | |
-| Synchronized BatchNorm | | √ | | | |
-| ThreeInterpolate | | √ | | | |
-| ThreeNN | | √ | √ | | |
-| TINShift | | √ | √ | | |
-| UpFirDn2d | | √ | | | |
-| Voxelization | √ | √ | | | √ |
-| PrRoIPool | | √ | | | |
-| BezierAlign | √ | √ | | | |
-| BiasAct | | √ | | | |
-| FilteredLrelu | | √ | | | |
-| Conv2dGradfix | | √ | | | |
diff --git a/docs/faq.md b/docs/faq.md
new file mode 100644
index 0000000..ab0dd13
--- /dev/null
+++ b/docs/faq.md
@@ -0,0 +1,42 @@
+## Frequently Asked Questions
+
+We list some common troubles faced by many users and their corresponding solutions here.
+Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them.
+
+- Compatibility issue between MMCV and MMDetection; "ConvWS is already registered in conv layer"
+
+  Please install the correct version of MMCV for the version of your MMDetection following the instruction above.
+
+- "No module named 'mmcv.ops'"; "No module named 'mmcv._ext'".
+
+  1. Uninstall existing mmcv in the environment using `pip uninstall mmcv`.
+  2. Install mmcv-full following the instruction above.
+
+- "invalid device function" or "no kernel image is available for execution".
+
+  1. Check the CUDA compute capability of your GPU.
+  2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision,
+     and MMCV are built for the correct GPU architecture.
+     You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV.
+     The compatibility issue could happen when using old GPUs, e.g., Tesla K80 (3.7) on Colab.
+  3. Check whether the running environment is the same as that when mmcv/mmdet is compiled.
+     For example, you may compile mmcv using CUDA 10.0 but run it on CUDA 9.0 environments.
+
+- "undefined symbol" or "cannot open xxx.so".
+
+  1.
If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check + whether the CUDA/GCC runtimes are the same as those used for compiling mmcv. + 2. If those symbols are Pytorch symbols (e.g., symbols containing caffe, aten, and TH), check whether + the Pytorch version is the same as that used for compiling mmcv. + 3. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, + and MMCV are built by and running on the same environment. + +- "RuntimeError: CUDA error: invalid configuration argument". + + This error may be due to your poor GPU. Try to decrease the value of [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) + and recompile mmcv. + +- "RuntimeError: nms is not compiled with GPU support". + + This error is because your CUDA environment is not installed correctly. + You may try to re-install your CUDA environment and then delete the build/ folder before re-compile mmcv. diff --git a/docs/get_started/build.md b/docs/get_started/build.md new file mode 100644 index 0000000..758a83a --- /dev/null +++ b/docs/get_started/build.md @@ -0,0 +1,234 @@ +## Build MMCV from source + +### Build on Linux or macOS + +After cloning the repo with + +```bash +git clone https://github.com/open-mmlab/mmcv.git +cd mmcv +``` + +You can either + +- install the lite version + + ```bash + pip install -e . + ``` + +- install the full version + + ```bash + MMCV_WITH_OPS=1 pip install -e . + ``` + +If you are on macOS, add the following environment variables before the installing command. + +```bash +CC=clang CXX=clang++ CFLAGS='-stdlib=libc++' +``` + +e.g., + +```bash +CC=clang CXX=clang++ CFLAGS='-stdlib=libc++' MMCV_WITH_OPS=1 pip install -e . 
+```
+
+```{note}
+If you would like to use `opencv-python-headless` instead of `opencv-python`,
+e.g., in a minimum container environment or servers without GUI,
+you can first install it before installing MMCV to skip the installation of `opencv-python`.
+```
+### Build on Windows
+
+Building MMCV on Windows is a bit more complicated than that on Linux.
+The following instructions show how to get this accomplished.
+
+#### Prerequisite
+
+The following software is required for building MMCV on Windows.
+Install them first.
+
+- [Git](https://git-scm.com/download/win)
+  - During installation, tick **add git to Path**.
+- [Visual Studio Community 2019](https://visualstudio.microsoft.com)
+  - A compiler for C++ and CUDA codes.
+- [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
+  - Official distributions of Python should work too.
+- [CUDA 10.2](https://developer.nvidia.com/cuda-10.2-download-archive)
+  - Not required for building CPU version.
+  - Customize the installation if necessary. As a recommendation, skip the driver installation if a newer version is already installed.
+
+```{note}
+You should know how to set up environment variables, especially `Path`, on Windows. The following instruction relies heavily on this skill.
+```
+
+#### Setup Python Environment
+
+1. Launch Anaconda prompt from Windows Start menu
+
+   Do not use raw `cmd.exe`. This instruction is based on PowerShell syntax.
+
+1. Create a new conda environment
+
+   ```shell
+   conda create --name mmcv python=3.7 # 3.6, 3.7, 3.8 should work too as tested
+   conda activate mmcv # make sure to activate environment before any operation
+   ```
+
+1. Install PyTorch. Choose a version based on your need.
+
+   ```shell
+   conda install pytorch torchvision cudatoolkit=10.2 -c pytorch
+   ```
+
+   We only tested PyTorch version >= 1.6.0.
+
+1. Prepare MMCV source code
+
+   ```shell
+   git clone https://github.com/open-mmlab/mmcv.git
+   cd mmcv
+   ```
+
+1.
Install required Python packages + + ```shell + pip3 install -r requirements.txt + ``` + +#### Build and install MMCV + +MMCV can be built in three ways: + +1. Lite version (without ops) + + In this way, no custom ops are compiled and mmcv is a pure python package. + +1. Full version (CPU ops) + + Module `ops` will be compiled as a pytorch extension, but only x86 code will be compiled. The compiled ops can be executed on CPU only. + +1. Full version (CUDA ops) + + Both x86 and CUDA codes of `ops` module will be compiled. The compiled version can be run on both CPU and CUDA-enabled GPU (if implemented). + +##### Common steps + +1. Set up MSVC compiler + + Set Environment variable, add `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` to `PATH`, so that `cl.exe` will be available in prompt, as shown below. + + ```none + (base) PS C:\Users\xxx> cl + Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 + Copyright (C) Microsoft Corporation. All rights reserved. + + usage: cl [ option... ] filename... [ / link linkoption... ] + ``` + + For compatibility, we use the x86-hosted and x64-targeted compiler. note `Hostx86\x64` in the path. + + You may want to change the system language to English because pytorch will parse text output from `cl.exe` to check its version. However only utf-8 is recognized. Navigate to Control Panel -> Region -> Administrative -> Language for Non-Unicode programs and change it to English. + +##### Option 1: Build MMCV (lite version) + +After finishing above common steps, launch Anaconda shell from Start menu and issue the following commands: + +```shell +# activate environment +conda activate mmcv +# change directory +cd mmcv +# install +python setup.py develop +# check +pip list +``` + +##### Option 2: Build MMCV (full version with CPU) + +1. Finish above common steps +1. 
Set up environment variables + + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # based on your available number of CPU cores and amount of memory + ``` + +1. Following build steps of the lite version + + ```shell + # activate environment + conda activate mmcv + # change directory + cd mmcv + # build + python setup.py build_ext # if success, cl will be launched to compile ops + # install + python setup.py develop + # check + pip list + ``` + +##### Option 3: Build MMCV (full version with CUDA) + +1. Finish above common steps +1. Make sure `CUDA_PATH` or `CUDA_HOME` is already set in `envs` via `ls env:`, desired output is shown as below: + + ```none + (base) PS C:\Users\WRH> ls env: + + Name Value + ---- ----- + <... omit some lines ...> + CUDA_PATH C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 + CUDA_PATH_V10_1 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 + CUDA_PATH_V10_2 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 + <... omit some lines ...> + ``` + + This should already be done by the CUDA installer. If not, or if you have multiple versions of CUDA toolkit installed, set it with + + ```shell + $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" + # OR + $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # if CUDA_PATH_V10_2 is in envs: + ``` + +1. Set CUDA target arch + + ```shell + # Suppose you are using GTX 1080, which is of capability 6.1 + $env:TORCH_CUDA_ARCH_LIST="6.1" + # OR build all supported arch, will be slow + $env:TORCH_CUDA_ARCH_LIST="3.5 3.7 5.0 5.2 6.0 6.1 7.0 7.5" + ``` + +```{note} +Check the compute capability of your GPU from [here](https://developer.nvidia.com/cuda-gpus). +``` + +1. 
Launch compiling the same way as CPU + + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # based on available number of CPU cores and amount of memory + # activate environment + conda activate mmcv + # change directory + cd mmcv + # build + python setup.py build_ext # if success, cl will be launched to compile ops + # install + python setup.py develop + # check + pip list + ``` + +```{note} +If you are compiling against PyTorch 1.6.0, you might meet some errors from PyTorch as described in [this issue](https://github.com/pytorch/pytorch/issues/42467). Follow [this pull request](https://github.com/pytorch/pytorch/pull/43380/files) to modify the source code in your local PyTorch installation. +``` + +If you meet issues when running or compiling mmcv, we list some common issues in [Frequently Asked Questions](../faq.html). diff --git a/docs/get_started/installation.md b/docs/get_started/installation.md new file mode 100644 index 0000000..0c64ea8 --- /dev/null +++ b/docs/get_started/installation.md @@ -0,0 +1,162 @@ +## Installation + +There are two versions of MMCV: + +- **mmcv-full**: comprehensive, with full features and various CUDA ops out of the box. It takes longer time to build. +- **mmcv**: lite, without CUDA ops but all other features, similar to mmcv<1.0.0. It is useful when you do not need those CUDA ops. + +```{warning} +Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`. +``` + +a. Install the full version. + +Before installing mmcv-full, make sure that PyTorch has been successfully installed following the [official guide](https://pytorch.org/). + +We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building. 
In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. + +i. Install the latest version. + +The rule for installing the latest ``mmcv-full`` is as follows: + +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html +``` + +Please replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired one. For example, +to install the latest ``mmcv-full`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: + +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html +``` + +For more details, please refer to the following tables and delete ``=={mmcv_version}``. + +ii. Install a specified version. + +The rule for installing a specified ``mmcv-full`` is as follows: + +```shell +pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html +``` + +First of all, please refer to the Releases and replace ``{mmcv_version}`` with a specified one, e.g. ``1.3.9``. +Then replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired versions. For example, +to install ``mmcv-full==1.3.9`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: + +```shell +pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html +``` + +```{note} +mmcv-full is only compiled on PyTorch 1.x.0 because the compatibility +usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you +can install mmcv-full compiled with PyTorch 1.x.0 and it usually works well. +For example, if your PyTorch version is 1.8.1 and CUDA version is 11.1, you +can use the following command to install mmcv-full. 
+ +`pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html` +``` + +For more details, please refer the the following tables. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUDA torch 1.10torch 1.9torch 1.8torch 1.7torch 1.6torch 1.5
11.3
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
11.1
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
11.0
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.7.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.6.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.5.0/index.html
10.1
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.5.0/index.html
9.2
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.5.0/index.html
+ +```{note} +The pre-built packages provided above do not include all versions of mmcv-full, you can click on the corresponding links to see the supported versions. For example, if you click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html), you can see that `cu102-torch1.8.0` only provides 1.3.0 and above versions of mmcv-full. In addition, We no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4` since v1.3.17. You can find previous versions that compiled with PyTorch 1.3 & 1.4 [here](./docs/get_started/previous_versions.md). The compatibility is still ensured in our CI, but we will discard the support of PyTorch 1.3 & 1.4 next year. +``` + +Another way is to compile locally by running + +```python +pip install mmcv-full +``` + +Note that the local compiling may take up to 10 mins. + +b. Install the lite version. + +```python +pip install mmcv +``` + +c. Install full version with custom operators for onnxruntime + +- Check [here](https://mmcv.readthedocs.io/en/latest/deployment/onnxruntime_custom_ops.html) for detailed instruction. + +If you would like to build MMCV from source, please refer to the [guide](https://mmcv.readthedocs.io/en/latest/get_started/build.html). diff --git a/docs/get_started/introduction.md b/docs/get_started/introduction.md new file mode 100644 index 0000000..4ffb59d --- /dev/null +++ b/docs/get_started/introduction.md @@ -0,0 +1,29 @@ +## Introduction + +MMCV is a foundational library for computer vision research and supports many +research projects as below: + +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. 
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition and understanding toolbox. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. + +It provides the following functionalities. + +- Universal IO APIs +- Image/Video processing +- Image and annotation visualization +- Useful utilities (progress bar, timer, ...) +- PyTorch runner with hooking mechanism +- Various CNN architectures +- High-quality implementation of common CUDA ops + +```{note} +MMCV requires Python 3.6+. 
+``` diff --git a/docs/en/get_started/previous_versions.md b/docs/get_started/previous_versions.md similarity index 93% rename from docs/en/get_started/previous_versions.md rename to docs/get_started/previous_versions.md index a9c3717..c91180d 100644 --- a/docs/en/get_started/previous_versions.md +++ b/docs/get_started/previous_versions.md @@ -4,7 +4,7 @@ We no longer provide `mmcv-full` packages compiled under lower versions of `PyTo ### PyTorch 1.4 -| 1.0.0 \<= mmcv_version \<= 1.2.1 +| 1.0.0 <= mmcv_version <= 1.2.1 #### CUDA 10.1 @@ -26,7 +26,7 @@ pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dis ### PyTorch v1.3 -| 1.0.0 \<= mmcv_version \<= 1.3.16 +| 1.0.0 <= mmcv_version <= 1.3.16 #### CUDA 10.1 diff --git a/docs/en/index.rst b/docs/index.rst similarity index 71% rename from docs/en/index.rst rename to docs/index.rst index dee2c37..6019f10 100644 --- a/docs/en/index.rst +++ b/docs/index.rst @@ -15,23 +15,27 @@ You can switch between Chinese and English documents in the lower-left corner of :maxdepth: 2 :caption: Understand MMCV + understand_mmcv/config.md + understand_mmcv/registry.md + understand_mmcv/runner.md + understand_mmcv/io.md understand_mmcv/data_process.md - understand_mmcv/data_transform.md understand_mmcv/visualization.md understand_mmcv/cnn.md understand_mmcv/ops.md + understand_mmcv/utils.md .. toctree:: :maxdepth: 2 :caption: Deployment + deployment/onnx.md + deployment/onnxruntime_op.md + deployment/onnxruntime_custom_ops.md + deployment/tensorrt_plugin.md + deployment/tensorrt_custom_ops.md deployment/mmcv_ops_definition.md -.. toctree:: - :caption: Switch Language - - switch_language.md - .. toctree:: :maxdepth: 2 :caption: Compatibility @@ -39,6 +43,8 @@ You can switch between Chinese and English documents in the lower-left corner of compatibility.md .. 
toctree:: + :maxdepth: 2 + :caption: FAQ faq.md @@ -50,17 +56,10 @@ You can switch between Chinese and English documents in the lower-left corner of community/pr.md .. toctree:: - :maxdepth: 1 + :maxdepth: 2 :caption: API Reference - mmcv.image - mmcv.video - mmcv.visualization - mmcv.cnn - mmcv.ops - mmcv.transforms - mmcv.arraymisc - mmcv.utils + api.rst Indices and tables ================== diff --git a/docs/en/make.bat b/docs/make.bat similarity index 100% rename from docs/en/make.bat rename to docs/make.bat diff --git a/docs/en/mmcv-logo.png b/docs/mmcv-logo.png similarity index 100% rename from docs/en/mmcv-logo.png rename to docs/mmcv-logo.png diff --git a/docs/understand_mmcv/cnn.md b/docs/understand_mmcv/cnn.md new file mode 100644 index 0000000..749cb95 --- /dev/null +++ b/docs/understand_mmcv/cnn.md @@ -0,0 +1,538 @@ +## CNN + +We provide some building bricks for CNNs, including layer building, module bundles and weight initialization. + +### Layer building + +We may need to try different layers of the same type when running experiments, +but do not want to modify the code from time to time. +Here we provide some layer building methods to construct layers from a dict, +which can be written in configs or specified via command line arguments. + +#### Usage + +A simplest example is + +```python +cfg = dict(type='Conv3d') +layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3) +``` + +- `build_conv_layer`: Supported types are Conv1d, Conv2d, Conv3d, Conv (alias for Conv2d). +- `build_norm_layer`: Supported types are BN1d, BN2d, BN3d, BN (alias for BN2d), SyncBN, GN, LN, IN1d, IN2d, IN3d, IN (alias for IN2d). +- `build_activation_layer`: Supported types are ReLU, LeakyReLU, PReLU, RReLU, ReLU6, ELU, Sigmoid, Tanh, GELU. +- `build_upsample_layer`: Supported types are nearest, bilinear, deconv, pixel_shuffle. +- `build_padding_layer`: Supported types are zero, reflect, replicate. 
+ +#### Extension + +We also allow extending the building methods with custom layers and operators. + +1. Write and register your own module. + + ```python + from mmcv.cnn import UPSAMPLE_LAYERS + + @UPSAMPLE_LAYERS.register_module() + class MyUpsample: + + def __init__(self, scale_factor): + pass + + def forward(self, x): + pass + ``` + +2. Import `MyUpsample` somewhere (e.g., in `__init__.py`) and then use it. + + ```python + cfg = dict(type='MyUpsample', scale_factor=2) + layer = build_upsample_layer(cfg) + ``` + +### Module bundles + +We also provide common module bundles to facilitate the network construction. +`ConvModule` is a bundle of convolution, normalization and activation layers, +please refer to the [api](api.html#mmcv.cnn.ConvModule) for details. + +```python +# conv + bn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN')) +# conv + gn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2)) +# conv + relu +conv = ConvModule(3, 8, 2) +# conv +conv = ConvModule(3, 8, 2, act_cfg=None) +# conv + leaky relu +conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU')) +# bn + conv + relu +conv = ConvModule( + 3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act')) +``` + +### Weight initialization + +> Implementation details are available at [mmcv/cnn/utils/weight_init.py](../../mmcv/cnn/utils/weight_init.py) + +During training, a proper initialization strategy is beneficial to speed up the +training or obtain a higher performance. In MMCV, we provide some commonly used +methods for initializing modules like `nn.Conv2d`. Of course, we also provide +high-level APIs for initializing models containing one or more +modules. + +#### Initialization functions + +Initialize a `nn.Module` such as `nn.Conv2d`, `nn.Linear` in a functional way. + +We provide the following initialization methods. + +- constant_init + + Initialize module parameters with constant values. 
+ + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import constant_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # constant_init(module, val, bias=0) + >>> constant_init(conv1, 1, 0) + >>> conv1.weight + ``` + +- xavier_init + + Initialize module parameters with values according to the method + described in [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # xavier_init(module, gain=1, bias=0, distribution='normal') + >>> xavier_init(conv1, distribution='normal') + ``` + +- normal_init + + Initialize module parameters with the values drawn from a normal distribution. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import normal_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # normal_init(module, mean=0, std=1, bias=0) + >>> normal_init(conv1, std=0.01, bias=0) + ``` + +- uniform_init + + Initialize module parameters with values drawn from a uniform distribution. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import uniform_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # uniform_init(module, a=0, b=1, bias=0) + >>> uniform_init(conv1, a=0, b=1) + ``` + +- kaiming_init + + Initialize module parameters with the values according to the method + described in [Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification - He, K. et al. 
(2015)](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf) + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import kaiming_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal') + >>> kaiming_init(conv1) + ``` + +- caffe2_xavier_init + + The xavier initialization is implemented in caffe2, which corresponds to `kaiming_uniform_` in PyTorch. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import caffe2_xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # caffe2_xavier_init(module, bias=0) + >>> caffe2_xavier_init(conv1) + ``` + +- bias_init_with_prob + + Initialize conv/fc bias value according to a given probability, as proposed in [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002.pdf). + + ```python + >>> from mmcv.cnn import bias_init_with_prob + >>> # bias_init_with_prob is proposed in Focal Loss + >>> bias = bias_init_with_prob(0.01) + >>> bias + -4.59511985013459 + ``` + +#### Initializers and configs + +On the basis of the initialization methods, we define the corresponding initialization classes and register them to `INITIALIZERS`, so we can +use the configuration to initialize the model. + +We provide the following initialization classes. + +- ConstantInit +- XavierInit +- NormalInit +- UniformInit +- KaimingInit +- Caffe2XavierInit +- PretrainedInit + +Let us introduce the usage of `initialize` in detail. + +1. Initialize model by `layer` key + + If we only define `layer`, it just initialize the layer in `layer` key. + + NOTE: Value of `layer` key is the class name with attributes weights and bias of Pytorch, so `MultiheadAttention layer` is not supported. + +- Define `layer` key for initializing module with same configuration. 
+ + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1, 2) + + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) + # initialize whole module with same configuration + initialize(model, init_cfg) + # model.feat.weight + # Parameter containing: + # tensor([[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]], requires_grad=True) + ``` + +- Define `layer` key for initializing layer with different configurations. + + ```python + import torch.nn as nn + from mmcv.cnn.utils import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1,2) + + model = FooNet() + init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Conv2d', val=2), + dict(type='Constant', layer='Linear', val=3)] + # nn.Conv1d will be initialized with dict(type='Constant', val=1) + # nn.Conv2d will be initialized with dict(type='Constant', val=2) + # nn.Linear will be initialized with dict(type='Constant', val=3) + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + ``` + +2. Initialize model by `override` key + +- When initializing some specific part with its attribute name, we can use `override` key, and the value in `override` will ignore the value in init_cfg. 
+ + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + + # if we would like to initialize model's weights as 1 and bias as 2 + # but weight in `cls` as 3 and bias 4, we can use override key + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', name='reg', val=3, bias=4)) + # self.feat and self.cls will be initialized with dict(type='Constant', val=1, bias=2) + # The module called 'reg' will be initialized with dict(type='Constant', val=3, bias=4) + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]], + # ..., + # [[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]]]], requires_grad=True) + ``` + +- If `layer` is None in init_cfg, only sub-module with the name in override will be initialized, and type and other args in override can be omitted. + + ```python + model = FooNet() + init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) + # self.feat and self.cls will be initialized by Pytorch + # The module called 'reg' will be initialized with dict(type='Constant', val=1, bias=2) + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]], + # ..., + # [[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]]], requires_grad=True) + ``` + +- If we don't define `layer` key or `override` key, it will not initialize anything. 
+ +- Invalid usage + + ```python + # It is invalid that override don't have name key + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(type='Constant', val=3, bias=4)) + + # It is also invalid that override has name and other args except type + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(name='reg', val=3, bias=4)) + ``` + +3. Initialize model with the pretrained model + + ```python + import torch.nn as nn + import torchvision.models as models + from mmcv.cnn import initialize + + # initialize model with pretrained model + model = models.resnet50() + # model.conv1.weight + # Parameter containing: + # tensor([[[[-6.7435e-03, -2.3531e-02, -9.0143e-03, ..., -2.1245e-03, + # -1.8077e-03, 3.0338e-03], + # [-1.2603e-02, -2.7831e-02, 2.3187e-02, ..., -1.5793e-02, + # 1.1655e-02, 4.5889e-03], + # [-3.7916e-02, 1.2014e-02, 1.3815e-02, ..., -4.2651e-03, + # 1.7314e-02, -9.9998e-03], + # ..., + + init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + initialize(model, init_cfg) + # model.conv1.weight + # Parameter containing: + # tensor([[[[ 1.3335e-02, 1.4664e-02, -1.5351e-02, ..., -4.0896e-02, + # -4.3034e-02, -7.0755e-02], + # [ 4.1205e-03, 5.8477e-03, 1.4948e-02, ..., 2.2060e-03, + # -2.0912e-02, -3.8517e-02], + # [ 2.2331e-02, 2.3595e-02, 1.6120e-02, ..., 1.0281e-01, + # 6.2641e-02, 5.1977e-02], + # ..., + + # initialize weights of a sub-module with the specific part of a pretrained model by using 'prefix' + model = models.resnet50() + url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + 'retinanet_r50_fpn_1x_coco/'\ + 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + initialize(model, init_cfg) + ``` + +4. 
Initialize model inherited from BaseModule, Sequential, ModuleList + + `BaseModule` is inherited from `torch.nn.Module`, and the only different between them is that `BaseModule` implements `init_weight`. + + `Sequential` is inherited from `BaseModule` and `torch.nn.Sequential`. + + `ModuleList` is inherited from `BaseModule` and `torch.nn.ModuleList`. + + `````python + import torch.nn as nn + from mmcv.runner import BaseModule, Sequential, ModuleList + + class FooConv1d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv1d = nn.Conv1d(4, 1, 4) + + def forward(self, x): + return self.conv1d(x) + + class FooConv2d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv2d = nn.Conv2d(3, 1, 3) + + def forward(self, x): + return self.conv2d(x) + + # BaseModule + init_cfg = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + model = FooConv1d(init_cfg) + model.init_weights() + # model.conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + + # Sequential + init_cfg1 = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + init_cfg2 = dict(type='Constant', layer='Conv2d', val=2., bias=3.) + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + seq_model = Sequential(model1, model2) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) 
+ seq_model = Sequential(model1, model2, init_cfg=init_cfg) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # ModuleList + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + modellist = ModuleList([model1, model2]) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) 
+ modellist = ModuleList([model1, model2], init_cfg=init_cfg) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + ````` + +### Model Zoo + +Besides torchvision pre-trained models, we also provide pre-trained models of following CNN: + +- VGG Caffe +- ResNet Caffe +- ResNeXt +- ResNet with Group Normalization +- ResNet with Group Normalization and Weight Standardization +- HRNetV2 +- Res2Net +- RegNet + +#### Model URLs in JSON + +The model zoo links in MMCV are managed by JSON files. +The json file consists of key-value pair of model name and its url or path. +An example json file could be like: + +```json +{ + "model_a": "https://example.com/models/model_a_9e5bac.pth", + "model_b": "pretrain/model_b_ab3ef2c.pth" +} +``` + +The default links of the pre-trained models hosted on OpenMMLab AWS could be found [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json). + +You may override default links by putting `open-mmlab.json` under `MMCV_HOME`. If `MMCV_HOME` is not find in the environment, `~/.cache/mmcv` will be used by default. You may `export MMCV_HOME=/your/path` to use your own path. + +The external json files will be merged into default one. If the same key presents in both external json and default json, the external one will be used. + +#### Load Checkpoint + +The following types are supported for `filename` argument of `mmcv.load_checkpoint()`. + +- filepath: The filepath of the checkpoint. +- `http://xxx` and `https://xxx`: The link to download the checkpoint. The `SHA256` postfix should be contained in the filename. 
+- `torchvision://xxx`: The model links in `torchvision.models`.Please refer to [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) for details. +- `open-mmlab://xxx`: The model links or filepath provided in default and additional json files. diff --git a/docs/understand_mmcv/config.md b/docs/understand_mmcv/config.md new file mode 100644 index 0000000..d0b669b --- /dev/null +++ b/docs/understand_mmcv/config.md @@ -0,0 +1,200 @@ +## Config + +`Config` class is used for manipulating config and config files. It supports +loading configs from multiple file formats including **python**, **json** and **yaml**. +It provides dict-like apis to get and set values. + +Here is an example of the config file `test.py`. + +```python +a = 1 +b = dict(b1=[0, 1, 2], b2=None) +c = (1, 2) +d = 'string' +``` + +To load and use configs + +```python +>>> cfg = Config.fromfile('test.py') +>>> print(cfg) +>>> dict(a=1, +... b=dict(b1=[0, 1, 2], b2=None), +... c=(1, 2), +... d='string') +``` + +For all format configs, some predefined variables are supported. It will convert the variable in `{{ var }}` with its real value. + +Currently, it supports four predefined variables: + +`{{ fileDirname }}` - the current opened file's dirname, e.g. /home/your-username/your-project/folder + +`{{ fileBasename }}` - the current opened file's basename, e.g. file.ext + +`{{ fileBasenameNoExtension }}` - the current opened file's basename with no file extension, e.g. file + +`{{ fileExtname }}` - the current opened file's extension, e.g. .ext + +These variable names are referred from [VS Code](https://code.visualstudio.com/docs/editor/variables-reference). + +Here is one examples of config with predefined variables. + +`config_a.py` + +```python +a = 1 +b = './work_dir/{{ fileBasenameNoExtension }}' +c = '{{ fileExtname }}' +``` + +```python +>>> cfg = Config.fromfile('./config_a.py') +>>> print(cfg) +>>> dict(a=1, +... b='./work_dir/config_a', +... 
c='.py') +``` + +For all format configs, inheritance is supported. To reuse fields in other config files, +specify `_base_='./config_a.py'` or a list of configs `_base_=['./config_a.py', './config_b.py']`. +Here are 4 examples of config inheritance. + +`config_a.py` + +```python +a = 1 +b = dict(b1=[0, 1, 2], b2=None) +``` + +### Inherit from base config without overlapped keys + +`config_b.py` + +```python +_base_ = './config_a.py' +c = (1, 2) +d = 'string' +``` + +```python +>>> cfg = Config.fromfile('./config_b.py') +>>> print(cfg) +>>> dict(a=1, +... b=dict(b1=[0, 1, 2], b2=None), +... c=(1, 2), +... d='string') +``` + +New fields in `config_b.py` are combined with old fields in `config_a.py` + +### Inherit from base config with overlapped keys + +`config_c.py` + +```python +_base_ = './config_a.py' +b = dict(b2=1) +c = (1, 2) +``` + +```python +>>> cfg = Config.fromfile('./config_c.py') +>>> print(cfg) +>>> dict(a=1, +... b=dict(b1=[0, 1, 2], b2=1), +... c=(1, 2)) +``` + +`b.b2=None` in `config_a` is replaced with `b.b2=1` in `config_c.py`. + +### Inherit from base config with ignored fields + +`config_d.py` + +```python +_base_ = './config_a.py' +b = dict(_delete_=True, b2=None, b3=0.1) +c = (1, 2) +``` + +```python +>>> cfg = Config.fromfile('./config_d.py') +>>> print(cfg) +>>> dict(a=1, +... b=dict(b2=None, b3=0.1), +... c=(1, 2)) +``` + +You may also set `_delete_=True` to ignore some fields in base configs. All old keys `b1, b2, b3` in `b` are replaced with new keys `b2, b3`. + +### Inherit from multiple base configs (the base configs should not contain the same keys) + +`config_e.py` + +```python +c = (1, 2) +d = 'string' +``` + +`config_f.py` + +```python +_base_ = ['./config_a.py', './config_e.py'] +``` + +```python +>>> cfg = Config.fromfile('./config_f.py') +>>> print(cfg) +>>> dict(a=1, +... b=dict(b1=[0, 1, 2], b2=None), +... c=(1, 2), +... 
d='string') +``` + +### Reference variables from base + +You can reference variables defined in base using the following grammar. + +`base.py` + +```python +item1 = 'a' +item2 = dict(item3 = 'b') +``` + +`config_g.py` + +```python +_base_ = ['./base.py'] +item = dict(a = {{ _base_.item1 }}, b = {{ _base_.item2.item3 }}) +``` + +```python +>>> cfg = Config.fromfile('./config_g.py') +>>> print(cfg.pretty_text) +item1 = 'a' +item2 = dict(item3='b') +item = dict(a='a', b='b') +``` + +### Add deprecation information in configs + +Deprecation information can be added in a config file, which will trigger a `UserWarning` when this config file is loaded. + +`deprecated_cfg.py` + +```python +_base_ = 'expected_cfg.py' + +_deprecation_ = dict( + expected = 'expected_cfg.py', # optional to show expected config path in the warning information + reference = 'url to related PR' # optional to show reference link in the warning information +) +``` + +```python +>>> cfg = Config.fromfile('./deprecated_cfg.py') + +UserWarning: The config file deprecated.py will be deprecated in the future. Please use expected_cfg.py instead. More information can be found at https://github.com/open-mmlab/mmcv/pull/1275 +``` diff --git a/docs/en/understand_mmcv/data_process.md b/docs/understand_mmcv/data_process.md similarity index 90% rename from docs/en/understand_mmcv/data_process.md rename to docs/understand_mmcv/data_process.md index 167928f..79e9281 100644 --- a/docs/en/understand_mmcv/data_process.md +++ b/docs/understand_mmcv/data_process.md @@ -2,7 +2,7 @@ ### Image -This module provides some image processing methods, which requires `opencv` to be installed first. +This module provides some image processing methods, which requires `opencv` to be installed. #### Read/Write/Show @@ -118,7 +118,7 @@ mmcv.imflip(img, direction='vertical') #### Crop -`imcrop` can crop the image with one or more regions. Each region is represented by the upper left and lower right coordinates as (x1, y1, x2, y2). 
+`imcrop` can crop the image with one or some regions, represented as (x1, y1, x2, y2). ```python import mmcv @@ -135,12 +135,12 @@ bboxes = np.array([[10, 10, 100, 120], [0, 0, 50, 50]]) patches = mmcv.imcrop(img, bboxes) # crop two regions, and rescale the patches by 1.2x -patches = mmcv.imcrop(img, bboxes, scale=1.2) +patches = mmcv.imcrop(img, bboxes, scale_ratio=1.2) ``` #### Padding -There are two methods, `impad` and `impad_to_multiple`, to pad an image to the +There are two methods `impad` and `impad_to_multiple` to pad an image to the specific size with given values. ```python @@ -150,14 +150,14 @@ img = mmcv.imread('tests/data/color.jpg') img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=0) # pad the image to (1000, 1200) with different values for three channels. -img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=(100, 50, 200)) +img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=[100, 50, 200]) # pad the image on left, right, top, bottom borders with all zeros img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=0) # pad the image on left, right, top, bottom borders with different values # for three channels. -img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=(100, 50, 200)) +img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=[100, 50, 200]) # pad an image so that each edge is a multiple of some value. img_ = mmcv.impad_to_multiple(img, 32) @@ -165,7 +165,7 @@ img_ = mmcv.impad_to_multiple(img, 32) ### Video -This module provides the following functionalities: +This module provides the following functionalities. - A `VideoReader` class with friendly apis to read and convert videos. - Some methods for editing (cut, concat, resize) videos. @@ -232,7 +232,7 @@ mmcv.resize_video('test.mp4', 'resized2.mp4', ratio=2) - IO - Visualization -- Flow warping +- Flow warpping We provide two options to dump optical flow files: uncompressed and compressed. The uncompressed way just dumps the floating numbers to a binary file. 
It is @@ -265,12 +265,12 @@ mmcv.flowshow(flow) ![progress](../_static/flow_visualization.png) -3. Flow warping +3. Flow warpping ```python img1 = mmcv.imread('img1.jpg') flow = mmcv.flowread('flow.flo') -warped_img2 = mmcv.flow_warp(img1, flow) +warpped_img2 = mmcv.flow_warp(img1, flow) ``` img1 (left) and img2 (right) @@ -281,6 +281,6 @@ optical flow (img2 -> img1) ![optical flow](../_static/flow_img2toimg1.png) -warped image and difference with ground truth +warpped image and difference with ground truth -![warped image](../_static/flow_warp_diff.png) +![warpped image](../_static/flow_warp_diff.png) diff --git a/docs/understand_mmcv/io.md b/docs/understand_mmcv/io.md new file mode 100644 index 0000000..f6c28dd --- /dev/null +++ b/docs/understand_mmcv/io.md @@ -0,0 +1,247 @@ +## File IO + +This module provides two universal API to load and dump files of different formats. + +```{note} +Since v1.3.16, the IO modules support loading (dumping) data from (to) different backends, respectively. More details are in PR [#1330](https://github.com/open-mmlab/mmcv/pull/1330). +``` + +### Load and dump data + +`mmcv` provides a universal api for loading and dumping data, currently +supported formats are json, yaml and pickle. 
+ +#### Load from disk or dump to disk + +```python +import mmcv + +# load data from a file +data = mmcv.load('test.json') +data = mmcv.load('test.yaml') +data = mmcv.load('test.pkl') +# load data from a file-like object +with open('test.json', 'r') as f: + data = mmcv.load(f, file_format='json') + +# dump data to a string +json_str = mmcv.dump(data, file_format='json') + +# dump data to a file with a filename (infer format from file extension) +mmcv.dump(data, 'out.pkl') + +# dump data to a file with a file-like object +with open('test.yaml', 'w') as f: + data = mmcv.dump(data, f, file_format='yaml') +``` + +#### Load from other backends or dump to other backends + +```python +import mmcv + +# load data from a file +data = mmcv.load('s3://bucket-name/test.json') +data = mmcv.load('s3://bucket-name/test.yaml') +data = mmcv.load('s3://bucket-name/test.pkl') + +# dump data to a file with a filename (infer format from file extension) +mmcv.dump(data, 's3://bucket-name/out.pkl') +``` + +It is also very convenient to extend the api to support more file formats. +All you need to do is to write a file handler inherited from `BaseFileHandler` +and register it with one or several file formats. + +You need to implement at least 3 methods. + +```python +import mmcv + +# To register multiple file formats, a list can be used as the argument. +# @mmcv.register_handler(['txt', 'log']) +@mmcv.register_handler('txt') +class TxtHandler1(mmcv.BaseFileHandler): + + def load_from_fileobj(self, file): + return file.read() + + def dump_to_fileobj(self, obj, file): + file.write(str(obj)) + + def dump_to_str(self, obj, **kwargs): + return str(obj) +``` + +Here is an example of `PickleHandler`. 
+ +```python +import pickle + +class PickleHandler(mmcv.BaseFileHandler): + + def load_from_fileobj(self, file, **kwargs): + return pickle.load(file, **kwargs) + + def load_from_path(self, filepath, **kwargs): + return super(PickleHandler, self).load_from_path( + filepath, mode='rb', **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('protocol', 2) + return pickle.dumps(obj, **kwargs) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('protocol', 2) + pickle.dump(obj, file, **kwargs) + + def dump_to_path(self, obj, filepath, **kwargs): + super(PickleHandler, self).dump_to_path( + obj, filepath, mode='wb', **kwargs) +``` + +### Load a text file as a list or dict + +For example `a.txt` is a text file with 5 lines. + +``` +a +b +c +d +e +``` + +#### Load from disk + +Use `list_from_file` to load the list from a.txt. + +```python +>>> mmcv.list_from_file('a.txt') +['a', 'b', 'c', 'd', 'e'] +>>> mmcv.list_from_file('a.txt', offset=2) +['c', 'd', 'e'] +>>> mmcv.list_from_file('a.txt', max_num=2) +['a', 'b'] +>>> mmcv.list_from_file('a.txt', prefix='/mnt/') +['/mnt/a', '/mnt/b', '/mnt/c', '/mnt/d', '/mnt/e'] +``` + +For example `b.txt` is a text file with 3 lines. + +``` +1 cat +2 dog cow +3 panda +``` + +Then use `dict_from_file` to load the dict from `b.txt`. + +```python +>>> mmcv.dict_from_file('b.txt') +{'1': 'cat', '2': ['dog', 'cow'], '3': 'panda'} +>>> mmcv.dict_from_file('b.txt', key_type=int) +{1: 'cat', 2: ['dog', 'cow'], 3: 'panda'} +``` + +#### Load from other backends + +Use `list_from_file` to load the list from `s3://bucket-name/a.txt`. 
+ +```python +>>> mmcv.list_from_file('s3://bucket-name/a.txt') +['a', 'b', 'c', 'd', 'e'] +>>> mmcv.list_from_file('s3://bucket-name/a.txt', offset=2) +['c', 'd', 'e'] +>>> mmcv.list_from_file('s3://bucket-name/a.txt', max_num=2) +['a', 'b'] +>>> mmcv.list_from_file('s3://bucket-name/a.txt', prefix='/mnt/') +['/mnt/a', '/mnt/b', '/mnt/c', '/mnt/d', '/mnt/e'] +``` + +Use `dict_from_file` to load the dict from `s3://bucket-name/b.txt`. + +```python +>>> mmcv.dict_from_file('s3://bucket-name/b.txt') +{'1': 'cat', '2': ['dog', 'cow'], '3': 'panda'} +>>> mmcv.dict_from_file('s3://bucket-name/b.txt', key_type=int) +{1: 'cat', 2: ['dog', 'cow'], 3: 'panda'} +``` + +### Load and dump checkpoints + +#### Load checkpoints from disk or save to disk + +We can read the checkpoints from disk or save to disk in the following way. + +```python +import torch + +filepath1 = '/path/of/your/checkpoint1.pth' +filepath2 = '/path/of/your/checkpoint2.pth' +# read from filepath1 +checkpoint = torch.load(filepath1) +# save to filepath2 +torch.save(checkpoint, filepath2) +``` + +MMCV provides many backends. `HardDiskBackend` is one of them and we can use it to read or save checkpoints. + +```python +import io +from mmcv.fileio.file_client import HardDiskBackend + +disk_backend = HardDiskBackend() +with io.BytesIO(disk_backend.get(filepath1)) as buffer: + checkpoint = torch.load(buffer) +with io.BytesIO() as buffer: + torch.save(checkpoint, f) + disk_backend.put(f.getvalue(), filepath2) +``` + +If we want to implement an interface which automatically select the corresponding +backend based on the file path, we can use the `FileClient`. +For example, we want to implement two methods for reading checkpoints as well as saving checkpoints, +which need to support different types of file paths, either disk paths, network paths or other paths. 
+ +```python +from mmcv.fileio.file_client import FileClient + +def load_checkpoint(path): + file_client = FileClient.infer(uri=path) + with io.BytesIO(file_client.get(path)) as buffer: + checkpoint = torch.load(buffer) + return checkpoint + +def save_checkpoint(checkpoint, path): + with io.BytesIO() as buffer: + torch.save(checkpoint, buffer) + file_client.put(buffer.getvalue(), path) + +file_client = FileClient.infer_client(uri=filepath1) +checkpoint = load_checkpoint(filepath1) +save_checkpoint(checkpoint, filepath2) +``` + +#### Load checkpoints from the Internet + +```{note} +Currently, it only supports reading checkpoints from the Internet, and does not support saving checkpoints to the Internet. +``` + +```python +import io +import torch +from mmcv.fileio.file_client import HTTPBackend, FileClient + +filepath = 'http://path/of/your/checkpoint.pth' +checkpoint = torch.utils.model_zoo.load_url(filepath) + +http_backend = HTTPBackend() +with io.BytesIO(http_backend.get(filepath)) as buffer: + checkpoint = torch.load(buffer) + +file_client = FileClient.infer_client(uri=filepath) +with io.BytesIO(file_client.get(filepath)) as buffer: + checkpoint = torch.load(buffer) +``` diff --git a/docs/understand_mmcv/ops.md b/docs/understand_mmcv/ops.md new file mode 100644 index 0000000..2729e44 --- /dev/null +++ b/docs/understand_mmcv/ops.md @@ -0,0 +1,37 @@ +## CUDA ops + +We implement common CUDA ops used in detection, segmentation, etc. 
+ +- AssignScoreWithK +- BallQuery +- BBoxOverlaps +- CARAFE +- CrissCrossAttention +- ContextBlock +- CornerPool +- Deformable Convolution v1/v2 +- Deformable RoIPool +- DynamicScatter +- GatherPoints +- FurthestPointSample +- FurthestPointSampleWithDist +- GeneralizedAttention +- GroupPoints +- KNN +- MaskedConv +- NMS +- PSAMask +- RoIPointPool3d +- RoIPool +- RoIAlign +- RoIAwarePool3d +- SimpleRoIAlign +- SigmoidFocalLoss +- SoftmaxFocalLoss +- SoftNMS +- Synchronized BatchNorm +- Voxelization +- ThreeInterpolate +- ThreeNN +- Weight standardization +- Correlation diff --git a/docs/understand_mmcv/registry.md b/docs/understand_mmcv/registry.md new file mode 100644 index 0000000..2cf1081 --- /dev/null +++ b/docs/understand_mmcv/registry.md @@ -0,0 +1,155 @@ +## Registry + +MMCV implements [registry](https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/registry.py) to manage different modules that share similar functionalities, e.g., backbones, head, and necks, in detectors. +Most projects in OpenMMLab use registry to manage modules of datasets and models, such as [MMDetection](https://github.com/open-mmlab/mmdetection), [MMDetection3D](https://github.com/open-mmlab/mmdetection3d), [MMClassification](https://github.com/open-mmlab/mmclassification), [MMEditing](https://github.com/open-mmlab/mmediting), etc. + +### What is registry + +In MMCV, registry can be regarded as a mapping that maps a class to a string. +These classes contained by a single registry usually have similar APIs but implement different algorithms or support different datasets. +With the registry, users can find and instantiate the class through its corresponding string, and use the instantiated module as they want. +One typical example is the config systems in most OpenMMLab projects, which use the registry to create hooks, runners, models, and datasets, through configs. 
+The API reference could be found [here](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.Registry). + +To manage your modules in the codebase by `Registry`, there are three steps as below. + +1. Create a build method (optional, in most cases you can just use the default one). +2. Create a registry. +3. Use this registry to manage the modules. + +`build_func` argument of `Registry` is to customize how to instantiate the class instance, the default one is `build_from_cfg` implemented [here](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.build_from_cfg). + +### A Simple Example + +Here we show a simple example of using registry to manage modules in a package. +You can find more practical examples in OpenMMLab projects. + +Assuming we want to implement a series of Dataset Converter for converting different formats of data to the expected data format. +We create a directory as a package named `converters`. +In the package, we first create a file to implement builders, named `converters/builder.py`, as below + +```python +from mmcv.utils import Registry +# create a registry for converters +CONVERTERS = Registry('converter') +``` + +Then we can implement different converters in the package. For example, implement `Converter1` in `converters/converter1.py` + +```python + +from .builder import CONVERTERS + +# use the registry to manage the module +@CONVERTERS.register_module() +class Converter1(object): + def __init__(self, a, b): + self.a = a + self.b = b +``` + +The key step to use registry for managing the modules is to register the implemented module into the registry `CONVERTERS` through +`@CONVERTERS.register_module()` when you are creating the module. 
By this way, a mapping between a string and the class is built and maintained by `CONVERTERS` as below + +```python +'Converter1' -> +``` + +If the module is successfully registered, you can use this converter through configs as + +```python +converter_cfg = dict(type='Converter1', a=a_value, b=b_value) +converter = CONVERTERS.build(converter_cfg) +``` + +### Customize Build Function + +Suppose we would like to customize how `converters` are built, we could implement a customized `build_func` and pass it into the registry. + +```python +from mmcv.utils import Registry + +# create a build function +def build_converter(cfg, registry, *args, **kwargs): + cfg_ = cfg.copy() + converter_type = cfg_.pop('type') + if converter_type not in registry: + raise KeyError(f'Unrecognized converter type {converter_type}') + else: + converter_cls = registry.get(converter_type) + + converter = converter_cls(*args, **kwargs, **cfg_) + return converter + +# create a registry for converters and pass ``build_converter`` function +CONVERTERS = Registry('converter', build_func=build_converter) +``` + +```{note} +In this example, we demonstrate how to use the `build_func` argument to customize the way to build a class instance. +The functionality is similar to the default `build_from_cfg`. In most cases, default one would be sufficient. +`build_model_from_cfg` is also implemented to build PyTorch module in `nn.Sequentail`, you may directly use them instead of implementing by yourself. +``` + +### Hierarchy Registry + +You could also build modules from more than one OpenMMLab frameworks, e.g. you could use all backbones in [MMClassification](https://github.com/open-mmlab/mmclassification) for object detectors in [MMDetection](https://github.com/open-mmlab/mmdetection), you may also combine an object detection model in [MMDetection](https://github.com/open-mmlab/mmdetection) and semantic segmentation model in [MMSegmentation](https://github.com/open-mmlab/mmsegmentation). 
+ +All `MODELS` registries of downstream codebases are children registries of MMCV's `MODELS` registry. +Basically, there are two ways to build a module from child or sibling registries. + +1. Build from children registries. + + For example: + + In MMDetection we define: + + ```python + from mmcv.utils import Registry + from mmcv.cnn import MODELS as MMCV_MODELS + MODELS = Registry('model', parent=MMCV_MODELS) + + @MODELS.register_module() + class NetA(nn.Module): + def forward(self, x): + return x + ``` + + In MMClassification we define: + + ```python + from mmcv.utils import Registry + from mmcv.cnn import MODELS as MMCV_MODELS + MODELS = Registry('model', parent=MMCV_MODELS) + + @MODELS.register_module() + class NetB(nn.Module): + def forward(self, x): + return x + 1 + ``` + + We could build two net in either MMDetection or MMClassification by: + + ```python + from mmdet.models import MODELS + net_a = MODELS.build(cfg=dict(type='NetA')) + net_b = MODELS.build(cfg=dict(type='mmcls.NetB')) + ``` + + or + + ```python + from mmcls.models import MODELS + net_a = MODELS.build(cfg=dict(type='mmdet.NetA')) + net_b = MODELS.build(cfg=dict(type='NetB')) + ``` + +2. Build from parent registry. + + The shared `MODELS` registry in MMCV is the parent registry for all downstream codebases (root registry): + + ```python + from mmcv.cnn import MODELS as MMCV_MODELS + net_a = MMCV_MODELS.build(cfg=dict(type='mmdet.NetA')) + net_b = MMCV_MODELS.build(cfg=dict(type='mmcls.NetB')) + ``` diff --git a/docs/understand_mmcv/runner.md b/docs/understand_mmcv/runner.md new file mode 100644 index 0000000..2e6e386 --- /dev/null +++ b/docs/understand_mmcv/runner.md @@ -0,0 +1,163 @@ +## Runner + +The runner class is designed to manage the training. It eases the training process with less code demanded from users while staying flexible and configurable. The main features are as listed: + +- Support `EpochBasedRunner` and `IterBasedRunner` for different scenarios. 
Implementing customized runners is also allowed to meet customized needs. +- Support customized workflow to allow switching between different modes while training. Currently, supported modes are train and val. +- Enable extensibility through various hooks, including hooks defined in MMCV and customized ones. + +### EpochBasedRunner + +As its name indicates, workflow in `EpochBasedRunner` should be set based on epochs. For example, [('train', 2), ('val', 1)] means running 2 epochs for training and 1 epoch for validation, iteratively. And each epoch may contain multiple iterations. Currently, MMDetection uses `EpochBasedRunner` by default. + +Let's take a look at its core logic: + +```python +# the condition to stop training +while curr_epoch < max_epochs: + # traverse the workflow. + # e.g. workflow = [('train', 2), ('val', 1)] + for i, flow in enumerate(workflow): + # mode(e.g. train) determines which function to run + mode, epochs = flow + # epoch_runner will be either self.train() or self.val() + epoch_runner = getattr(self, mode) + # execute the corresponding function + for _ in range(epochs): + epoch_runner(data_loaders[i], **kwargs) +``` + +Currently, we support 2 modes: train and val. Let's take a train function for example and have a look at its core logic: + +```python +# Currently, epoch_runner could be either train or val +def train(self, data_loader, **kwargs): + # traverse the dataset and get batch data for 1 epoch + for i, data_batch in enumerate(data_loader): + # it will execute all before_train_iter function in the hooks registered. You may want to watch out for the order. + self.call_hook('before_train_iter') + # set train_mode as False in val function + self.run_iter(data_batch, train_mode=True, **kwargs) + self.call_hook('after_train_iter') + self.call_hook('after_train_epoch') +``` + +### IterBasedRunner + +Different from `EpochBasedRunner`, workflow in `IterBasedRunner` should be set based on iterations. 
For example, [('train', 2), ('val', 1)] means running 2 iters for training and 1 iter for validation, iteratively. Currently, MMSegmentation uses `IterBasedRunner` by default. + +Let's take a look at its core logic: + +```python +# Although we set workflow by iters here, we might also need info on the epochs in some using cases. That can be provided by IterLoader. +iter_loaders = [IterLoader(x) for x in data_loaders] +# the condition to stop training +while curr_iter < max_iters: + # traverse the workflow. + # e.g. workflow = [('train', 2), ('val', 1)] + for i, flow in enumerate(workflow): + # mode(e.g. train) determines which function to run + mode, iters = flow + # iter_runner will be either self.train() or self.val() + iter_runner = getattr(self, mode) + # execute the corresponding function + for _ in range(iters): + iter_runner(iter_loaders[i], **kwargs) +``` + +Currently, we support 2 modes: train and val. Let's take a val function for example and have a look at its core logic: + +```python +# Currently, iter_runner could be either train or val +def val(self, data_loader, **kwargs): + # get batch data for 1 iter + data_batch = next(data_loader) + # it will execute all before_val_iter function in the hooks registered. You may want to watch out for the order. + self.call_hook('before_val_iter') + outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) + self.outputs = outputs + self.call_hook('after_val_iter') +``` + +Other than the basic functionalities explained above, `EpochBasedRunner` and `IterBasedRunner` provide methods such as `resume`, `save_checkpoint` and `register_hook`. In case you are not familiar with the term Hook mentioned earlier, we will also provide a tutorial about it.(coming soon...) Essentially, a hook is functionality to alter or augment the code behaviors through predefined api. It allows users to have their own code called under certain circumstances. It makes code extensible in a non-intrusive manner. 
+ +### A Simple Example + +We will walk you through the usage of runner with a classification task. The following code only contains essential steps for demonstration purposes. The following steps are necessary for any training tasks. + +**(1) Initialize dataloader, model, optimizer, etc.** + +```python +# initialize model +model=... +# initialize optimizer, typically, we set: cfg.optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001) +optimizer = build_optimizer(model, cfg.optimizer) +# initialize the dataloader corresponding to the workflow(train/val) +data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + ...) for ds in dataset + ] +``` + +**(2) Initialize runner** + +```python +runner = build_runner( + # cfg.runner is typically set as: + # runner = dict(type='EpochBasedRunner', max_epochs=200) + cfg.runner, + default_args=dict( + model=model, + batch_processor=None, + optimizer=optimizer, + logger=logger)) +``` + +**(3) Register training hooks and customized hooks.** + +```python +# register default hooks necessary for training +runner.register_training_hooks( + # configs of learning rate, it is typically set as: + # lr_config = dict(policy='step', step=[100, 150]) + cfg.lr_config, + # configuration of optimizer, e.g. grad_clip + optimizer_config, + # configuration of saving checkpoints, it is typically set as: + # checkpoint_config = dict(interval=1), saving checkpoints every epochs + cfg.checkpoint_config, + # configuration of logs + cfg.log_config, + ...) 
+ +# register customized hooks +# say we want to enable ema, then we could set custom_hooks=[dict(type='EMAHook')] +if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + for hook_cfg in cfg.custom_hooks: + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) +``` + +Then, we can use `resume` or `load_checkpoint` to load existing weights. + +**(4) Start training** + +```python +# workflow is typically set as: workflow = [('train', 1)] +# here the training begins. +runner.run(data_loaders, cfg.workflow) +``` + +Let's take `EpochBasedRunner` for example and go a little bit into details about setting workflow: + +- Say we only want to put train in the workflow, then we can set: workflow = [('train', 1)]. The runner will only execute train iteratively in this case. +- Say we want to put both train and val in the workflow, then we can set: workflow = [('train', 3), ('val',1)]. The runner will first execute train for 3 epochs and then switch to val mode and execute val for 1 epoch. The workflow will be repeated until the current epoch hit the max_epochs. +- Workflow is highly flexible. Therefore, you can set workflow = [('val', 1), ('train',1)] if you would like the runner to validate first and train after. + +The code we demonstrated above is already in `train.py` in MM repositories. Simply modify the corresponding keys in the configuration files and the script will execute the expected workflow automatically. diff --git a/docs/understand_mmcv/utils.md b/docs/understand_mmcv/utils.md new file mode 100644 index 0000000..5d5e0ad --- /dev/null +++ b/docs/understand_mmcv/utils.md @@ -0,0 +1,74 @@ +## Utils + +### ProgressBar + +If you want to apply a method to a list of items and track the progress, `track_progress` +is a good choice. It will display a progress bar to tell the progress and ETA. 
+ +```python +import mmcv + +def func(item): + # do something + pass + +tasks = [item_1, item_2, ..., item_n] + +mmcv.track_progress(func, tasks) +``` + +The output is like the following. + +![progress](../_static/progress.*) + +There is another method `track_parallel_progress`, which wraps multiprocessing and +progress visualization. + +```python +mmcv.track_parallel_progress(func, tasks, 8) # 8 workers +``` + +![progress](../_static/parallel_progress.*) + +If you want to iterate or enumerate a list of items and track the progress, `track_iter_progress` +is a good choice. It will display a progress bar to tell the progress and ETA. + +```python +import mmcv + +tasks = [item_1, item_2, ..., item_n] + +for task in mmcv.track_iter_progress(tasks): + # do something like print + print(task) + +for i, task in enumerate(mmcv.track_iter_progress(tasks)): + # do something like print + print(i) + print(task) +``` + +### Timer + +It is convenient to compute the runtime of a code block with `Timer`. + +```python +import time + +with mmcv.Timer(): + # simulate some code block + time.sleep(1) +``` + +or try with `since_start()` and `since_last_check()`. This former can +return the runtime since the timer starts and the latter will return the time +since the last time checked. 
+ +```python +timer = mmcv.Timer() +# code block 1 here +print(timer.since_start()) +# code block 2 here +print(timer.since_last_check()) +print(timer.since_start()) +``` diff --git a/docs/en/understand_mmcv/visualization.md b/docs/understand_mmcv/visualization.md similarity index 100% rename from docs/en/understand_mmcv/visualization.md rename to docs/understand_mmcv/visualization.md diff --git a/docs/zh_cn/_static/version.json b/docs/zh_cn/_static/version.json deleted file mode 100644 index 7ee4965..0000000 --- a/docs/zh_cn/_static/version.json +++ /dev/null @@ -1,575 +0,0 @@ -{ - "Linux": [ - { - "cuda": "11.7", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "11.6", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "11.6", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.5", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.0", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - 
"2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.1", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.1", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.1", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "9.2", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "9.2", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "cpu", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - } - ], - "Windows": [ - { - "cuda": "11.7", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "11.6", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "11.6", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - 
"2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.5", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.3", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "11.1", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.2", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "10.2", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.1", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "10.1", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "10.1", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "cpu", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", 
- "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - }, - { - "cuda": "cpu", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2", - "2.0.0rc1" - ] - } - ], - "macOS": [ - { - "cuda": "cpu", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "mps", - "torch": "1.13.x", - "mmcv": [ - "2.0.0rc3" - ] - }, - { - "cuda": "cpu", - "torch": "1.12.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.11.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.10.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.9.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.8.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.7.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - }, - { - "cuda": "cpu", - "torch": "1.6.x", - "mmcv": [ - "2.0.0rc3", - "2.0.0rc2" - ] - } - ] -} diff --git a/docs/zh_cn/_templates/classtemplate.rst b/docs/zh_cn/_templates/classtemplate.rst deleted file mode 100644 index 4f74842..0000000 --- a/docs/zh_cn/_templates/classtemplate.rst +++ /dev/null @@ -1,14 +0,0 @@ -.. role:: hidden - :class: hidden-section -.. currentmodule:: {{ module }} - - -{{ name | underline}} - -.. autoclass:: {{ name }} - :members: - - -.. - autogenerated from source/_templates/classtemplate.rst - note it does not have :inherited-members: diff --git a/docs/zh_cn/api/arraymisc.rst b/docs/zh_cn/api/arraymisc.rst deleted file mode 100644 index 28975eb..0000000 --- a/docs/zh_cn/api/arraymisc.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.arraymisc -=================================== - -.. contents:: mmcv.arraymisc - :depth: 2 - :local: - :backlinks: top - -.. 
currentmodule:: mmcv.arraymisc - -.. autosummary:: - :toctree: generated - :nosignatures: - - quantize - dequantize diff --git a/docs/zh_cn/api/cnn.rst b/docs/zh_cn/api/cnn.rst deleted file mode 100644 index 022191f..0000000 --- a/docs/zh_cn/api/cnn.rst +++ /dev/null @@ -1,71 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.cnn -=================================== - -.. contents:: mmcv.cnn - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.cnn - -Module ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - ContextBlock - Conv2d - Conv3d - ConvAWS2d - ConvModule - ConvTranspose2d - ConvTranspose3d - ConvWS2d - DepthwiseSeparableConvModule - GeneralizedAttention - HSigmoid - HSwish - LayerScale - Linear - MaxPool2d - MaxPool3d - NonLocal1d - NonLocal2d - NonLocal3d - Scale - Swish - Conv2dRFSearchOp - -Build Function ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - build_activation_layer - build_conv_layer - build_norm_layer - build_padding_layer - build_plugin_layer - build_upsample_layer - -Miscellaneous ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - fuse_conv_bn - conv_ws_2d - is_norm - make_res_layer - make_vgg_layer - get_model_complexity_info diff --git a/docs/zh_cn/api/image.rst b/docs/zh_cn/api/image.rst deleted file mode 100644 index 3b93484..0000000 --- a/docs/zh_cn/api/image.rst +++ /dev/null @@ -1,100 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.image -=================================== - -.. contents:: mmcv.image - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.image - -IO ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - imfrombytes - imread - imwrite - use_backend - -Color Space ----------------- - -.. 
autosummary:: - :toctree: generated - :nosignatures: - - bgr2gray - bgr2hls - bgr2hsv - bgr2rgb - bgr2ycbcr - gray2bgr - gray2rgb - hls2bgr - hsv2bgr - imconvert - rgb2bgr - rgb2gray - rgb2ycbcr - ycbcr2bgr - ycbcr2rgb - -Geometric ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - cutout - imcrop - imflip - impad - impad_to_multiple - imrescale - imresize - imresize_like - imresize_to_multiple - imrotate - imshear - imtranslate - rescale_size - -Photometric ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - adjust_brightness - adjust_color - adjust_contrast - adjust_hue - adjust_lighting - adjust_sharpness - auto_contrast - clahe - imdenormalize - imequalize - iminvert - imnormalize - lut_transform - posterize - solarize - -Miscellaneous ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - tensor2imgs diff --git a/docs/zh_cn/api/ops.rst b/docs/zh_cn/api/ops.rst deleted file mode 100644 index b029045..0000000 --- a/docs/zh_cn/api/ops.rst +++ /dev/null @@ -1,135 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.ops -=================================== - -.. contents:: mmcv.ops - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.ops - -.. 
autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - BorderAlign - CARAFE - CARAFENaive - CARAFEPack - Conv2d - ConvTranspose2d - CornerPool - Correlation - CrissCrossAttention - DeformConv2d - DeformConv2dPack - DeformRoIPool - DeformRoIPoolPack - DynamicScatter - FusedBiasLeakyReLU - GroupAll - Linear - MaskedConv2d - MaxPool2d - ModulatedDeformConv2d - ModulatedDeformConv2dPack - ModulatedDeformRoIPoolPack - MultiScaleDeformableAttention - PSAMask - PointsSampler - PrRoIPool - QueryAndGroup - RiRoIAlignRotated - RoIAlign - RoIAlignRotated - RoIAwarePool3d - RoIPointPool3d - RoIPool - SAConv2d - SigmoidFocalLoss - SimpleRoIAlign - SoftmaxFocalLoss - SparseConv2d - SparseConv3d - SparseConvTensor - SparseConvTranspose2d - SparseConvTranspose3d - SparseInverseConv2d - SparseInverseConv3d - SparseMaxPool2d - SparseMaxPool3d - SparseModule - SparseSequential - SubMConv2d - SubMConv3d - SyncBatchNorm - TINShift - Voxelization - -.. autosummary:: - :toctree: generated - :nosignatures: - - active_rotated_filter - assign_score_withk - ball_query - batched_nms - bbox_overlaps - border_align - box_iou_rotated - boxes_iou3d - boxes_iou_bev - boxes_overlap_bev - carafe - carafe_naive - chamfer_distance - contour_expand - convex_giou - convex_iou - deform_conv2d - deform_roi_pool - diff_iou_rotated_2d - diff_iou_rotated_3d - dynamic_scatter - furthest_point_sample - furthest_point_sample_with_dist - fused_bias_leakyrelu - gather_points - grouping_operation - knn - masked_conv2d - min_area_polygons - modulated_deform_conv2d - nms - nms3d - nms3d_normal - nms_bev - nms_match - nms_normal_bev - nms_rotated - pixel_group - point_sample - points_in_boxes_all - points_in_boxes_cpu - points_in_boxes_part - points_in_polygons - prroi_pool - rel_roi_point_to_rel_img_point - riroi_align_rotated - roi_align - roi_align_rotated - roi_pool - rotated_feature_align - scatter_nd - sigmoid_focal_loss - soft_nms - softmax_focal_loss - three_interpolate - 
three_nn - tin_shift - upfirdn2d - voxelization diff --git a/docs/zh_cn/api/transforms.rst b/docs/zh_cn/api/transforms.rst deleted file mode 100644 index b080133..0000000 --- a/docs/zh_cn/api/transforms.rst +++ /dev/null @@ -1,60 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.transforms -=================================== - -.. currentmodule:: mmcv.transforms - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - BaseTransform - TestTimeAug - -Loading ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - LoadAnnotations - LoadImageFromFile - -Processing ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - CenterCrop - MultiScaleFlipAug - Normalize - Pad - RandomChoiceResize - RandomFlip - RandomGrayscale - RandomResize - Resize - ToTensor - ImageToTensor - -Wrapper ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - Compose - KeyMapper - RandomApply - RandomChoice - TransformBroadcaster diff --git a/docs/zh_cn/api/utils.rst b/docs/zh_cn/api/utils.rst deleted file mode 100644 index f2ff4c2..0000000 --- a/docs/zh_cn/api/utils.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.utils -=================================== - -.. contents:: mmcv.utils - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.utils - -.. autosummary:: - :toctree: generated - :nosignatures: - - IS_CUDA_AVAILABLE - IS_MLU_AVAILABLE - IS_MPS_AVAILABLE - collect_env - jit - skip_no_elena diff --git a/docs/zh_cn/api/video.rst b/docs/zh_cn/api/video.rst deleted file mode 100644 index a6ebca0..0000000 --- a/docs/zh_cn/api/video.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.video -=================================== - -.. contents:: mmcv.video - :depth: 2 - :local: - :backlinks: top - -.. 
currentmodule:: mmcv.video - -IO ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - VideoReader - Cache - -.. autosummary:: - :toctree: generated - :nosignatures: - - frames2video - -Optical Flow ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - dequantize_flow - flow_from_bytes - flow_warp - flowread - flowwrite - quantize_flow - sparse_flow_from_bytes - -Video Processing ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - concat_video - convert_video - cut_video - resize_video diff --git a/docs/zh_cn/api/visualization.rst b/docs/zh_cn/api/visualization.rst deleted file mode 100644 index 8f43ef2..0000000 --- a/docs/zh_cn/api/visualization.rst +++ /dev/null @@ -1,50 +0,0 @@ -.. role:: hidden - :class: hidden-section - -mmcv.visualization -=================================== - -.. contents:: mmcv.visualization - :depth: 2 - :local: - :backlinks: top - -.. currentmodule:: mmcv.visualization - -Color ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - Color - -.. autosummary:: - :toctree: generated - :nosignatures: - - color_val - -Image ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - imshow - imshow_bboxes - imshow_det_bboxes - -Optical Flow ----------------- - -.. 
autosummary:: - :toctree: generated - :nosignatures: - - flow2rgb - flowshow - make_color_wheel diff --git a/docs/zh_cn/community/code_style.md b/docs/zh_cn/community/code_style.md deleted file mode 100644 index 8ddb87c..0000000 --- a/docs/zh_cn/community/code_style.md +++ /dev/null @@ -1,609 +0,0 @@ -## 代ç è§„范 - -### 代ç è§„范标准 - -#### PEP 8 —— Python 官方代ç è§„范 - -[Python 官方的代ç é£Žæ ¼æŒ‡å—](https://www.python.org/dev/peps/pep-0008/),包å«äº†ä»¥ä¸‹å‡ ä¸ªæ–¹é¢çš„内容: - -- 代ç å¸ƒå±€ï¼Œä»‹ç»äº† Python ä¸­ç©ºè¡Œã€æ–­è¡Œä»¥åŠå¯¼å…¥ç›¸å…³çš„代ç é£Žæ ¼è§„范。比如一个常è§çš„问题:当我的代ç è¾ƒé•¿ï¼Œæ— æ³•在一行写下时,何处å¯ä»¥æ–­è¡Œï¼Ÿ - -- 表达å¼ï¼Œä»‹ç»äº† Python 中表达å¼ç©ºæ ¼ç›¸å…³çš„一些风格规范。 - -- å°¾éšé€—å·ç›¸å…³çš„规范。当列表较长,无法一行写下而写æˆå¦‚下é€è¡Œåˆ—表时,推è在末项åŽåР逗å·ï¼Œä»Žè€Œä¾¿äºŽè¿½åР选项ã€ç‰ˆæœ¬æŽ§åˆ¶ç­‰ã€‚ - - ```python - # Correct: - FILES = ['setup.cfg', 'tox.ini'] - # Correct: - FILES = [ - 'setup.cfg', - 'tox.ini', - ] - # Wrong: - FILES = ['setup.cfg', 'tox.ini',] - # Wrong: - FILES = [ - 'setup.cfg', - 'tox.ini' - ] - ``` - -- 命åç›¸å…³è§„èŒƒã€æ³¨é‡Šç›¸å…³è§„范ã€ç±»åž‹æ³¨è§£ç›¸å…³è§„范,我们将在åŽç»­ç« èЂ䏭åšè¯¦ç»†ä»‹ç»ã€‚ - - "A style guide is about consistency. Consistency with this style guide is important. Consistency within a project is more important. Consistency within one module or function is the most important." 
PEP 8 -- Style Guide for Python Code - -:::{note} -PEP 8 的代ç è§„èŒƒå¹¶ä¸æ˜¯ç»å¯¹çš„,项目内的一致性è¦ä¼˜å…ˆäºŽ PEP 8 的规范。OpenMMLab å„个项目都在 setup.cfg 设定了一些代ç è§„范的设置,请éµç…§è¿™äº›è®¾ç½®ã€‚ä¸€ä¸ªä¾‹å­æ˜¯åœ¨ PEP 8 中有如下一个例å­ï¼š - -```python -# Correct: -hypot2 = x*x + y*y -# Wrong: -hypot2 = x * x + y * y -``` - -这一规范是为了指示ä¸åŒä¼˜å…ˆçº§ï¼Œä½† OpenMMLab 的设置中通常没有å¯ç”¨ yapf çš„ `ARITHMETIC_PRECEDENCE_INDICATION` 选项,因而格å¼è§„范工具ä¸ä¼šæŒ‰ç…§æŽ¨èæ ·å¼æ ¼å¼åŒ–,以设置为准。 -::: - -#### Google å¼€æºé¡¹ç›®é£Žæ ¼æŒ‡å— - -[Google 使用的编程风格指å—](https://google.github.io/styleguide/pyguide.html),包括了 Python 相关的章节。相较于 PEP 8ï¼Œè¯¥æŒ‡å—æä¾›äº†æ›´ä¸ºè¯¦å°½çš„ä»£ç æŒ‡å—。该指å—包括了语言规范和风格规范两个部分。 - -其中,语言规范对 Python 中很多语言特性进行了优缺点的分æžï¼Œå¹¶ç»™å‡ºäº†ä½¿ç”¨æŒ‡å¯¼æ„è§ï¼Œå¦‚异常ã€Lambda 表达å¼ã€åˆ—表推导å¼ã€metaclass 等。 - -风格规范的内容与 PEP 8 较为接近,大部分约定建立在 PEP 8 的基础上,也有一些更为详细的约定,如函数长度ã€TODO æ³¨é‡Šã€æ–‡ä»¶ä¸Ž socket 对象的访问等。 - -推è将该指å—作为å‚考进行开å‘,但ä¸å¿…严格éµç…§ï¼Œä¸€æ¥è¯¥æŒ‡å—存在一些 Python 2 兼容需求,例如指å—ä¸­è¦æ±‚所有无基类的类应当显å¼åœ°ç»§æ‰¿ Object, 而在仅使用 Python 3 çš„çŽ¯å¢ƒä¸­ï¼Œè¿™ä¸€è¦æ±‚是ä¸å¿…è¦çš„ï¼Œä¾æœ¬é¡¹ç›®ä¸­çš„æƒ¯ä¾‹å³å¯ã€‚äºŒæ¥ OpenMMLab 的项目作为框架级的开æºè½¯ä»¶ï¼Œä¸å¿…对一些高级技巧过于é¿è®³ï¼Œå°¤å…¶æ˜¯ MMCV。但å°è¯•使用这些技巧å‰åº”当认真考虑是å¦çœŸçš„æœ‰å¿…è¦ï¼Œå¹¶å¯»æ±‚å…¶ä»–å¼€å‘人员的广泛评估。 - -å¦å¤–éœ€è¦æ³¨æ„的一处规范是关于包的导入,在该指å—ä¸­ï¼Œè¦æ±‚导入本地包时必须使用路径全称,且导入的æ¯ä¸€ä¸ªæ¨¡å—都应当å•独æˆè¡Œï¼Œé€šå¸¸è¿™æ˜¯ä¸å¿…è¦çš„,而且也ä¸ç¬¦åˆç›®å‰é¡¹ç›®çš„开呿ƒ¯ä¾‹ï¼Œæ­¤å¤„进行如下约定: - -```python -# Correct -from mmcv.cnn.bricks import (Conv2d, build_norm_layer, DropPath, MaxPool2d, - Linear) -from ..utils import ext_loader - -# Wrong -from mmcv.cnn.bricks import Conv2d, build_norm_layer, DropPath, MaxPool2d, \ - Linear # 使用括å·è¿›è¡Œè¿žæŽ¥ï¼Œè€Œä¸æ˜¯åæ–œæ  -from ...utils import is_str # 最多å‘上回溯一层,过多的回溯容易导致结构混乱 -``` - -OpenMMLab 项目使用 pre-commit 工具自动格å¼åŒ–代ç ï¼Œè¯¦æƒ…è§[贡献代ç ](./contributing.md#代ç é£Žæ ¼)。 - -### 命å规范 - -#### 命å规范的é‡è¦æ€§ - -ä¼˜ç§€çš„å‘½åæ˜¯è‰¯å¥½ä»£ç å¯è¯»çš„基础。基础的命å规范对å„ç±»å˜é‡çš„命ååšäº†è¦æ±‚,使读者å¯ä»¥æ–¹ä¾¿åœ°æ ¹æ®ä»£ç å了解å˜é‡æ˜¯ä¸€ä¸ªç±» / 局部å˜é‡ / 全局å˜é‡ç­‰ã€‚而优秀的命å则需è¦ä»£ç 
ä½œè€…对于å˜é‡çš„功能有清晰的认识,以åŠè‰¯å¥½çš„表达能力,从而使读者根æ®å称就能了解其å«ä¹‰ï¼Œç”šè‡³å¸®åŠ©äº†è§£è¯¥æ®µä»£ç çš„功能。 - -#### 基础命å规范 - -| 类型 | 公有 | ç§æœ‰ | -| --------------- | ---------------- | ------------------ | -| æ¨¡å— | lower_with_under | \_lower_with_under | -| 包 | lower_with_under | | -| ç±» | CapWords | \_CapWords | -| 异常 | CapWordsError | | -| 函数(方法) | lower_with_under | \_lower_with_under | -| 函数 / æ–¹æ³•å‚æ•° | lower_with_under | | -| 全局 / ç±»å†…å¸¸é‡ | CAPS_WITH_UNDER | \_CAPS_WITH_UNDER | -| 全局 / 类内å˜é‡ | lower_with_under | \_lower_with_under | -| å˜é‡ | lower_with_under | \_lower_with_under | -| 局部å˜é‡ | lower_with_under | | - -注æ„: - -- å°½é‡é¿å…å˜é‡å与ä¿ç•™å­—冲çªï¼Œç‰¹æ®Šæƒ…况下如ä¸å¯é¿å…,å¯ä½¿ç”¨ä¸€ä¸ªåŽç½®ä¸‹åˆ’线,如 class\_ -- å°½é‡ä¸è¦ä½¿ç”¨è¿‡äºŽç®€å•的命å,除了约定俗æˆçš„循环å˜é‡ i,文件å˜é‡ f,错误å˜é‡ e 等。 -- ä¸ä¼šè¢«ç”¨åˆ°çš„å˜é‡å¯ä»¥å‘½å为 \_,逻辑检查器会将其忽略。 - -#### å‘½åæŠ€å·§ - -良好的å˜é‡å‘½å需è¦ä¿è¯ä¸‰ç‚¹ï¼š - -1. å«ä¹‰å‡†ç¡®ï¼Œæ²¡æœ‰æ­§ä¹‰ -2. 长短适中 -3. å‰åŽç»Ÿä¸€ - -```python -# Wrong -class Masks(metaclass=ABCMeta): # 命志 æ³•表现基类;Instance or Semantic? 
- pass - -# Correct -class BaseInstanceMasks(metaclass=ABCMeta): - pass - -# Wrong,ä¸åŒåœ°æ–¹å«ä¹‰ç›¸åŒçš„å˜é‡å°½é‡ç”¨ç»Ÿä¸€çš„命å -def __init__(self, inplanes, planes): - pass - -def __init__(self, in_channels, out_channels): - pass -``` - -常è§çš„å‡½æ•°å‘½åæ–¹æ³•: - -- åŠ¨å®¾å‘½åæ³•:crop_img, init_weights -- åŠ¨å®¾å€’ç½®å‘½åæ³•:imread, bbox_flip - -注æ„函数命åä¸Žå‚æ•°çš„顺åºï¼Œä¿è¯ä¸»è¯­åœ¨å‰ï¼Œç¬¦åˆè¯­è¨€ä¹ æƒ¯ï¼š - -- check_keys_exist(key, container) -- check_keys_contain(container, key) - -注æ„é¿å…éžå¸¸è§„或统一约定的缩写,如 nb -> num_blocks,in_nc -> in_channels - -### docstring 规范 - -#### 为什么è¦å†™ docstring - -docstring 是对一个类ã€ä¸€ä¸ªå‡½æ•°åŠŸèƒ½ä¸Ž API 接å£çš„详细æè¿°ï¼Œæœ‰ä¸¤ä¸ªåŠŸèƒ½ï¼Œä¸€æ˜¯å¸®åŠ©å…¶ä»–å¼€å‘者了解代ç åŠŸèƒ½ï¼Œæ–¹ä¾¿ debug å’Œå¤ç”¨ä»£ç ï¼›äºŒæ˜¯åœ¨ Readthedocs 文档中自动生æˆç›¸å…³çš„ API reference 文档,帮助ä¸äº†è§£æºä»£ç çš„社区用户使用相关功能。 - -#### 如何写 docstring - -与注释ä¸åŒï¼Œä¸€ä»½è§„范的 docstring 有ç€ä¸¥æ ¼çš„æ ¼å¼è¦æ±‚,以便于 Python è§£é‡Šå™¨ä»¥åŠ sphinx 进行文档解æžï¼Œè¯¦ç»†çš„ docstring 约定å‚è§ [PEP 257](https://www.python.org/dev/peps/pep-0257/)。此处以例å­çš„å½¢å¼ä»‹ç»å„ç§æ–‡æ¡£çš„æ ‡å‡†æ ¼å¼ï¼Œå‚考格å¼ä¸º [Google 风格](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/#comments)。 - -1. æ¨¡å—æ–‡æ¡£ - - 代ç é£Žæ ¼è§„范推è为æ¯ä¸€ä¸ªæ¨¡å—ï¼ˆå³ Python 文件)编写一个 docstringï¼Œä½†ç›®å‰ OpenMMLab 项目大部分没有此类 docstring,因此ä¸åšç¡¬æ€§è¦æ±‚。 - - ```python - """A one line summary of the module or program, terminated by a period. - - Leave one blank line. The rest of this docstring should contain an - overall description of the module or program. Optionally, it may also - contain a brief description of exported classes and functions and/or usage - examples. - - Typical usage example: - - foo = ClassFoo() - bar = foo.FunctionBar() - """ - ``` - -2. 
类文档 - - 类文档是我们最常需è¦ç¼–写的,此处,按照 OpenMMLab 的惯例,我们使用了与 Google 风格ä¸åŒçš„写法。如下例所示,文档中没有使用 Attributes æè¿°ç±»å±žæ€§ï¼Œè€Œæ˜¯ä½¿ç”¨ Args æè¿° __init__ å‡½æ•°çš„å‚æ•°ã€‚ - - 在 Args 中,éµç…§ `parameter (type): Description.` 的格å¼ï¼Œæè¿°æ¯ä¸€ä¸ªå‚数类型和功能。其中,多ç§ç±»åž‹å¯ä½¿ç”¨ `(float or str)` 的写法,å¯ä»¥ä¸º None çš„å‚æ•°å¯ä»¥å†™ä¸º `(int, optional)`。 - - ```python - class BaseRunner(metaclass=ABCMeta): - """The base class of Runner, a training helper for PyTorch. - - All subclasses should implement the following APIs: - - - ``run()`` - - ``train()`` - - ``val()`` - - ``save_checkpoint()`` - - Args: - model (:obj:`torch.nn.Module`): The model to be run. - batch_processor (callable, optional): A callable method that process - a data batch. The interface of this method should be - ``batch_processor(model, data, train_mode) -> dict``. - Defaults to None. - optimizer (dict or :obj:`torch.optim.Optimizer`, optional): It can be - either an optimizer (in most cases) or a dict of optimizers - (in models that requires more than one optimizer, e.g., GAN). - Defaults to None. - work_dir (str, optional): The working directory to save checkpoints - and logs. Defaults to None. - logger (:obj:`logging.Logger`): Logger used during training. - Defaults to None. (The default value is just for backward - compatibility) - meta (dict, optional): A dict records some import information such as - environment info and seed, which will be logged in logger hook. - Defaults to None. - max_epochs (int, optional): Total training epochs. Defaults to None. - max_iters (int, optional): Total training iterations. Defaults to None. - """ - - def __init__(self, - model, - batch_processor=None, - optimizer=None, - work_dir=None, - logger=None, - meta=None, - max_iters=None, - max_epochs=None): - ... - ``` - - å¦å¤–,在一些算法实现的主体类中,建议加入原论文的链接;如果å‚考了其他开æºä»£ç çš„实现,则应加入 modified from,而如果是直接å¤åˆ¶äº†å…¶ä»–代ç åº“的实现,则应加入 copied from ï¼Œå¹¶æ³¨æ„æºç çš„ License。如有必è¦ï¼Œä¹Ÿå¯ä»¥é€šè¿‡ .. 
math:: æ¥åŠ å…¥æ•°å­¦å…¬å¼ - - ```python - # å‚考实现 - # This func is modified from `detectron2 - # `_. - - # å¤åˆ¶ä»£ç  - # This code was copied from the `ubelt - # library`_. - - # 引用论文 & æ·»åŠ å…¬å¼ - class LabelSmoothLoss(nn.Module): - r"""Initializer for the label smoothed cross entropy loss. - - Refers to `Rethinking the Inception Architecture for Computer Vision - `_. - - This decreases gap between output scores and encourages generalization. - Labels provided to forward can be one-hot like vectors (NxC) or class - indices (Nx1). - And this accepts linear combination of one-hot like labels from mixup or - cutmix except multi-label task. - - Args: - label_smooth_val (float): The degree of label smoothing. - num_classes (int, optional): Number of classes. Defaults to None. - mode (str): Refers to notes, Options are "original", "classy_vision", - "multi_label". Defaults to "classy_vision". - reduction (str): The method used to reduce the loss. - Options are "none", "mean" and "sum". Defaults to 'mean'. - loss_weight (float): Weight of the loss. Defaults to 1.0. - - Note: - if the ``mode`` is "original", this will use the same label smooth - method as the original paper as: - - .. math:: - (1-\epsilon)\delta_{k, y} + \frac{\epsilon}{K} - - where :math:`\epsilon` is the ``label_smooth_val``, :math:`K` is - the ``num_classes`` and :math:`\delta_{k,y}` is Dirac delta, - which equals 1 for k=y and 0 otherwise. - - if the ``mode`` is "classy_vision", this will use the same label - smooth method as the `facebookresearch/ClassyVision - `_ repo as: - - .. math:: - \frac{\delta_{k, y} + \epsilon/K}{1+\epsilon} - - if the ``mode`` is "multi_label", this will accept labels from - multi-label task and smoothing them as: - - .. 
math:: - (1-2\epsilon)\delta_{k, y} + \epsilon - ``` - -```{note} -æ³¨æ„ \`\`here\`\`ã€\`here\`ã€"here" 三ç§å¼•å·åŠŸèƒ½æ˜¯ä¸åŒã€‚ - -在 reStructured 语法中,\`\`here\`\` 表示一段代ç ï¼›\`here\` 表示斜体;"here" 无特殊å«ä¹‰ï¼Œä¸€èˆ¬å¯ç”¨æ¥è¡¨ç¤ºå­—符串。其中 \`here\` 的用法与 Markdown 中ä¸åŒï¼Œéœ€è¦å¤šåŠ ç•™æ„。 -å¦å¤–还有 :obj:\`type\` è¿™ç§æ›´è§„范的表示类的写法,但鉴于长度,ä¸åšç‰¹åˆ«è¦æ±‚,一般仅用于表示éžå¸¸ç”¨ç±»åž‹ã€‚ -``` - -3. 方法(函数)文档 - - 函数文档与类文档的结构基本一致,但需è¦åŠ å…¥è¿”å›žå€¼æ–‡æ¡£ã€‚å¯¹äºŽè¾ƒä¸ºå¤æ‚的函数和类,å¯ä»¥ä½¿ç”¨ Examples 字段加入示例;如果需è¦å¯¹å‚数加入一些较长的备注,å¯ä»¥åŠ å…¥ Note 字段进行说明。 - - å¯¹äºŽä½¿ç”¨è¾ƒä¸ºå¤æ‚çš„ç±»æˆ–å‡½æ•°ï¼Œæ¯”èµ·çœ‹å¤§æ®µå¤§æ®µçš„è¯´æ˜Žæ–‡å­—å’Œå‚æ•°æ–‡æ¡£ï¼Œæ·»åŠ åˆé€‚çš„ç¤ºä¾‹æ›´èƒ½å¸®åŠ©ç”¨æˆ·è¿…é€Ÿäº†è§£å…¶ç”¨æ³•ã€‚éœ€è¦æ³¨æ„的是,这些示例最好是能够直接在 Python 交互å¼çŽ¯å¢ƒä¸­è¿è¡Œçš„,并给出一些相对应的结果。如果存在多个示例,å¯ä»¥ä½¿ç”¨æ³¨é‡Šç®€å•è¯´æ˜Žæ¯æ®µç¤ºä¾‹ï¼Œä¹Ÿèƒ½èµ·åˆ°åˆ†éš”作用。 - - ```python - def import_modules_from_strings(imports, allow_failed_imports=False): - """Import modules from the given list of strings. - - Args: - imports (list | str | None): The given module names to be imported. - allow_failed_imports (bool): If True, the failed imports will return - None. Otherwise, an ImportError is raise. Defaults to False. - - Returns: - List[module] | module | None: The imported modules. - All these three lines in docstring will be compiled into the same - line in readthedocs. - - Examples: - >>> osp, sys = import_modules_from_strings( - ... ['os.path', 'sys']) - >>> import os.path as osp_ - >>> import sys as sys_ - >>> assert osp == osp_ - >>> assert sys == sys_ - """ - ... - ``` - - 如果函数接å£åœ¨æŸä¸ªç‰ˆæœ¬å‘生了å˜åŒ–,需è¦åœ¨ docstring ä¸­åŠ å…¥ç›¸å…³çš„è¯´æ˜Žï¼Œå¿…è¦æ—¶æ·»åŠ  Note 或者 Warning 进行说明,例如: - - ```python - class CheckpointHook(Hook): - """Save checkpoints periodically. - - Args: - out_dir (str, optional): The root directory to save checkpoints. If - not specified, ``runner.work_dir`` will be used by default. 
If - specified, the ``out_dir`` will be the concatenation of - ``out_dir`` and the last level directory of ``runner.work_dir``. - Defaults to None. `Changed in version 1.3.15.` - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmcv.fileio.FileClient` for details. - Defaults to None. `New in version 1.3.15.` - - Warning: - Before v1.3.15, the ``out_dir`` argument indicates the path where the - checkpoint is stored. However, in v1.3.15 and later, ``out_dir`` - indicates the root directory and the final path to save checkpoint is - the concatenation of out_dir and the last level directory of - ``runner.work_dir``. Suppose the value of ``out_dir`` is - "/path/of/A" and the value of ``runner.work_dir`` is "/path/of/B", - then the final path will be "/path/of/A/B". - ``` - - å¦‚æžœå‚æ•°æˆ–返回值里带有需è¦å±•å¼€æè¿°å­—段的 dict,则应该采用如下格å¼ï¼š - - ```python - def func(x): - r""" - Args: - x (None): A dict with 2 keys, ``padded_targets``, and ``targets``. - - - ``targets`` (list[Tensor]): A list of tensors. - Each tensor has the shape of :math:`(T_i)`. Each - element is the index of a character. - - ``padded_targets`` (Tensor): A tensor of shape :math:`(N)`. - Each item is the length of a word. - - Returns: - dict: A dict with 2 keys, ``padded_targets``, and ``targets``. - - - ``targets`` (list[Tensor]): A list of tensors. - Each tensor has the shape of :math:`(T_i)`. Each - element is the index of a character. - - ``padded_targets`` (Tensor): A tensor of shape :math:`(N)`. - Each item is the length of a word. 
- """ - return x - ``` - -```{important} -ä¸ºäº†ç”Ÿæˆ readthedocs æ–‡æ¡£ï¼Œæ–‡æ¡£çš„ç¼–å†™éœ€è¦æŒ‰ç…§ ReStructrued 文档格å¼ï¼Œå¦åˆ™ä¼šäº§ç”Ÿæ–‡æ¡£æ¸²æŸ“错误,在æäº¤ PR å‰ï¼Œæœ€å¥½ç”Ÿæˆå¹¶é¢„览一下文档效果。 -语法规范å‚考: - -- [reStructuredText Primer - Sphinx documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#) -- [Example Google Style Python Docstrings ‒ napoleon 0.7 documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html#example-google) -``` - -### 注释规范 - -#### 为什么è¦å†™æ³¨é‡Š - -对于一个开æºé¡¹ç›®ï¼Œå›¢é˜Ÿåˆä½œä»¥åŠç¤¾åŒºä¹‹é—´çš„åˆä½œæ˜¯å¿…ä¸å¯å°‘的,因而尤其è¦é‡è§†åˆç†çš„æ³¨é‡Šã€‚ä¸å†™æ³¨é‡Šçš„代ç ï¼Œå¾ˆæœ‰å¯èƒ½è¿‡å‡ ä¸ªæœˆè‡ªå·±ä¹Ÿéš¾ä»¥ç†è§£ï¼Œé€ æˆé¢å¤–çš„é˜…è¯»å’Œä¿®æ”¹æˆæœ¬ã€‚ - -#### 如何写注释 - -最需è¦å†™æ³¨é‡Šçš„æ˜¯ä»£ç ä¸­é‚£äº›æŠ€å·§æ€§çš„部分。如果你在下次代ç å®¡æŸ¥çš„æ—¶å€™å¿…é¡»è§£é‡Šä¸€ä¸‹ï¼Œé‚£ä¹ˆä½ åº”è¯¥çŽ°åœ¨å°±ç»™å®ƒå†™æ³¨é‡Šã€‚å¯¹äºŽå¤æ‚çš„æ“作,应该在其æ“作开始å‰å†™ä¸Šè‹¥å¹²è¡Œæ³¨é‡Šã€‚å¯¹äºŽä¸æ˜¯ä¸€ç›®äº†ç„¶çš„代ç ï¼Œåº”在其行尾添加注释。 -—— Google å¼€æºé¡¹ç›®é£Žæ ¼æŒ‡å— - -```python -# We use a weighted dictionary search to find out where i is in -# the array. We extrapolate position based on the largest num -# in the array and the array size and then do binary search to -# get the exact number. -if i & (i-1) == 0: # True if i is 0 or a power of 2. -``` - -为了æé«˜å¯è¯»æ€§, 注释应该至少离开代ç 2个空格. -å¦ä¸€æ–¹é¢, ç»ä¸è¦æè¿°ä»£ç . å‡è®¾é˜…读代ç çš„人比你更懂Python, ä»–åªæ˜¯ä¸çŸ¥é“你的代ç è¦åšä»€ä¹ˆ. -—— Google å¼€æºé¡¹ç›®é£Žæ ¼æŒ‡å— - -```python -# Wrong: -# Now go through the b array and make sure whenever i occurs -# the next element is i+1 - -# Wrong: -if i & (i-1) == 0: # True if i bitwise and i-1 is 0. 
-``` - -在注释中,å¯ä»¥ä½¿ç”¨ Markdown 语法,因为开å‘人员通常熟悉 Markdown 语法,这样å¯ä»¥ä¾¿äºŽäº¤æµç†è§£ï¼Œå¦‚å¯ä½¿ç”¨å•å引å·è¡¨ç¤ºä»£ç å’Œå˜é‡ï¼ˆæ³¨æ„ä¸è¦å’Œ docstring 中的 ReStructured 语法混淆) - -```python -# `_reversed_padding_repeated_twice` is the padding to be passed to -# `F.pad` if needed (e.g., for non-zero padding types that are -# implemented as two ops: padding + conv). `F.pad` accepts paddings in -# reverse order than the dimension. -self._reversed_padding_repeated_twice = _reverse_repeat_tuple(self.padding, 2) -``` - -#### 注释示例 - -1. 出自 `mmcv/utils/registry.py`ï¼Œå¯¹äºŽè¾ƒä¸ºå¤æ‚的逻辑结构,通过注释,明确了优先级关系。 - - ```python - # self.build_func will be set with the following priority: - # 1. build_func - # 2. parent.build_func - # 3. build_from_cfg - if build_func is None: - if parent is not None: - self.build_func = parent.build_func - else: - self.build_func = build_from_cfg - else: - self.build_func = build_func - ``` - -2. 出自 `mmcv/runner/checkpoint.py`,对于 bug ä¿®å¤ä¸­çš„一些特殊处ç†ï¼Œå¯ä»¥é™„带相关的 issue 链接,帮助其他人了解 bug 背景。 - - ```python - def _save_ckpt(checkpoint, file): - # The 1.6 release of PyTorch switched torch.save to use a new - # zipfile-based file format. It will cause RuntimeError when a - # checkpoint was saved in high version (PyTorch version>=1.6.0) but - # loaded in low version (PyTorch version<1.6.0). 
More details at - # https://github.com/open-mmlab/mmpose/issues/904 - if digit_version(TORCH_VERSION) >= digit_version('1.6.0'): - torch.save(checkpoint, file, _use_new_zipfile_serialization=False) - else: - torch.save(checkpoint, file) - ``` - -### 类型注解 - -#### 为什么è¦å†™ç±»åž‹æ³¨è§£ - -类型注解是对函数中å˜é‡çš„类型åšé™å®šæˆ–æç¤ºï¼Œä¸ºä»£ç çš„安全性æä¾›ä¿éšœã€å¢žå¼ºä»£ç çš„å¯è¯»æ€§ã€é¿å…出现类型相关的错误。 -Python 没有对类型åšå¼ºåˆ¶é™åˆ¶ï¼Œç±»åž‹æ³¨è§£åªèµ·åˆ°ä¸€ä¸ªæç¤ºä½œç”¨ï¼Œé€šå¸¸ä½ çš„ IDE 会解æžè¿™äº›ç±»åž‹æ³¨è§£ï¼Œç„¶åŽåœ¨ä½ è°ƒç”¨ç›¸å…³ä»£ç æ—¶å¯¹ç±»åž‹åšæç¤ºã€‚å¦å¤–也有类型注解检查工具,这些工具会根æ®ç±»åž‹æ³¨è§£ï¼Œå¯¹ä»£ç ä¸­å¯èƒ½å‡ºçŽ°çš„é—®é¢˜è¿›è¡Œæ£€æŸ¥ï¼Œå‡å°‘ bug 的出现。 -éœ€è¦æ³¨æ„的是,通常我们ä¸éœ€è¦æ³¨é‡Šæ¨¡å—中的所有函数: - -1. 公共的 API éœ€è¦æ³¨é‡Š -2. 在代ç çš„å®‰å…¨æ€§ï¼Œæ¸…æ™°æ€§å’Œçµæ´»æ€§ä¸Šè¿›è¡Œæƒè¡¡æ˜¯å¦æ³¨é‡Š -3. 对于容易出现类型相关的错误的代ç è¿›è¡Œæ³¨é‡Š -4. 难以ç†è§£çš„代ç è¯·è¿›è¡Œæ³¨é‡Š -5. 若代ç ä¸­çš„类型已ç»ç¨³å®šï¼Œå¯ä»¥è¿›è¡Œæ³¨é‡Š. 对于一份æˆç†Ÿçš„代ç ï¼Œå¤šæ•°æƒ…况下,å³ä½¿æ³¨é‡Šäº†æ‰€æœ‰çš„函数,也ä¸ä¼šä¸§å¤±å¤ªå¤šçš„çµæ´»æ€§. - -#### 如何写类型注解 - -1. 
函数 / 方法类型注解,通常ä¸å¯¹ self å’Œ cls 注释。 - - ```python - from typing import Optional, List, Tuple - - # 全部ä½äºŽä¸€è¡Œ - def my_method(self, first_var: int) -> int: - pass - - # å¦èµ·ä¸€è¡Œ - def my_method( - self, first_var: int, - second_var: float) -> Tuple[MyLongType1, MyLongType1, MyLongType1]: - pass - - # å•独æˆè¡Œï¼ˆå…·ä½“的应用场åˆä¸Žè¡Œå®½æœ‰å…³ï¼Œå»ºè®®ç»“åˆ yapf 自动化格å¼ä½¿ç”¨ï¼‰ - def my_method( - self, first_var: int, second_var: float - ) -> Tuple[MyLongType1, MyLongType1, MyLongType1]: - pass - - # 引用尚未被定义的类型 - class MyClass: - def __init__(self, - stack: List["MyClass"]) -> None: - pass - ``` - - 注:类型注解中的类型å¯ä»¥æ˜¯ Python 内置类型,也å¯ä»¥æ˜¯è‡ªå®šä¹‰ç±»ï¼Œè¿˜å¯ä»¥ä½¿ç”¨ Python æä¾›çš„ wrapper 类对类型注解进行装饰,一些常è§çš„æ³¨è§£å¦‚下: - - ```python - # 数值类型 - from numbers import Number - - # å¯é€‰ç±»åž‹ï¼ŒæŒ‡å‚æ•°å¯ä»¥ä¸º None - from typing import Optional - def foo(var: Optional[int] = None): - pass - - # è”åˆç±»åž‹ï¼ŒæŒ‡åŒæ—¶æŽ¥å—多ç§ç±»åž‹ - from typing import Union - def foo(var: Union[float, str]): - pass - - from typing import Sequence # åºåˆ—类型 - from typing import Iterable # å¯è¿­ä»£ç±»åž‹ - from typing import Any # ä»»æ„类型 - from typing import Callable # å¯è°ƒç”¨ç±»åž‹ - - from typing import List, Dict # 列表和字典的泛型类型 - from typing import Tuple # å…ƒç»„çš„ç‰¹æ®Šæ ¼å¼ - # 虽然在 Python 3.9 中,list, tuple å’Œ dict æœ¬èº«å·²æ”¯æŒæ³›åž‹ï¼Œä½†ä¸ºäº†æ”¯æŒä¹‹å‰çš„版本 - # 我们在进行类型注解时还是需è¦ä½¿ç”¨ List, Tuple, Dict 类型 - # å¦å¤–ï¼Œåœ¨å¯¹å‚æ•°ç±»åž‹è¿›è¡Œæ³¨è§£æ—¶ï¼Œå°½é‡ä½¿ç”¨ Sequence & Iterable & Mapping - # List, Tuple, Dict 主è¦ç”¨äºŽè¿”回值类型注解 - # å‚è§ https://docs.python.org/3/library/typing.html#typing.List - ``` - -2. å˜é‡ç±»åž‹æ³¨è§£ï¼Œä¸€èˆ¬ç”¨äºŽéš¾ä»¥ç›´æŽ¥æŽ¨æ–­å…¶ç±»åž‹æ—¶ - - ```python - # Recommend: 带类型注解的赋值 - a: Foo = SomeUndecoratedFunction() - a: List[int]: [1, 2, 3] # List åªæ”¯æŒå•一类型泛型,å¯ä½¿ç”¨ Union - b: Tuple[int, int] = (1, 2) # 长度固定为 2 - c: Tuple[int, ...] 
= (1, 2, 3) # å˜é•¿ - d: Dict[str, int] = {'a': 1, 'b': 2} - - # Not Recommend:行尾类型注释 - # è™½ç„¶è¿™ç§æ–¹å¼è¢«å†™åœ¨äº† Google å¼€æºæŒ‡å—中,但这是一ç§ä¸ºäº†æ”¯æŒ Python 2.7 版本 - # 而补充的注释方å¼ï¼Œé‰´äºŽæˆ‘ä»¬åªæ”¯æŒ Python 3, ä¸ºäº†é£Žæ ¼ç»Ÿä¸€ï¼Œä¸æŽ¨èä½¿ç”¨è¿™ç§æ–¹å¼ã€‚ - a = SomeUndecoratedFunction() # type: Foo - a = [1, 2, 3] # type: List[int] - b = (1, 2, 3) # type: Tuple[int, ...] - c = (1, "2", 3.5) # type: Tuple[int, Text, float] - ``` - -3. 泛型 - - 上文中我们知é“,typing 中æä¾›äº† list å’Œ dict 的泛型类型,那么我们自己是å¦å¯ä»¥å®šä¹‰ç±»ä¼¼çš„æ³›åž‹å‘¢ï¼Ÿ - - ```python - from typing import TypeVar, Generic - - KT = TypeVar('KT') - VT = TypeVar('VT') - - class Mapping(Generic[KT, VT]): - def __init__(self, data: Dict[KT, VT]): - self._data = data - - def __getitem__(self, key: KT) -> VT: - return self._data[key] - ``` - - 使用上述方法,我们定义了一个拥有泛型能力的映射类,实际用法如下: - - ```python - mapping = Mapping[str, float]({'a': 0.5}) - value: float = example['a'] - ``` - - å¦å¤–,我们也å¯ä»¥åˆ©ç”¨ TypeVar 在函数签å中指定è”动的多个类型: - - ```python - from typing import TypeVar, List - - T = TypeVar('T') # Can be anything - A = TypeVar('A', str, bytes) # Must be str or bytes - - - def repeat(x: T, n: int) -> List[T]: - """Return a list containing n references to x.""" - return [x]*n - - - def longest(x: A, y: A) -> A: - """Return the longest of two strings.""" - return x if len(x) >= len(y) else y - ``` - -更多关于类型注解的写法请å‚考 [typing](https://docs.python.org/3/library/typing.html)。 - -#### 类型注解检查工具 - -[mypy](https://mypy.readthedocs.io/en/stable/) 是一个 Python 陿€ç±»åž‹æ£€æŸ¥å·¥å…·ã€‚æ ¹æ®ä½ çš„类型注解,mypy 会检查传å‚ã€èµ‹å€¼ç­‰æ“作是å¦ç¬¦åˆç±»åž‹æ³¨è§£ï¼Œä»Žè€Œé¿å…å¯èƒ½å‡ºçŽ°çš„ bug。 - -例如如下的一个 Python 脚本文件 test.py: - -```python -def foo(var: int) -> float: - return float(var) - -a: str = foo('2.0') -b: int = foo('3.0') # type: ignore -``` - -è¿è¡Œ mypy test.py å¯ä»¥å¾—到如下检查结果,分别指出了第 4 行在函数调用和返回值赋值两处类型错误。而第 5 è¡ŒåŒæ ·å­˜åœ¨ä¸¤ä¸ªç±»åž‹é”™è¯¯ï¼Œç”±äºŽä½¿ç”¨äº† type: ignore è€Œè¢«å¿½ç•¥äº†ï¼Œåªæœ‰éƒ¨åˆ†ç‰¹æ®Šæƒ…况å¯èƒ½éœ€è¦æ­¤ç±»å¿½ç•¥ã€‚ - -``` 
-test.py:4: error: Incompatible types in assignment (expression has type "float", variable has type "int") -test.py:4: error: Argument 1 to "foo" has incompatible type "str"; expected "int" -Found 2 errors in 1 file (checked 1 source file) -``` diff --git a/docs/zh_cn/community/contributing.md b/docs/zh_cn/community/contributing.md deleted file mode 100644 index e3aa781..0000000 --- a/docs/zh_cn/community/contributing.md +++ /dev/null @@ -1,278 +0,0 @@ -## è´¡çŒ®ä»£ç  - -欢迎加入 MMCV ç¤¾åŒºï¼Œæˆ‘ä»¬è‡´åŠ›äºŽæ‰“é€ æœ€å‰æ²¿çš„计算机视觉基础库,我们欢迎任何类型的贡献,包括但ä¸é™äºŽ - -**ä¿®å¤é”™è¯¯** - -ä¿®å¤ä»£ç å®žçŽ°é”™è¯¯çš„æ­¥éª¤å¦‚ä¸‹ï¼š - -1. 如果æäº¤çš„ä»£ç æ”¹åŠ¨è¾ƒå¤§ï¼Œå»ºè®®å…ˆæäº¤ issue,并正确æè¿° issue 的现象ã€åŽŸå› å’Œå¤çŽ°æ–¹å¼ï¼Œè®¨è®ºåŽç¡®è®¤ä¿®å¤æ–¹æ¡ˆã€‚ -2. ä¿®å¤é”™è¯¯å¹¶è¡¥å……相应的å•元测试,æäº¤æ‹‰å–请求。 - -**新增功能或组件** - -1. å¦‚æžœæ–°åŠŸèƒ½æˆ–æ¨¡å—æ¶‰åŠè¾ƒå¤§çš„ä»£ç æ”¹åŠ¨ï¼Œå»ºè®®å…ˆæäº¤ issueï¼Œç¡®è®¤åŠŸèƒ½çš„å¿…è¦æ€§ã€‚ -2. 实现新增功能并添å•元测试,æäº¤æ‹‰å–请求。 - -**文档补充** - -ä¿®å¤æ–‡æ¡£å¯ä»¥ç›´æŽ¥æäº¤æ‹‰å–请求 - -添加文档或将文档翻译æˆå…¶ä»–语言步骤如下 - -1. æäº¤ issueï¼Œç¡®è®¤æ·»åŠ æ–‡æ¡£çš„å¿…è¦æ€§ã€‚ -2. 添加文档,æäº¤æ‹‰å–请求。 - -### 拉å–è¯·æ±‚å·¥ä½œæµ - -如果你对拉å–请求ä¸äº†è§£ï¼Œæ²¡å…³ç³»ï¼ŒæŽ¥ä¸‹æ¥çš„内容将会从零开始,一步一步地指引你如何创建一个拉å–请求。如果你想深入了解拉å–è¯·æ±‚çš„å¼€å‘æ¨¡å¼ï¼Œå¯ä»¥å‚考 github [官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) - -#### 1. 
å¤åˆ»ä»“库 - -当你第一次æäº¤æ‹‰å–请求时,先å¤åˆ» OpenMMLab 原代ç åº“,点击 GitHub 页é¢å³ä¸Šè§’çš„ **Fork** 按钮,å¤åˆ»åŽçš„代ç åº“将会出现在你的 GitHub 个人主页下。 - - - -将代ç å…‹éš†åˆ°æœ¬åœ° - -```shell -git clone git@github.com:{username}/mmcv.git -``` - -添加原代ç åº“为上游代ç åº“ - -```bash -git remote add upstream git@github.com:open-mmlab/mmcv -``` - -检查 remote æ˜¯å¦æ·»åŠ æˆåŠŸï¼Œåœ¨ç»ˆç«¯è¾“å…¥ `git remote -v` - -```bash -origin git@github.com:{username}/mmcv.git (fetch) -origin git@github.com:{username}/mmcv.git (push) -upstream git@github.com:open-mmlab/mmcv (fetch) -upstream git@github.com:open-mmlab/mmcv (push) -``` - -```{note} -这里对 origin å’Œ upstream 进行一个简å•的介ç»ï¼Œå½“我们使用 git clone æ¥å…‹éš†ä»£ç æ—¶ï¼Œä¼šé»˜è®¤åˆ›å»ºä¸€ä¸ª origin çš„ remoteï¼Œå®ƒæŒ‡å‘æˆ‘们克隆的代ç åº“地å€ï¼Œè€Œ upstream åˆ™æ˜¯æˆ‘ä»¬è‡ªå·±æ·»åŠ çš„ï¼Œç”¨æ¥æŒ‡å‘原始代ç åº“地å€ã€‚当然如果你ä¸å–œæ¬¢ä»–å« upstream,也å¯ä»¥è‡ªå·±ä¿®æ”¹ï¼Œæ¯”å¦‚å« open-mmlabã€‚æˆ‘ä»¬é€šå¸¸å‘ origin æäº¤ä»£ç ï¼ˆå³ fork 下æ¥çš„远程仓库),然åŽå‘ upstream æäº¤ä¸€ä¸ª pull request。如果æäº¤çš„代ç å’Œæœ€æ–°çš„代ç å‘生冲çªï¼Œå†ä»Ž upstream æ‹‰å–æœ€æ–°çš„代ç ï¼Œå’Œæœ¬åœ°åˆ†æ”¯è§£å†³å†²çªï¼Œå†æäº¤åˆ° origin。 -``` - -#### 2. 
é…ç½® pre-commit - -在本地开å‘环境中,我们使用 [pre-commit](https://pre-commit.com/#intro) æ¥æ£€æŸ¥ä»£ç é£Žæ ¼ï¼Œä»¥ç¡®ä¿ä»£ç é£Žæ ¼çš„统一。在æäº¤ä»£ç ï¼Œéœ€è¦å…ˆå®‰è£… pre-commit(需è¦åœ¨ MMCV 目录下执行): - -```shell -pip install -U pre-commit -pre-commit install -``` - -检查 pre-commit 是å¦é…ç½®æˆåŠŸï¼Œå¹¶å®‰è£… `.pre-commit-config.yaml` 中的钩å­ï¼š - -```shell -pre-commit run --all-files -``` - - - - - -```{note} -如果你是中国用户,由于网络原因,å¯èƒ½ä¼šå‡ºçŽ°å®‰è£…å¤±è´¥çš„æƒ…å†µï¼Œè¿™æ—¶å¯ä»¥ä½¿ç”¨å›½å†…æº - -pre-commit install -c .pre-commit-config-zh-cn.yaml - -pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml -``` - -如果安装过程被中断,å¯ä»¥é‡å¤æ‰§è¡Œ `pre-commit run ...` 继续安装。 - -如果æäº¤çš„代ç ä¸ç¬¦åˆä»£ç é£Žæ ¼è§„范,pre-commit 会å‘出警告,并自动修å¤éƒ¨åˆ†é”™è¯¯ã€‚ - - - -如果我们想临时绕开 pre-commit 的检查æäº¤ä¸€æ¬¡ä»£ç ï¼Œå¯ä»¥åœ¨ `git commit` 时加上 `--no-verify`(需è¦ä¿è¯æœ€å޿ލé€è‡³è¿œç¨‹ä»“库的代ç èƒ½å¤Ÿé€šè¿‡ pre-commit 检查)。 - -```shell -git commit -m "xxx" --no-verify -``` - -#### 3. 创建开å‘分支 - -安装完 pre-commit 之åŽï¼Œæˆ‘们需è¦åŸºäºŽ master 创建开å‘分支,建议的分支命å规则为 `username/pr_name`。 - -```shell -git checkout -b yhc/refactor_contributing_doc -``` - -在åŽç»­çš„å¼€å‘中,如果本地仓库的 master 分支è½åŽäºŽ upstream çš„ master 分支,我们需è¦å…ˆæ‹‰å– upstream 的代ç è¿›è¡ŒåŒæ­¥ï¼Œå†æ‰§è¡Œä¸Šé¢çš„命令 - -```shell -git pull upstream master -``` - -#### 4. æäº¤ä»£ç å¹¶åœ¨æœ¬åœ°é€šè¿‡å•元测试 - -- MMCV 引入了 mypy æ¥åšé™æ€ç±»åž‹æ£€æŸ¥ï¼Œä»¥å¢žåР代ç çš„鲿£’性。因此我们在æäº¤ä»£ç æ—¶ï¼Œéœ€è¦è¡¥å…… Type Hints。具体规则å¯ä»¥å‚考[教程](https://zhuanlan.zhihu.com/p/519335398)。 - -- æäº¤çš„代ç åŒæ ·éœ€è¦é€šè¿‡å•元测试 - - ```shell - # 通过全é‡å•元测试 - pytest tests - - # 我们需è¦ä¿è¯æäº¤çš„代ç èƒ½å¤Ÿé€šè¿‡ä¿®æ”¹æ¨¡å—çš„å•元测试,以 runner 为例 - pytest tests/test_runner/test_runner.py - ``` - - 如果你由于缺少ä¾èµ–无法è¿è¡Œä¿®æ”¹æ¨¡å—çš„å•元测试,å¯ä»¥å‚考[指引-å•元测试](#å•元测试) - -- 如果修改/添加了文档,å‚考[指引](#文档渲染)确认文档渲染正常。 - -#### 5. 
推é€ä»£ç åˆ°è¿œç¨‹ - -代ç é€šè¿‡å•元测试和 pre-commit 检查åŽï¼Œå°†ä»£ç æŽ¨é€åˆ°è¿œç¨‹ä»“库,如果是第一次推é€ï¼Œå¯ä»¥åœ¨ `git push` åŽåŠ ä¸Š `-u` 傿•°ä»¥å…³è”远程分支 - -```shell -git push -u origin {branch_name} -``` - -这样下次就å¯ä»¥ç›´æŽ¥ä½¿ç”¨ `git push` 命令推é€ä»£ç äº†ï¼Œè€Œæ— éœ€æŒ‡å®šåˆ†æ”¯å’Œè¿œç¨‹ä»“库。 - -#### 6. æäº¤æ‹‰å–请求(PR) - -(1) 在 GitHub çš„ Pull request 界é¢åˆ›å»ºæ‹‰å–请求 - - -(2) æ ¹æ®æŒ‡å¼•修改 PR æè¿°ï¼Œä»¥ä¾¿äºŽå…¶ä»–å¼€å‘者更好地ç†è§£ä½ çš„修改 - - - -æè¿°è§„范详è§[拉å–请求规范](#拉å–请求规范) - -  - -**注æ„事项** - -(a) PR æè¿°åº”该包å«ä¿®æ”¹ç†ç”±ã€ä¿®æ”¹å†…容以åŠä¿®æ”¹åŽå¸¦æ¥çš„å½±å“,并关è”相关 Issue(具体方å¼è§[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)) - -(b) 如果是第一次为 OpenMMLab åšè´¡çŒ®ï¼Œéœ€è¦ç­¾ç½² CLA - - - -(c) 检查æäº¤çš„ PR 是å¦é€šè¿‡ CIï¼ˆé›†æˆæµ‹è¯•) - - - -MMCV 会在ä¸åŒçš„å¹³å°ï¼ˆLinuxã€Windowã€Mac),基于ä¸åŒç‰ˆæœ¬çš„ Pythonã€PyTorchã€CUDA 对æäº¤çš„代ç è¿›è¡Œå•元测试,以ä¿è¯ä»£ç çš„æ­£ç¡®æ€§ï¼Œå¦‚果有任何一个没有通过,我们å¯ç‚¹å‡»ä¸Šå›¾ä¸­çš„ `Details` æ¥æŸ¥çœ‹å…·ä½“的测试信æ¯ï¼Œä»¥ä¾¿äºŽæˆ‘们修改代ç ã€‚ - -(3) 如果 PR 通过了 CI,那么就å¯ä»¥ç­‰å¾…å…¶ä»–å¼€å‘者的 reviewï¼Œå¹¶æ ¹æ® reviewer çš„æ„è§ï¼Œä¿®æ”¹ä»£ç ï¼Œå¹¶é‡å¤ [4](#4-æäº¤ä»£ç å¹¶æœ¬åœ°é€šè¿‡å•元测试)-[5](#5-推é€ä»£ç åˆ°è¿œç¨‹) 步骤,直到 reviewer åŒæ„åˆå…¥ PR。 - - - -所有 reviewer åŒæ„åˆå…¥ PR åŽï¼Œæˆ‘们会尽快将 PR åˆå¹¶åˆ°ä¸»åˆ†æ”¯ã€‚ - -#### 7. 
è§£å†³å†²çª - -éšç€æ—¶é—´çš„æŽ¨ç§»ï¼Œæˆ‘们的代ç åº“ä¼šä¸æ–­æ›´æ–°ï¼Œè¿™æ—¶å€™ï¼Œå¦‚果你的 PR 与主分支存在冲çªï¼Œä½ éœ€è¦è§£å†³å†²çªï¼Œè§£å†³å†²çªçš„æ–¹å¼æœ‰ä¸¤ç§ï¼š - -```shell -git fetch --all --prune -git rebase upstream/master -``` - -或者 - -```shell -git fetch --all --prune -git merge upstream/master -``` - -如果你éžå¸¸å–„于处ç†å†²çªï¼Œé‚£ä¹ˆå¯ä»¥ä½¿ç”¨ rebase çš„æ–¹å¼æ¥è§£å†³å†²çªï¼Œå› ä¸ºè¿™èƒ½å¤Ÿä¿è¯ä½ çš„ commit log 的整æ´ã€‚如果你ä¸å¤ªç†Ÿæ‚‰ `rebase` 的使用,那么å¯ä»¥ä½¿ç”¨ `merge` çš„æ–¹å¼æ¥è§£å†³å†²çªã€‚ - -### 指引 - -#### å•元测试 - -如果你无法正常执行部分模å—çš„å•元测试,例如 [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) 模å—,å¯èƒ½æ˜¯ä½ çš„当å‰çŽ¯å¢ƒæ²¡æœ‰å®‰è£…ä»¥ä¸‹ä¾èµ– - -```shell -# Linux -sudo apt-get update -y -sudo apt-get install -y libturbojpeg -sudo apt-get install -y ffmpeg - -# Windows -conda install ffmpeg -``` - -在æäº¤ä¿®å¤ä»£ç é”™è¯¯æˆ–新增特性的拉å–请求时,我们应该尽å¯èƒ½çš„让å•元测试覆盖所有æäº¤çš„代ç ï¼Œè®¡ç®—å•元测试覆盖率的方法如下 - -```shell -python -m coverage run -m pytest /path/to/test_file -python -m coverage html -# check file in htmlcov/index.html -``` - -#### 文档渲染 - -在æäº¤ä¿®å¤ä»£ç é”™è¯¯æˆ–新增特性的拉å–请求时,å¯èƒ½ä¼šéœ€è¦ä¿®æ”¹/新增模å—çš„ docstring。我们需è¦ç¡®è®¤æ¸²æŸ“åŽçš„æ–‡æ¡£æ ·å¼æ˜¯æ­£ç¡®çš„。 -æœ¬åœ°ç”Ÿæˆæ¸²æŸ“åŽçš„æ–‡æ¡£çš„æ–¹æ³•如下 - -```shell -pip install -r requirements/docs.txt -cd docs/zh_cn/ -# or docs/en -make html -# check file in ./docs/zh_cn/_build/html/index.html -``` - -### 代ç é£Žæ ¼ - -#### Python - -[PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代ç è§„范,我们使用以下工具检查和格å¼åŒ–ä»£ç  - -- [flake8](https://github.com/PyCQA/flake8): Python 官方å‘布的代ç è§„范检查工具,是多个检查工具的å°è£… -- [isort](https://github.com/timothycrosley/isort): 自动调整模å—导入顺åºçš„工具 -- [yapf](https://github.com/google/yapf): Google å‘布的代ç è§„范检查工具 -- [codespell](https://github.com/codespell-project/codespell): 检查å•è¯æ‹¼å†™æ˜¯å¦æœ‰è¯¯ -- [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具 -- [docformatter](https://github.com/myint/docformatter): æ ¼å¼åŒ– docstring 的工具 - -yapf å’Œ isort 
çš„é…ç½®å¯ä»¥åœ¨ [setup.cfg](./setup.cfg) 找到 - -通过é…ç½® [pre-commit hook](https://pre-commit.com/) ,我们å¯ä»¥åœ¨æäº¤ä»£ç æ—¶è‡ªåŠ¨æ£€æŸ¥å’Œæ ¼å¼åŒ– `flake8`ã€`yapf`ã€`isort`ã€`trailing whitespaces`ã€`markdown files`, -ä¿®å¤ `end-of-files`ã€`double-quoted-strings`ã€`python-encoding-pragma`ã€`mixed-line-ending`,调整 `requirments.txt` 的包顺åºã€‚ -pre-commit é’©å­çš„é…ç½®å¯ä»¥åœ¨ [.pre-commit-config](./.pre-commit-config.yaml) 找到。 - -pre-commit 具体的安装使用方å¼è§[拉å–请求](#2-é…ç½®-pre-commit)。 - -更具体的规范请å‚考 [OpenMMLab 代ç è§„范](code_style.md)。 - -#### C++ and CUDA - -C++ å’Œ CUDA 的代ç è§„范éµä»Ž [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) - -### 拉å–请求规范 - -1. 使用 [pre-commit hook](https://pre-commit.com),尽é‡å‡å°‘代ç é£Žæ ¼ç›¸å…³é—®é¢˜ - -2. 一个`拉å–请求`对应一个短期分支 - -3. 粒度è¦ç»†ï¼Œä¸€ä¸ª`拉å–请求`åªåšä¸€ä»¶äº‹æƒ…,é¿å…超大的`拉å–请求` - - - Bad:实现 Faster R-CNN - - Acceptable:给 Faster R-CNN 添加一个 box head - - Good:给 box head å¢žåŠ ä¸€ä¸ªå‚æ•°æ¥æ”¯æŒè‡ªå®šä¹‰çš„ conv 层数 - -4. æ¯æ¬¡ Commit æ—¶éœ€è¦æä¾›æ¸…æ™°ä¸”æœ‰æ„义 commit ä¿¡æ¯ - -5. æä¾›æ¸…晰且有æ„义的`拉å–请求`æè¿° - - - 标题写明白任务å称,一般格å¼:\[Prefix\] Short description of the pull request (Suffix) - - prefix: 新增功能 \[Feature\], ä¿® bug \[Fix\], 文档相关 \[Docs\], å¼€å‘中 \[WIP\] (暂时ä¸ä¼šè¢«review) - - æè¿°é‡Œä»‹ç»`拉å–请求`的主è¦ä¿®æ”¹å†…容,结果,以åŠå¯¹å…¶ä»–部分的影å“, å‚考`拉å–请求`æ¨¡æ¿ - - å…³è”相关的`议题` (issue) 和其他`拉å–请求` - -6. 
如果引入了其他三方库,或借鉴了三方库的代ç ï¼Œè¯·ç¡®è®¤ä»–们的许å¯è¯å’Œ mmcv 兼容,并在借鉴的代ç ä¸Šè¡¥å…… `This code is inspired from http://` diff --git a/docs/zh_cn/community/pr.md b/docs/zh_cn/community/pr.md deleted file mode 100644 index 427fdf9..0000000 --- a/docs/zh_cn/community/pr.md +++ /dev/null @@ -1,3 +0,0 @@ -## 拉å–请求 - -本文档的内容已è¿ç§»åˆ°[贡献指å—](contributing.md)。 diff --git a/docs/zh_cn/docutils.conf b/docs/zh_cn/docutils.conf deleted file mode 100644 index 0c00c84..0000000 --- a/docs/zh_cn/docutils.conf +++ /dev/null @@ -1,2 +0,0 @@ -[html writers] -table_style: colwidths-auto diff --git a/docs/zh_cn/faq.md b/docs/zh_cn/faq.md deleted file mode 100644 index 6cfb100..0000000 --- a/docs/zh_cn/faq.md +++ /dev/null @@ -1,91 +0,0 @@ -## 常è§é—®é¢˜ - -在这里我们列出了用户ç»å¸¸é‡åˆ°çš„问题以åŠå¯¹åº”的解决方法。如果您é‡åˆ°äº†å…¶ä»–常è§çš„问题,并且知é“å¯ä»¥å¸®åˆ°å¤§å®¶çš„解决办法, -æ¬¢è¿Žéšæ—¶ä¸°å¯Œè¿™ä¸ªåˆ—表。 - -### 安装问题 - -- KeyError: "xxx: 'yyy is not in the zzz registry'" - - åªæœ‰æ¨¡å—所在的文件被导入时,注册机制æ‰ä¼šè¢«è§¦å‘,所以您需è¦åœ¨æŸå¤„导入该文件,更多详情请查看 [KeyError: "MaskRCNN: 'RefineRoIHead is not in the models registry'"](https://github.com/open-mmlab/mmdetection/issues/5974)。 - -- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'" - - 1. 使用 `pip uninstall mmcv` å¸è½½æ‚¨çŽ¯å¢ƒä¸­çš„ mmcv - 2. å‚考 [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) 或者 [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html) 安装 mmcv-full - -- "invalid device function" 或者 "no kernel image is available for execution" - - 1. 检查 GPU çš„ CUDA 计算能力 - 2. è¿è¡Œ `python mmdet/utils/collect_env.py` æ¥æ£€æŸ¥ PyTorchã€torchvision å’Œ MMCV æ˜¯å¦æ˜¯é’ˆå¯¹æ­£ç¡®çš„ GPU 架构构建的,您å¯èƒ½éœ€è¦åŽ»è®¾ç½® `TORCH_CUDA_ARCH_LIST` æ¥é‡æ–°å®‰è£… MMCV。兼容性问题å¯èƒ½ä¼šå‡ºçŽ°åœ¨ä½¿ç”¨æ—§ç‰ˆçš„ GPUs,如:colab 上的 Tesla K80 (3.7) - 3. 检查è¿è¡ŒçŽ¯å¢ƒæ˜¯å¦å’Œ mmcv/mmdet 编译时的环境相åŒã€‚例如,您å¯èƒ½ä½¿ç”¨ CUDA 10.0 编译 mmcv,但在 CUDA 9.0 的环境中è¿è¡Œå®ƒ - -- "undefined symbol" 或者 "cannot open xxx.so" - - 1. 
如果符å·å’Œ CUDA/C++ 相关(例如:libcudart.so 或者 GLIBCXX),请检查 CUDA/GCC è¿è¡Œæ—¶çš„版本是å¦å’Œç¼–译 mmcv 的一致 - 2. 如果符å·å’Œ PyTorch 相关(例如:符å·åŒ…å« caffeã€aten å’Œ TH),请检查 PyTorch è¿è¡Œæ—¶çš„版本是å¦å’Œç¼–译 mmcv 的一致 - 3. è¿è¡Œ `python mmdet/utils/collect_env.py` 以检查 PyTorchã€torchvision å’Œ MMCV 构建和è¿è¡Œçš„环境是å¦ç›¸åŒ - -- "RuntimeError: CUDA error: invalid configuration argument" - - 这个错误å¯èƒ½æ˜¯ç”±äºŽæ‚¨çš„ GPU 性能ä¸ä½³é€ æˆçš„。å°è¯•é™ä½Ž [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) - çš„å€¼å¹¶é‡æ–°ç¼–译 mmcv。 - -- "RuntimeError: nms is not compiled with GPU support" - - 这个错误是由于您的 CUDA 环境没有正确安装。 - 您å¯ä»¥å°è¯•釿–°å®‰è£…您的 CUDA 环境,然åŽåˆ é™¤ mmcv/build æ–‡ä»¶å¤¹å¹¶é‡æ–°ç¼–译 mmcv。 - -- "Segmentation fault" - - 1. 检查 GCC 的版本,通常是因为 PyTorch 版本与 GCC 版本ä¸åŒ¹é… (例如 GCC \< 4.9 ),我们推è用户使用 GCC 5.4ï¼Œæˆ‘ä»¬ä¹Ÿä¸æŽ¨è使用 GCC 5.5, 因为有å馈 GCC 5.5 会导致 "segmentation fault" 并且切æ¢åˆ° GCC 5.4 å°±å¯ä»¥è§£å†³é—®é¢˜ - 2. æ£€æŸ¥æ˜¯å¦æ­£ç¡®å®‰è£… CUDA 版本的 PyTorc。输入以下命令并检查是å¦è¿”回 True - ```shell - python -c 'import torch; print(torch.cuda.is_available())' - ``` - 3. 如果 `torch` 安装æˆåŠŸï¼Œé‚£ä¹ˆæ£€æŸ¥ MMCV 是å¦å®‰è£…æˆåŠŸã€‚è¾“å…¥ä»¥ä¸‹å‘½ä»¤ï¼Œå¦‚æžœæ²¡æœ‰æŠ¥é”™è¯´æ˜Ž mmcv-full 安装æˆã€‚ - ```shell - python -c 'import mmcv; import mmcv.ops' - ``` - 4. 如果 MMCV 与 PyTorch 都安装æˆåŠŸäº†ï¼Œåˆ™å¯ä»¥ä½¿ç”¨ `ipdb` 设置断点或者使用 `print` å‡½æ•°ï¼Œåˆ†æžæ˜¯å“ªä¸€éƒ¨åˆ†çš„代ç å¯¼è‡´äº† `segmentation fault` - -- "libtorch_cuda_cu.so: cannot open shared object file" - - `mmcv-full` ä¾èµ– `libtorch_cuda_cu.so` 文件,但程åºè¿è¡Œæ—¶æ²¡èƒ½æ‰¾åˆ°è¯¥æ–‡ä»¶ã€‚我们å¯ä»¥æ£€æŸ¥è¯¥æ–‡ä»¶æ˜¯å¦å­˜åœ¨ `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` 也å¯ä»¥å°è¯•é‡è£… PyTorch。 - -- "fatal error C1189: #error: -- unsupported Microsoft Visual Studio version!" 
- - 如果您在 Windows 上编译 mmcv-full 并且 CUDA 的版本是 9.2,您很å¯èƒ½ä¼šé‡åˆ°è¿™ä¸ªé—®é¢˜ `"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\crt/host_config.h(133): fatal error C1189: #error: -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!"`,您å¯ä»¥å°è¯•使用低版本的 Microsoft Visual Studio,例如 vs2017。 - -- "error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized" - - 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.5.0,您很å¯èƒ½ä¼šé‡åˆ°è¿™ä¸ªé—®é¢˜ `- torch/csrc/jit/api/module.h(474): error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized`。解决这个问题的方法是将 `torch/csrc/jit/api/module.h` 文件中所有 `static constexpr bool all_slots = false;` 替æ¢ä¸º `static bool all_slots = false;`。更多细节å¯ä»¥æŸ¥çœ‹ [member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized](https://github.com/pytorch/pytorch/issues/39394)。 - -- "error: a member with an in-class initializer must be const" - - 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.6.0,您很å¯èƒ½ä¼šé‡åˆ°è¿™ä¸ªé—®é¢˜ `"- torch/include\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const"`. 解决这个问题的方法是将 `torch/include\torch/csrc/jit/api/module.h` 文件中的所有 `CONSTEXPR_EXCEPT_WIN_CUDA ` 替æ¢ä¸º `const`。更多细节å¯ä»¥æŸ¥çœ‹ [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575)。 - -- "error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized" - - 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.7.0,您很å¯èƒ½ä¼šé‡åˆ°è¿™ä¸ªé—®é¢˜ `torch/include\torch/csrc/jit/ir/ir.h(1347): error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized`. 
解决这个问题的方法是修改 PyTorch 中的几个文件: - - - 删除 `torch/include\torch/csrc/jit/ir/ir.h` 文件中的 `static constexpr Symbol Kind = ::c10::prim::profile;` å’Œ `tatic constexpr Symbol Kind = ::c10::prim::profile_optional;` - - å°† `torch\include\pybind11\cast.h` 文件中的 `explicit operator type&() { return *(this->value); }` 替æ¢ä¸º `explicit operator type&() { return *((type*)this->value); }` - - å°† `torch/include\torch/csrc/jit/api/module.h` 文件中的 所有 `CONSTEXPR_EXCEPT_WIN_CUDA` 替æ¢ä¸º `const` - - 更多细节å¯ä»¥æŸ¥çœ‹ [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956)。 - -- MMCV å’Œ MMDetection 的兼容性问题;"ConvWS is already registered in conv layer" - - 请å‚考 [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation) 为您的 MMDetection 版本安装正确版本的 MMCV。 - -### 使用问题 - -- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one" - - 1. è¿™ä¸ªé”™è¯¯æ˜¯å› ä¸ºæœ‰äº›å‚æ•°æ²¡æœ‰å‚与 loss 的计算,å¯èƒ½æ˜¯ä»£ç ä¸­å­˜åœ¨å¤šä¸ªåˆ†æ”¯ï¼Œå¯¼è‡´æœ‰äº›åˆ†æ”¯æ²¡æœ‰å‚与 loss çš„è®¡ç®—ã€‚æ›´å¤šç»†èŠ‚è§ [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582)。 - 2. 
ä½ å¯ä»¥è®¾ç½® DDP 中的 `find_unused_parameters` 为 `True`ï¼Œæˆ–è€…æ‰‹åŠ¨æŸ¥æ‰¾å“ªäº›å‚æ•°æ²¡æœ‰ç”¨åˆ°ã€‚ - -- "RuntimeError: Trying to backward through the graph a second time" - - ä¸èƒ½åŒæ—¶è®¾ç½® `GradientCumulativeOptimizerHook` å’Œ `OptimizerHook`,这会导致 `loss.backward()` è¢«è°ƒç”¨ä¸¤æ¬¡ï¼ŒäºŽæ˜¯ç¨‹åºæŠ›å‡º `RuntimeError`。我们åªéœ€è®¾ç½®å…¶ä¸­çš„ä¸€ä¸ªã€‚æ›´å¤šç»†èŠ‚è§ [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379)。 diff --git a/docs/zh_cn/get_started/article.md b/docs/zh_cn/get_started/article.md deleted file mode 100644 index 9676850..0000000 --- a/docs/zh_cn/get_started/article.md +++ /dev/null @@ -1,63 +0,0 @@ -## 解读文章汇总 - -这篇文章汇总了 [OpenMMLab](https://www.zhihu.com/people/openmmlab) è§£è¯»çš„éƒ¨åˆ†æ–‡ç« ï¼ˆæ›´å¤šæ–‡ç« å’Œè§†é¢‘è§ [OpenMMLabCourse](https://github.com/open-mmlab/OpenMMLabCourse)),如果您有推è的文章(ä¸ä¸€å®šæ˜¯ OpenMMLab å‘布的文章,å¯ä»¥æ˜¯è‡ªå·±å†™çš„æ–‡ç« ï¼‰ï¼Œéžå¸¸æ¬¢è¿Žæ [Pull Request](http://127.0.0.1:5501/mmcv/docs/zh_cn/_build/html/community/pr.html) 添加到这里。 - -### MMCV 解读文章 - -#### 框架解读 - -- [MMCV 核心组件分æž(一):整体概述](https://zhuanlan.zhihu.com/p/336081587) -- [MMCV 核心组件分æž(二):FileHandler](https://zhuanlan.zhihu.com/p/336097883) -- [MMCV 核心组件分æž(三): FileClient](https://zhuanlan.zhihu.com/p/339190576) -- [MMCV 核心组件分æž(å››): Config](https://zhuanlan.zhihu.com/p/346203167) -- [MMCV 核心组件分æž(五): Registry](https://zhuanlan.zhihu.com/p/355271993) -- [MMCV 核心组件分æž(å…­): Hook](https://zhuanlan.zhihu.com/p/355272220) -- [MMCV 核心组件分æž(七): Runner](https://zhuanlan.zhihu.com/p/355272459) -- [MMCV Hook 食用指å—](https://zhuanlan.zhihu.com/p/448600739) -- [PyTorch & MMCV Dispatcher 机制解æž](https://zhuanlan.zhihu.com/p/451671838) - -#### 工具解读 - -- [训练å¯è§†åŒ–工具哪款是你的èœï¼ŸMMCV一行代ç éšä½ æŒ‘](https://zhuanlan.zhihu.com/p/387078211) - -#### å®‰è£…æŒ‡å— - -- [久等了ï¼Windows å¹³å° MMCV 的预编译包终于æ¥äº†ï¼](https://zhuanlan.zhihu.com/p/441653536) -- [Windows 环境从零安装 mmcv-full](https://zhuanlan.zhihu.com/p/434491590) - -#### 知乎问答 - -- 
[深度学习科研,如何高效进行代ç å’Œå®žéªŒç®¡ç†ï¼Ÿ](https://www.zhihu.com/question/269707221/answer/2480772257) -- [深度学习方é¢çš„ç§‘ç ”å·¥ä½œä¸­çš„å®žéªŒä»£ç æœ‰ä»€ä¹ˆè§„范和写作技巧?如何妥善管ç†å®žéªŒæ•°æ®ï¼Ÿ](https://www.zhihu.com/question/268193800/answer/2586000037) - -### 下游算法库解读文章 - -- [MMDetection](https://mmdetection.readthedocs.io/zh_CN/latest/article.html) - -### PyTorch 解读文章 - -- [PyTorch1.11 亮点一览:TorchDataã€functorchã€DDP 陿€å›¾](https://zhuanlan.zhihu.com/p/486222256) -- [PyTorch1.12 亮点一览:DataPipe + TorchArrow 新的数æ®åŠ è½½ä¸Žå¤„ç†èŒƒå¼](https://zhuanlan.zhihu.com/p/537868554) -- [PyTorch æºç è§£è¯»ä¹‹ nn.Moduleï¼šæ ¸å¿ƒç½‘ç»œæ¨¡å—æŽ¥å£è¯¦è§£](https://zhuanlan.zhihu.com/p/340453841) -- [PyTorch æºç è§£è¯»ä¹‹ torch.autograd:梯度计算详解](https://zhuanlan.zhihu.com/p/321449610) -- [PyTorch æºç è§£è¯»ä¹‹ torch.utils.dataï¼šè§£æžæ•°æ®å¤„ç†å…¨æµç¨‹](https://zhuanlan.zhihu.com/p/337850513) -- [PyTorch æºç è§£è¯»ä¹‹ torch.optim:优化算法接å£è¯¦è§£](https://zhuanlan.zhihu.com/p/346205754) -- [PyTorch æºç è§£è¯»ä¹‹ DP & DDP:模型并行和分布å¼è®­ç»ƒè§£æž](https://zhuanlan.zhihu.com/p/343951042) -- [PyTorch æºç è§£è¯»ä¹‹ BN & SyncBN:BN 与 多å¡åŒæ­¥ BN 详解](https://zhuanlan.zhihu.com/p/337732517) -- [PyTorch æºç è§£è¯»ä¹‹ torch.cuda.amp: 自动混åˆç²¾åº¦è¯¦è§£](https://zhuanlan.zhihu.com/p/348554267) -- [PyTorch æºç è§£è¯»ä¹‹ cpp_extension:æ­ç§˜ C++/CUDA ç®—å­å®žçŽ°å’Œè°ƒç”¨å…¨æµç¨‹](https://zhuanlan.zhihu.com/p/348555597) -- [PyTorch æºç è§£è¯»ä¹‹å³æ—¶ç¼–译篇](https://zhuanlan.zhihu.com/p/361101354) -- [PyTorch æºç è§£è¯»ä¹‹åˆ†å¸ƒå¼è®­ç»ƒäº†è§£ä¸€ä¸‹ï¼Ÿ](https://zhuanlan.zhihu.com/p/361314953) -- [PyTorch æºç è§£è¯»ä¹‹ torch.serialization & torch.hub](https://zhuanlan.zhihu.com/p/364239544) - -### å…¶ä»– - -- [困扰我 48 å°æ—¶çš„æ·±æ‹·è´ï¼Œä»Šå¤©ç»ˆäºŽ...](https://zhuanlan.zhihu.com/p/470892209) -- [拿什么拯救我的 4G 显å¡](https://zhuanlan.zhihu.com/p/430123077) -- [是è°å·å·åŠ¨äº†æˆ‘çš„ logger](https://zhuanlan.zhihu.com/p/481383590) -- [三å¥è¯ï¼Œè®© logger 言å¬è®¡ä»Ž](https://zhuanlan.zhihu.com/p/487524917) -- [Logging 
ä¸ä¸ºäººçŸ¥çš„二三事](https://zhuanlan.zhihu.com/p/502610682) -- [Type Hints å…¥é—¨æ•™ç¨‹ï¼Œè®©ä»£ç æ›´åŠ è§„èŒƒæ•´æ´](https://zhuanlan.zhihu.com/p/519335398) -- [手把手教你如何高效地在 MMCV 中贡献算å­](https://zhuanlan.zhihu.com/p/464492627) -- [OpenMMLab æ”¯æŒ IPU 训练芯片](https://zhuanlan.zhihu.com/p/517527926) -- [基于 MMCV 走上开æºå¤§ä½¬ä¹‹è·¯ï¼Ÿ](https://zhuanlan.zhihu.com/p/391144979) diff --git a/docs/zh_cn/get_started/build.md b/docs/zh_cn/get_started/build.md deleted file mode 100644 index 95f611b..0000000 --- a/docs/zh_cn/get_started/build.md +++ /dev/null @@ -1,300 +0,0 @@ -## 从æºç ç¼–译 MMCV - -### 编译 mmcv - -在编译 mmcv 之å‰ï¼Œè¯·ç¡®ä¿ PyTorch å·²ç»æˆåŠŸå®‰è£…åœ¨çŽ¯å¢ƒä¸­ï¼Œå¯ä»¥å‚考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。å¯ä½¿ç”¨ä»¥ä¸‹å‘½ä»¤éªŒè¯ - -```bash -python -c 'import torch;print(torch.__version__)' -``` - -:::{note} - -- 如果克隆代ç ä»“库的速度过慢,å¯ä»¥ä½¿ç”¨ä»¥ä¸‹å‘½ä»¤å…‹éš†ï¼ˆæ³¨æ„:gitee çš„ mmcv ä¸ä¸€å®šå’Œ github çš„ä¿æŒä¸€è‡´ï¼Œå› ä¸ºæ¯å¤©åªåŒæ­¥ä¸€æ¬¡ï¼‰ - -```bash -git clone https://gitee.com/open-mmlab/mmcv.git -``` - -- 如果打算使用 `opencv-python-headless` è€Œä¸æ˜¯ `opencv-python`,例如在一个很å°çš„容器环境或者没有图形用户界é¢çš„æœåŠ¡å™¨ä¸­ï¼Œä½ å¯ä»¥å…ˆå®‰è£… `opencv-python-headless`,这样在安装 mmcv ä¾èµ–的过程中会跳过 `opencv-python`。 - -- 如果编译过程安装ä¾èµ–库的时间过长,å¯ä»¥[设置 pypi æº](https://mirrors.tuna.tsinghua.edu.cn/help/pypi/) - -```bash -pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple -``` - -::: - -#### 在 Linux 上编译 mmcv - -| TODO: 视频教程 - -1. 克隆代ç ä»“库 - - ```bash - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - ``` - -2. 安装 `ninja` å’Œ `psutil` 以加快编译速度 - - ```bash - pip install -r requirements/optional.txt - ``` - -3. 
检查 nvcc çš„ç‰ˆæœ¬ï¼ˆè¦æ±‚大于等于 9.2,如果没有 GPU,å¯ä»¥è·³è¿‡ï¼‰ - - ```bash - nvcc --version - ``` - - 上述命令如果输出以下信æ¯ï¼Œè¡¨ç¤º nvcc 的设置没有问题,å¦åˆ™éœ€è¦è®¾ç½® CUDA_HOME - - ``` - nvcc: NVIDIA (R) Cuda compiler driver - Copyright (c) 2005-2020 NVIDIA Corporation - Built on Mon_Nov_30_19:08:53_PST_2020 - Cuda compilation tools, release 11.2, V11.2.67 - Build cuda_11.2.r11.2/compiler.29373293_0 - ``` - - :::{note} - å¦‚æžœæƒ³è¦æ”¯æŒ ROCm,å¯ä»¥å‚考 [AMD ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) 安装 ROCm。 - ::: - -4. 检查 gcc çš„ç‰ˆæœ¬ï¼ˆè¦æ±‚大于等于**5.4**) - - ```bash - gcc --version - ``` - -5. 开始编译(预估耗时 10 分钟) - - ```bash - pip install -e . -v - ``` - -6. 验è¯å®‰è£… - - ```bash - python .dev_scripts/check_installation.py - ``` - - 如果上述命令没有报错,说明安装æˆåŠŸã€‚å¦‚æœ‰æŠ¥é”™ï¼Œè¯·æŸ¥çœ‹[问题解决页é¢](../faq.html)是å¦å·²ç»æœ‰è§£å†³æ–¹æ¡ˆã€‚ - - 如果没有找到解决方案,欢迎æ [issue](https://github.com/open-mmlab/mmcv/issues)。 - -#### 在 macOS 上编译 mmcv - -| TODO: 视频教程 - -```{note} -如果你使用的是æ­è½½ apple silicon çš„ mac 设备,请安装 PyTorch 1.13+ 的版本,å¦åˆ™ä¼šé‡åˆ° [issues#2218](https://github.com/open-mmlab/mmcv/issues/2218) 中的问题。 -``` - -1. 克隆代ç ä»“库 - - ```bash - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - ``` - -2. 安装 `ninja` å’Œ `psutil` 以加快编译速度 - - ```bash - pip install -r requirements/optional.txt - ``` - -3. 开始编译 - - ```bash - pip install -e . - ``` - -4. 
验è¯å®‰è£… - - ```bash - python .dev_scripts/check_installation.py - ``` - - 如果上述命令没有报错,说明安装æˆåŠŸã€‚å¦‚æœ‰æŠ¥é”™ï¼Œè¯·æŸ¥çœ‹[问题解决页é¢](../faq.md)是å¦å·²ç»æœ‰è§£å†³æ–¹æ¡ˆã€‚ - - 如果没有找到解决方案,欢迎æ [issue](https://github.com/open-mmlab/mmcv/issues)。 - -#### 在 Windows 上编译 mmcv - -| TODO: 视频教程 - -在 Windows 上编译 mmcv 比 Linux 夿‚,本节将一步步介ç»å¦‚何在 Windows 上编译 mmcv。 - -##### ä¾èµ–项 - -请先安装以下的ä¾èµ–项: - -- [Git](https://git-scm.com/download/win):安装期间,请选择 **add git to Path** -- [Visual Studio Community 2019](https://visualstudio.microsoft.com):用于编译 C++ å’Œ CUDA ä»£ç  -- [Miniconda](https://docs.conda.io/en/latest/miniconda.html):包管ç†å·¥å…· -- [CUDA 10.2](https://developer.nvidia.com/cuda-10.2-download-archive):如果åªéœ€è¦ CPU 版本å¯ä»¥ä¸å®‰è£… CUDA,安装 CUDA æ—¶ï¼Œå¯æ ¹æ®éœ€è¦è¿›è¡Œè‡ªå®šä¹‰å®‰è£…。如果已ç»å®‰è£…新版本的显å¡é©±åŠ¨ï¼Œå»ºè®®å–æ¶ˆé©±åŠ¨ç¨‹åºçš„安装 - -```{note} -å¦‚æžœä¸æ¸…楚如何安装以上ä¾èµ–,请å‚考[Windows 环境从零安装 mmcv](https://zhuanlan.zhihu.com/p/434491590)。 -å¦å¤–,你需è¦çŸ¥é“如何在 Windows 上设置å˜é‡çŽ¯å¢ƒï¼Œå°¤å…¶æ˜¯ "PATH" 的设置,以下安装过程都会用到。 -``` - -##### 通用步骤 - -1. 从 Windows èœå•å¯åЍ Anaconda 命令行 - - 如 Miniconda 安装程åºå»ºè®®ï¼Œä¸è¦ä½¿ç”¨åŽŸå§‹çš„ `cmd.exe` 或是 `powershell.exe`。命令行有两个版本,一个基于 PowerShell,一个基于传统的 `cmd.exe`。请注æ„以下说明都是使用的基于 PowerShell - -2. 创建一个新的 Conda 环境 - - ```powershell - (base) PS C:\Users\xxx> conda create --name mmcv python=3.7 - (base) PS C:\Users\xxx> conda activate mmcv # ç¡®ä¿åšä»»ä½•æ“作å‰å…ˆæ¿€æ´»çŽ¯å¢ƒ - ``` - -3. 安装 PyTorch 时,å¯ä»¥æ ¹æ®éœ€è¦å®‰è£…æ”¯æŒ CUDA æˆ–ä¸æ”¯æŒ CUDA 的版本 - - ```powershell - # CUDA version - (mmcv) PS C:\Users\xxx> conda install pytorch torchvision cudatoolkit=10.2 -c pytorch - # CPU version - (mmcv) PS C:\Users\xxx> conda install install pytorch torchvision cpuonly -c pytorch - ``` - -4. 克隆代ç ä»“库 - - ```powershell - (mmcv) PS C:\Users\xxx> git clone https://github.com/open-mmlab/mmcv.git - (mmcv) PS C:\Users\xxx> cd mmcv - ``` - -5. 
安装 `ninja` å’Œ `psutil` 以加快编译速度 - - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> pip install -r requirements/optional.txt - ``` - -6. 设置 MSVC 编译器 - - 设置环境å˜é‡ã€‚添加 `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` 到 `PATH`,则 `cl.exe` å¯ä»¥åœ¨å‘½ä»¤è¡Œä¸­è¿è¡Œï¼Œå¦‚下所示。 - - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> cl - Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 - Copyright (C) Microsoft Corporation. All rights reserved. - - usage: cl [ option... ] filename... [ / link linkoption... ] - ``` - - 为了兼容性,我们使用 x86-hosted ä»¥åŠ x64-targeted 版本,å³è·¯å¾„中的 `Hostx86\x64` 。 - - 因为 PyTorch å°†è§£æž `cl.exe` çš„è¾“å‡ºä»¥æ£€æŸ¥å…¶ç‰ˆæœ¬ï¼Œåªæœ‰ utf-8 将会被识别,你å¯èƒ½éœ€è¦å°†ç³»ç»Ÿè¯­è¨€æ›´æ”¹ä¸ºè‹±è¯­ã€‚æŽ§åˆ¶é¢æ¿ -> 地区-> 管ç†-> éž Unicode æ¥è¿›è¡Œè¯­è¨€è½¬æ¢ã€‚ - -##### 编译与安装 mmcv - -mmcv 有两个版本: - -- åªåŒ…å« CPU ç®—å­çš„版本 - - 编译 CPU ç®—å­ï¼Œä½†åªæœ‰ x86 将会被编译,并且编译版本åªèƒ½åœ¨ CPU only 情况下è¿è¡Œ - -- æ—¢åŒ…å« CPU ç®—å­ï¼ŒåˆåŒ…å« CUDA ç®—å­çš„版本 - - åŒæ—¶ç¼–译 CPU å’Œ CUDA ç®—å­ï¼Œ`ops` 模å—çš„ x86 与 CUDA 的代ç éƒ½å¯ä»¥è¢«ç¼–è¯‘ã€‚åŒæ—¶ç¼–译的版本å¯ä»¥åœ¨ CUDA 上调用 GPU - -###### CPU 版本 - -编译安装 - -```powershell -(mmcv) PS C:\Users\xxx\mmcv> python setup.py build_ext # 如果æˆåŠŸ, cl 将被å¯åŠ¨ç”¨äºŽç¼–è¯‘ç®—å­ -(mmcv) PS C:\Users\xxx\mmcv> python setup.py develop # 安装 -``` - -###### GPU 版本 - -1. 
检查 `CUDA_PATH` 或者 `CUDA_HOME` 环境å˜é‡å·²ç»å­˜åœ¨åœ¨ `envs` 之中 - - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> ls env: - - Name Value - ---- ----- - CUDA_PATH C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 - CUDA_PATH_V10_1 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 - CUDA_PATH_V10_2 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 - ``` - - 如果没有,你å¯ä»¥æŒ‰ç…§ä¸‹é¢çš„æ­¥éª¤è®¾ç½® - - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" - # 或者 - (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # CUDA_PATH_V10_2 å·²ç»åœ¨çŽ¯å¢ƒå˜é‡ä¸­ - ``` - -2. 设置 CUDA 的目标架构 - - ```powershell - # è¿™é‡Œéœ€è¦æ”¹æˆä½ çš„æ˜¾å¡å¯¹åº”的目标架构 - (mmcv) PS C:\Users\xxx\mmcv> $env:TORCH_CUDA_ARCH_LIST="7.5" - ``` - - :::{note} - å¯ä»¥ç‚¹å‡» [cuda-gpus](https://developer.nvidia.com/cuda-gpus) 查看 GPU 的计算能力,也å¯ä»¥é€šè¿‡ CUDA 目录下的 deviceQuery.exe 工具查看 - - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> &"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\demo_suite\deviceQuery.exe" - Device 0: "NVIDIA GeForce GTX 1660 SUPER" - CUDA Driver Version / Runtime Version 11.7 / 11.1 - CUDA Capability Major/Minor version number: 7.5 - ``` - - 上é¢çš„ 7.5 表示目标架构。注æ„:需把上é¢å‘½ä»¤çš„ v10.2 æ¢æˆä½ çš„ CUDA 版本。 - ::: - -3. 
编译安装 - - ```powershell - (mmcv) PS C:\Users\xxx\mmcv> python setup.py build_ext # 如果æˆåŠŸ, cl 将被å¯åŠ¨ç”¨äºŽç¼–è¯‘ç®—å­ - (mmcv) PS C:\Users\xxx\mmcv> python setup.py develop # 安装 - ``` - - ```{note} - 如果你的 PyTorch 版本是 1.6.0,你å¯èƒ½ä¼šé‡åˆ°ä¸€äº› [issue](https://github.com/pytorch/pytorch/issues/42467) æåˆ°çš„错误,你å¯ä»¥å‚考这个 [pull request](https://github.com/pytorch/pytorch/pull/43380/files) 修改本地环境的 PyTorch æºä»£ç  - ``` - -##### 验è¯å®‰è£… - -```powershell -(mmcv) PS C:\Users\xxx\mmcv> python .dev_scripts/check_installation.py -``` - -如果上述命令没有报错,说明安装æˆåŠŸã€‚å¦‚æœ‰æŠ¥é”™ï¼Œè¯·æŸ¥çœ‹[问题解决页é¢](../faq.md)是å¦å·²ç»æœ‰è§£å†³æ–¹æ¡ˆã€‚ -如果没有找到解决方案,欢迎æ [issue](https://github.com/open-mmlab/mmcv/issues)。 - -### 编译 mmcv-lite - -如果你需è¦ä½¿ç”¨å’Œ PyTorch 相关的模å—ï¼Œè¯·ç¡®ä¿ PyTorch å·²ç»æˆåŠŸå®‰è£…åœ¨çŽ¯å¢ƒä¸­ï¼Œå¯ä»¥å‚考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。 - -1. 克隆代ç ä»“库 - - ```bash - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - ``` - -2. 开始编译 - - ```bash - MMCV_WITH_OPS=0 pip install -e . -v - ``` - -3. 
验è¯å®‰è£… - - ```bash - python -c 'import mmcv;print(mmcv.__version__)' - ``` diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md deleted file mode 100644 index 54cdbd9..0000000 --- a/docs/zh_cn/get_started/installation.md +++ /dev/null @@ -1,369 +0,0 @@ -## 安装 MMCV - -MMCV 有两个版本: - -- **mmcv**: å®Œæ•´ç‰ˆï¼ŒåŒ…å«æ‰€æœ‰çš„特性以åŠä¸°å¯Œçš„开箱å³ç”¨çš„ CPU å’Œ CUDA ç®—å­ã€‚注æ„,完整版本å¯èƒ½éœ€è¦æ›´é•¿æ—¶é—´æ¥ç¼–译。 -- **mmcv-lite**: 精简版,ä¸åŒ…å« CPU å’Œ CUDA ç®—å­ä½†åŒ…å«å…¶ä½™æ‰€æœ‰ç‰¹æ€§å’ŒåŠŸèƒ½ï¼Œç±»ä¼¼ MMCV 1.0 之å‰çš„版本。如果你ä¸éœ€è¦ä½¿ç”¨ç®—å­çš„è¯ï¼Œç²¾ç®€ç‰ˆå¯ä»¥ä½œä¸ºä¸€ä¸ªè€ƒè™‘选项。 - -```{warning} -请ä¸è¦åœ¨åŒä¸€ä¸ªçŽ¯å¢ƒä¸­å®‰è£…ä¸¤ä¸ªç‰ˆæœ¬ï¼Œå¦åˆ™å¯èƒ½ä¼šé‡åˆ°ç±»ä¼¼ `ModuleNotFound` 的错误。在安装一个版本之å‰ï¼Œéœ€è¦å…ˆå¸è½½å¦ä¸€ä¸ªã€‚`如果 CUDA å¯ç”¨ï¼Œå¼ºçƒˆæŽ¨è安装 mmcv`。 -``` - -### 安装 mmcv - -在安装 mmcv 之å‰ï¼Œè¯·ç¡®ä¿ PyTorch å·²ç»æˆåŠŸå®‰è£…åœ¨çŽ¯å¢ƒä¸­ï¼Œå¯ä»¥å‚考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。å¯ä½¿ç”¨ä»¥ä¸‹å‘½ä»¤éªŒè¯ - -```bash -python -c 'import torch;print(torch.__version__)' -``` - -如果输出版本信æ¯ï¼Œåˆ™è¡¨ç¤º PyTorch 已安装。 - -#### 使用 mim 安装(推è) - -[mim](https://github.com/open-mmlab/mim) 是 OpenMMLab 项目的包管ç†å·¥å…·ï¼Œä½¿ç”¨å®ƒå¯ä»¥å¾ˆæ–¹ä¾¿åœ°å®‰è£… mmcv。 - -```bash -pip install -U openmim -mim install "mmcv>=2.0.0rc1" -``` - -如果å‘现上述的安装命令没有使用预编译包(以 `.whl` 结尾)而是使用æºç åŒ…(以 `.tar.gz` 结尾)安装,则有å¯èƒ½æ˜¯æˆ‘们没有æä¾›å’Œå½“å‰çŽ¯å¢ƒçš„ PyTorch 版本ã€CUDA 版本相匹é…çš„ mmcv 预编译包,此时,你å¯ä»¥[æºç å®‰è£… mmcv](build.md)。 - -
-使用预编译包的安装日志 - -Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
-Collecting mmcv
-Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0rc3-cp38-cp38-manylinux1_x86_64.whl - -
- -
-使用æºç åŒ…的安装日志 - -Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
-Collecting mmcv==2.0.0rc3
-Downloading mmcv-2.0.0rc3.tar.gz - -
- -如需安装指定版本的 mmcv,例如安装 2.0.0rc3 版本的 mmcv,å¯ä½¿ç”¨ä»¥ä¸‹å‘½ä»¤ - -```bash -mim install mmcv==2.0.0rc3 -``` - -:::{note} -如果你打算使用 `opencv-python-headless` è€Œä¸æ˜¯ `opencv-python`,例如在一个很å°çš„容器环境或者没有图形用户界é¢çš„æœåŠ¡å™¨ä¸­ï¼Œä½ å¯ä»¥å…ˆå®‰è£… `opencv-python-headless`,这样在安装 mmcv ä¾èµ–的过程中会跳过 `opencv-python`。 - -å¦å¤–,如果安装ä¾èµ–库的时间过长,å¯ä»¥æŒ‡å®š pypi æº - -```bash -mim install "mmcv>=2.0.0rc1" -i https://pypi.tuna.tsinghua.edu.cn/simple -``` - -::: - -安装完æˆåŽå¯ä»¥è¿è¡Œ [check_installation.py](https://github.com/open-mmlab/mmcv/blob/2.x/.dev_scripts/check_installation.py) 脚本检查 mmcv 是å¦å®‰è£…æˆåŠŸã€‚ - -#### 使用 pip 安装 - -使用以下命令查看 CUDA å’Œ PyTorch 的版本 - -```bash -python -c 'import torch;print(torch.__version__);print(torch.version.cuda)' -``` - -æ ¹æ®ç³»ç»Ÿçš„类型ã€CUDA 版本ã€PyTorch ç‰ˆæœ¬ä»¥åŠ MMCV 版本选择相应的安装命令 - - - - -
- - - - -
-

-
-
-
-
-如果在上é¢çš„下拉框中没有找到对应的版本,则å¯èƒ½æ˜¯æ²¡æœ‰å¯¹åº” PyTorch 或者 CUDA 或者 mmcv 版本的预编译包,此时,你å¯ä»¥[æºç å®‰è£… mmcv](build.md)。
-
-:::{note}
-PyTorch 在 1.x.0 å’Œ 1.x.1 之间通常是兼容的,故 mmcv åªæä¾› 1.x.0 的编译包。如果你
-çš„ PyTorch 版本是 1.x.1,你å¯ä»¥æ”¾å¿ƒåœ°å®‰è£…在 1.x.0 版本编译的 mmcv。例如,如果你的
-PyTorch 版本是 1.8.1,你å¯ä»¥æ”¾å¿ƒé€‰æ‹© 1.8.x。
-:::
-
-:::{note}
-如果你打算使用 `opencv-python-headless` è€Œä¸æ˜¯ `opencv-python`,例如在一个很å°çš„容器环境或者没有图形用户界é¢çš„æœåŠ¡å™¨ä¸­ï¼Œä½ å¯ä»¥å…ˆå®‰è£… `opencv-python-headless`,这样在安装 mmcv ä¾èµ–的过程中会跳过 `opencv-python`。
-
-å¦å¤–,如果安装ä¾èµ–库的时间过长,å¯ä»¥æŒ‡å®š pypi æº
-
-```bash
-pip install "mmcv>=2.0.0rc1" -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html -i https://pypi.tuna.tsinghua.edu.cn/simple
-```
-
-:::
-
-安装完æˆåŽå¯ä»¥è¿è¡Œ [check_installation.py](https://github.com/open-mmlab/mmcv/blob/2.x/.dev_scripts/check_installation.py) 脚本检查 mmcv 是å¦å®‰è£…æˆåŠŸã€‚
-
-#### 使用 docker 镜åƒ
-
-å…ˆå°†ç®—æ³•åº“å…‹éš†åˆ°æœ¬åœ°å†æž„建镜åƒ
-
-```bash
-git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
-docker build -t mmcv -f docker/release/Dockerfile .
-```
-
-也å¯ä»¥ç›´æŽ¥ä½¿ç”¨ä¸‹é¢çš„命令构建镜åƒ
-
-```bash
-docker build -t mmcv https://github.com/open-mmlab/mmcv.git#2.x:docker/release
-```
-
-[Dockerfile](release/Dockerfile) 默认安装最新的 mmcvï¼Œå¦‚æžœä½ æƒ³è¦æŒ‡å®šç‰ˆæœ¬ï¼Œå¯ä»¥ä½¿ç”¨ä¸‹é¢çš„命令
-
-```bash
-docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0rc1 .
-```
-
-如果你想è¦ä½¿ç”¨å…¶ä»–版本的 PyTorch å’Œ CUDA,你å¯ä»¥åœ¨æž„å»ºé•œåƒæ—¶æŒ‡å®šå®ƒä»¬çš„版本。
-
-例如指定 PyTorch 的版本是 1.11,CUDA 的版本是 11.3
-
-```bash
-docker build -t mmcv -f docker/release/Dockerfile \
-    --build-arg PYTORCH=1.11.0 \
-    --build-arg CUDA=11.3 \
-    --build-arg CUDNN=8 \
-    --build-arg MMCV=2.0.0rc1 .
-```
-
-更多 PyTorch å’Œ CUDA 镜åƒå¯ä»¥ç‚¹å‡» [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags) 查看。
-
-### 安装 mmcv-lite
-
-如果你需è¦ä½¿ç”¨å’Œ PyTorch 相关的模å—ï¼Œè¯·ç¡®ä¿ PyTorch å·²ç»æˆåŠŸå®‰è£…åœ¨çŽ¯å¢ƒä¸­ï¼Œå¯ä»¥å‚考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。
-
-```python
-pip install mmcv-lite
-```
diff --git a/docs/zh_cn/get_started/introduction.md b/docs/zh_cn/get_started/introduction.md
deleted file mode 100644
index 4c735b9..0000000
--- a/docs/zh_cn/get_started/introduction.md
+++ /dev/null
@@ -1,36 +0,0 @@
-## ä»‹ç» MMCV
-
-MMCV 是一个é¢å‘计算机视觉的基础库,它æä¾›äº†ä»¥ä¸‹åŠŸèƒ½ï¼š
-
-- [图åƒå’Œè§†é¢‘处ç†](../understand_mmcv/data_process.md)
-- [图åƒå’Œæ ‡æ³¨ç»“æžœå¯è§†åŒ–](../understand_mmcv/visualization.md)
-- [图åƒå˜æ¢](../understand_mmcv/data_transform.md)
-- [å¤šç§ CNN 网络结构](../understand_mmcv/cnn.md)
-- [高质é‡å®žçŽ°çš„å¸¸è§ CUDA ç®—å­](../understand_mmcv/ops.md)
-
-MMCV 支æŒå¤šç§å¹³å°ï¼ŒåŒ…括:
-
-- Linux
-- Windows
-- macOS
-
-它支æŒçš„ OpenMMLab 项目:
-
-- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图åƒåˆ†ç±»å·¥å…·ç®±
-- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱
-- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平å°
-- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准
-- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准
-- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱
-- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab å…¨æµç¨‹æ–‡å­—检测识别ç†è§£å·¥å…·ç®±
-- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab å§¿æ€ä¼°è®¡å·¥å…·ç®±
-- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab äººä½“å‚æ•°åŒ–模型工具箱与测试基准
-- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监ç£å­¦ä¹ å·¥å…·ç®±ä¸Žæµ‹è¯•基准
-- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准
-- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准
-- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频ç†è§£å·¥å…·ç®±
-- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平å°
-- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab å…‰æµä¼°è®¡å·¥å…·ç®±ä¸Žæµ‹è¯•基准
-- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图åƒè§†é¢‘编辑工具箱
-- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab å›¾ç‰‡è§†é¢‘ç”Ÿæˆæ¨¡åž‹å·¥å…·ç®±
-- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架
diff --git a/docs/zh_cn/switch_language.md b/docs/zh_cn/switch_language.md
deleted file mode 100644
index e4ac4b2..0000000
--- a/docs/zh_cn/switch_language.md
+++ /dev/null
@@ -1,3 +0,0 @@
-## English
-
-## 简体中文
diff --git a/docs/zh_cn/understand_mmcv/cnn.md b/docs/zh_cn/understand_mmcv/cnn.md
deleted file mode 100644
index 1f91041..0000000
--- a/docs/zh_cn/understand_mmcv/cnn.md
+++ /dev/null
@@ -1,114 +0,0 @@
-## å·ç§¯ç¥žç»ç½‘络
-
-我们为å·ç§¯ç¥žç»ç½‘络æä¾›äº†ä¸€äº›æž„建模å—ï¼ŒåŒ…æ‹¬å±‚æž„å»ºã€æ¨¡å—组件和æƒé‡åˆå§‹åŒ–。
-
-### 网络层的构建
-
-在è¿è¡Œå®žéªŒæ—¶ï¼Œæˆ‘们å¯èƒ½éœ€è¦å°è¯•åŒå±žä¸€ç§ç±»åž‹ä½†ä¸åŒé…置的层,但åˆä¸å¸Œæœ›æ¯æ¬¡éƒ½ä¿®æ”¹ä»£ç ã€‚于是我们æä¾›ä¸€äº›å±‚构建方法,å¯ä»¥ä»Žå­—典构建层,字典å¯ä»¥åœ¨é…置文件中é…置,也å¯ä»¥é€šè¿‡å‘½ä»¤è¡Œå‚数指定。
-
-#### 用法
-
-一个简å•的例å­ï¼š
-
-```python
-from mmcv.cnn import build_conv_layer
-
-cfg = dict(type='Conv3d')
-layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3)
-```
-
-- `build_conv_layer`: 支æŒçš„类型包括 Conv1dã€Conv2dã€Conv3dã€Conv (Conv是Conv2d的别å)
-- `build_norm_layer`: 支æŒçš„类型包括 BN1dã€BN2dã€BN3dã€BN (alias for BN2d)ã€SyncBNã€GNã€LNã€IN1dã€IN2dã€IN3dã€IN(IN是IN2d的别å)
-- `build_activation_layer`:支æŒçš„类型包括 ReLUã€LeakyReLUã€PReLUã€RReLUã€ReLU6ã€ELUã€Sigmoidã€Tanhã€GELU
-- `build_upsample_layer`: 支æŒçš„类型包括 nearestã€bilinearã€deconvã€pixel_shuffle
-- `build_padding_layer`: 支æŒçš„类型包括 zeroã€reflectã€replicate
-
-#### 拓展
-
-我们还å…è®¸è‡ªå®šä¹‰å±‚å’Œç®—å­æ¥æ‰©å±•构建方法。
-
-1. 编写和注册自己的模å—:
-
-   ```python
-   from mmengine.registry import MODELS
-
-   @MODELS.register_module()
-   class MyUpsample:
-
-       def __init__(self, scale_factor):
-           pass
-
-       def forward(self, x):
-           pass
-   ```
-
-2. 在æŸå¤„导入 `MyUpsample` (例如 `__init__.py` )然åŽä½¿ç”¨å®ƒï¼š
-
-   ```python
-   from mmcv.cnn import build_upsample_layer
-
-   cfg = dict(type='MyUpsample', scale_factor=2)
-   layer = build_upsample_layer(cfg)
-   ```
-
-### 模å—组件
-
-我们还æä¾›äº†å¸¸ç”¨çš„æ¨¡å—组件,以方便网络构建。
-å·ç§¯ç»„ä»¶ `ConvModule` ç”± convolutionã€normalization以åŠactivation layers 组æˆï¼Œæ›´å¤šç»†èŠ‚è¯·å‚考 [ConvModule api](api.html#mmcv.cnn.ConvModule)。
-
-```python
-from mmcv.cnn import ConvModule
-
-# conv + bn + relu
-conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))
-# conv + gn + relu
-conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2))
-# conv + relu
-conv = ConvModule(3, 8, 2)
-# conv
-conv = ConvModule(3, 8, 2, act_cfg=None)
-# conv + leaky relu
-conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU'))
-# bn + conv + relu
-conv = ConvModule(
-    3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act'))
-```
-
-### Model Zoo
-
-除了`torchvision`的预训练模型,我们还æä¾›ä»¥ä¸‹ CNN 的预训练模型:
-
-- VGG Caffe
-- ResNet Caffe
-- ResNeXt
-- ResNet with Group Normalization
-- ResNet with Group Normalization and Weight Standardization
-- HRNetV2
-- Res2Net
-- RegNet
-
-#### Model URLs in JSON
-
-MMCV中的Model Zoo Link ç”± JSON 文件管ç†ã€‚ json 文件由模型åç§°åŠå…¶url或path的键值对组æˆ,一个json文件å¯èƒ½ç±»ä¼¼äºŽ:
-
-```json
-{
-    "model_a": "https://example.com/models/model_a_9e5bac.pth",
-    "model_b": "pretrain/model_b_ab3ef2c.pth"
-}
-```
-
-å¯ä»¥åœ¨[此处](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json)找到托管在 OpenMMLab AWS 上的预训练模型的默认链接。
-
-ä½ å¯ä»¥é€šè¿‡å°† `open-mmlab.json` 放在 `MMCV_HOME`下æ¥è¦†ç›–默认链接,如果在环境中找ä¸åˆ°`MMCV_HOME`,则默认使用 `~/.cache/mmcv`。当然你也å¯ä»¥ä½¿ç”¨å‘½ä»¤ `export MMCV_HOME=/your/path`æ¥è®¾ç½®è‡ªå·±çš„路径。
-
-外部的json文件将被åˆå¹¶ä¸ºé»˜è®¤æ–‡ä»¶ï¼Œå¦‚果相åŒçš„键出现在外部`json`和默认`json`中,则将使用外部`json`。
-
-#### Load Checkpoint
-
-`mmcv.load_checkpoint()`çš„å‚æ•°`filename`支æŒä»¥ä¸‹ç±»åž‹ï¼š
-
-- filepath: `checkpoint`路径
-- `http://xxx` and `https://xxx`: 下载checkpoint的链接,文件å中必需包å«`SHA256`åŽç¼€
-- `torchvision://xxx`: `torchvision.models`中的模型链接,更多细节å‚考 [torchvision](https://pytorch.org/docs/stable/torchvision/models.html)
-- `open-mmlab://xxx`: 默认和其他 json 文件中æä¾›çš„æ¨¡åž‹é“¾æŽ¥æˆ–文件路径
diff --git a/docs/zh_cn/understand_mmcv/data_transform.md b/docs/zh_cn/understand_mmcv/data_transform.md
deleted file mode 100644
index 47d16e1..0000000
--- a/docs/zh_cn/understand_mmcv/data_transform.md
+++ /dev/null
@@ -1,341 +0,0 @@
-# æ•°æ®å˜æ¢
-
-在 OpenMMLab 算法库中,数æ®é›†çš„æž„建和数æ®çš„准备是相互解耦的。通常,数æ®é›†çš„æž„建åªå¯¹æ•°æ®é›†è¿›è¡Œè§£æžï¼Œè®°å½•æ¯ä¸ªæ ·æœ¬çš„基本信æ¯ï¼›è€Œæ•°æ®çš„准备则是通过一系列的数æ®å˜æ¢ï¼Œæ ¹æ®æ ·æœ¬çš„基本信æ¯è¿›è¡Œæ•°æ®åŠ è½½ã€é¢„处ç†ã€æ ¼å¼åŒ–ç­‰æ“作。
-
-## æ•°æ®å˜æ¢çš„设计
-
-在 MMCV 中,我们使用å„ç§å¯è°ƒç”¨çš„æ•°æ®å˜æ¢ç±»æ¥è¿›è¡Œæ•°æ®çš„æ“ä½œã€‚è¿™äº›æ•°æ®å˜æ¢ç±»å¯ä»¥æŽ¥å—若干é…ç½®å‚æ•°è¿›è¡Œå®žä¾‹åŒ–,之åŽé€šè¿‡è°ƒç”¨çš„æ–¹å¼å¯¹è¾“入的数æ®å­—典进行处ç†ã€‚åŒæ—¶ï¼Œæˆ‘们约定所有数æ®å˜æ¢éƒ½æŽ¥å—一个字典作为输入,并将处ç†åŽçš„æ•°æ®è¾“出为一个字典。一个简å•的例å­å¦‚下:
-
-```python
->>> import numpy as np
->>> from mmcv.transforms import Resize
->>>
->>> transform = Resize(scale=(224, 224))
->>> data_dict = {'img': np.random.rand(256, 256, 3)}
->>> data_dict = transform(data_dict)
->>> print(data_dict['img'].shape)
-(224, 224, 3)
-```
-
-æ•°æ®å˜æ¢ç±»ä¼šè¯»å–输入字典的æŸäº›å­—段,并且å¯èƒ½æ·»åŠ ã€æˆ–者更新æŸäº›å­—段。这些字段的键大部分情况下是固定的,如 `Resize` 会固定地读å–输入字典中的 `"img"` 等字段。我们å¯ä»¥åœ¨å¯¹åº”类的文档中了解对输入输出字段的约定。
-
-```{note}
-默认情况下,在需è¦å›¾åƒå°ºå¯¸ä½œä¸º**åˆå§‹åŒ–傿•°**的数æ®å˜æ¢ (如Resize, Pad) 中,图åƒå°ºå¯¸çš„顺åºå‡ä¸º (width, height)。在数æ®å˜æ¢**返回的字典**中,图åƒç›¸å…³çš„尺寸, 如 `img_shape`ã€`ori_shape`ã€`pad_shape` 等,å‡ä¸º (height, width)。
-```
-
-MMCV 为所有的数æ®å˜æ¢ç±»æä¾›äº†ä¸€ä¸ªç»Ÿä¸€çš„基类 (`BaseTransform`):
-
-```python
-class BaseTransform(metaclass=ABCMeta):
-
-    def __call__(self, results: dict) -> dict:
-
-        return self.transform(results)
-
-    @abstractmethod
-    def transform(self, results: dict) -> dict:
-        pass
-```
-
-所有的数æ®å˜æ¢ç±»éƒ½éœ€è¦ç»§æ‰¿ `BaseTransform`,并实现 `transform` 方法。`transform` 方法的输入和输出å‡ä¸ºä¸€ä¸ªå­—典。在**自定义数æ®å˜æ¢ç±»**一节中,我们会更详细地介ç»å¦‚何实现一个数æ®å˜æ¢ç±»ã€‚
-
-## æ•°æ®æµæ°´çº¿
-
-如上所述,所有数æ®å˜æ¢çš„è¾“å…¥å’Œè¾“å‡ºéƒ½æ˜¯ä¸€ä¸ªå­—å…¸ï¼Œè€Œä¸”æ ¹æ® OpenMMLab 中 [有关数æ®é›†çš„约定](TODO),数æ®é›†ä¸­æ¯ä¸ªæ ·æœ¬çš„基本信æ¯éƒ½æ˜¯ä¸€ä¸ªå­—典。这样一æ¥ï¼Œæˆ‘们å¯ä»¥å°†æ‰€æœ‰çš„æ•°æ®å˜æ¢æ“ä½œé¦–å°¾ç›¸æŽ¥ï¼Œç»„åˆæˆä¸ºä¸€æ¡æ•°æ®æµæ°´çº¿ï¼ˆdata pipeline),输入数æ®é›†ä¸­æ ·æœ¬çš„ä¿¡æ¯å­—典,输出完æˆä¸€ç³»åˆ—处ç†åŽçš„ä¿¡æ¯å­—典。
-
-ä»¥åˆ†ç±»ä»»åŠ¡ä¸ºä¾‹ï¼Œæˆ‘ä»¬åœ¨ä¸‹å›¾å±•ç¤ºäº†ä¸€ä¸ªå…¸åž‹çš„æ•°æ®æµæ°´çº¿ã€‚对æ¯ä¸ªæ ·æœ¬ï¼Œæ•°æ®é›†ä¸­ä¿å­˜çš„åŸºæœ¬ä¿¡æ¯æ˜¯ä¸€ä¸ªå¦‚å›¾ä¸­æœ€å·¦ä¾§æ‰€ç¤ºçš„å­—å…¸ï¼Œä¹‹åŽæ¯ç»è¿‡ä¸€ä¸ªç”±è“色å—代表的数æ®å˜æ¢æ“作,数æ®å­—典中都会加入新的字段(标记为绿色)或更新现有的字段(标记为橙色)。
-
-
- -
- -在é…ç½®æ–‡ä»¶ä¸­ï¼Œæ•°æ®æµæ°´çº¿æ˜¯ä¸€ä¸ªè‹¥å¹²æ•°æ®å˜æ¢é…置字典组æˆçš„列表,æ¯ä¸ªæ•°æ®é›†éƒ½éœ€è¦è®¾ç½®å‚æ•° `pipeline` æ¥å®šä¹‰è¯¥æ•°æ®é›†éœ€è¦è¿›è¡Œçš„æ•°æ®å‡†å¤‡æ“ä½œã€‚å¦‚ä¸Šæ•°æ®æµæ°´çº¿åœ¨é…置文件中的é…置如下: - -```python -pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='Resize', size=256, keep_ratio=True), - dict(type='CenterCrop', crop_size=224), - dict(type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), - dict(type='ClsFormatBundle') -] - -dataset = dict( - ... - pipeline=pipeline, - ... -) -``` - -## 常用的数æ®å˜æ¢ç±» - -按照功能,常用的数æ®å˜æ¢ç±»å¯ä»¥å¤§è‡´åˆ†ä¸ºæ•°æ®åŠ è½½ã€æ•°æ®é¢„处ç†ä¸Žå¢žå¼ºã€æ•°æ®æ ¼å¼åŒ–。在 MMCV 中,我们æä¾›äº†ä¸€äº›å¸¸ç”¨çš„æ•°æ®å˜æ¢ç±»å¦‚下: - -### æ•°æ®åŠ è½½ - -为了支æŒå¤§è§„模数æ®é›†çš„加载,通常在 `Dataset` åˆå§‹åŒ–æ—¶ä¸åŠ è½½æ•°æ®ï¼ŒåªåŠ è½½ç›¸åº”çš„è·¯å¾„ã€‚å› æ­¤éœ€è¦åœ¨æ•°æ®æµæ°´çº¿ä¸­è¿›è¡Œå…·ä½“æ•°æ®çš„加载。 - -| class | 功能 | -| :-------------------------: | :---------------------------------------: | -| [`LoadImageFromFile`](TODO) | æ ¹æ®è·¯å¾„åŠ è½½å›¾åƒ | -| [`LoadAnnotations`](TODO) | 加载和组织标注信æ¯ï¼Œå¦‚ bboxã€è¯­ä¹‰åˆ†å‰²å›¾ç­‰ | - -### æ•°æ®é¢„处ç†åŠå¢žå¼º - -æ•°æ®é¢„处ç†å’Œå¢žå¼ºé€šå¸¸æ˜¯å¯¹å›¾åƒæœ¬èº«è¿›è¡Œå˜æ¢ï¼Œå¦‚è£å‰ªã€å¡«å……ã€ç¼©æ”¾ç­‰ã€‚ - -| class | 功能 | -| :------------------------------: | :--------------------------------: | -| [`Pad`](TODO) | 填充图åƒè¾¹ç¼˜ | -| [`CenterCrop`](TODO) | 居中è£å‰ª | -| [`Normalize`](TODO) | 对图åƒè¿›è¡Œå½’一化 | -| [`Resize`](TODO) | æŒ‰ç…§æŒ‡å®šå°ºå¯¸æˆ–æ¯”ä¾‹ç¼©æ”¾å›¾åƒ | -| [`RandomResize`](TODO) | 缩放图åƒè‡³æŒ‡å®šèŒƒå›´çš„éšæœºå°ºå¯¸ | -| [`RandomMultiscaleResize`](TODO) | 缩放图åƒè‡³å¤šä¸ªå°ºå¯¸ä¸­çš„éšæœºä¸€ä¸ªå°ºå¯¸ | -| [`RandomGrayscale`](TODO) | éšæœºç°åº¦åŒ– | -| [`RandomFlip`](TODO) | 图åƒéšæœºç¿»è½¬ | -| [`MultiScaleFlipAug`](TODO) | 支æŒç¼©æ”¾å’Œç¿»è½¬çš„æµ‹è¯•æ—¶æ•°æ®å¢žå¼º | - -### æ•°æ®æ ¼å¼åŒ– - -æ•°æ®æ ¼å¼åŒ–æ“作通常是对数æ®è¿›è¡Œçš„类型转æ¢ã€‚ - -| class | 功能 | -| :---------------------: | :-------------------------------: | -| [`ToTensor`](TODO) | 将指定的数æ®è½¬æ¢ä¸º `torch.Tensor` | -| 
[`ImageToTensor`](TODO) | 将图åƒè½¬æ¢ä¸º `torch.Tensor` | - -## 自定义数æ®å˜æ¢ç±» - -è¦å®žçŽ°ä¸€ä¸ªæ–°çš„æ•°æ®å˜æ¢ç±»ï¼Œéœ€è¦ç»§æ‰¿ `BaseTransform`,并实现 `transform` 方法。这里,我们使用一个简å•çš„ç¿»è½¬å˜æ¢ï¼ˆ`MyFlip`)作为示例: - -```python -import random -import mmcv -from mmcv.transforms import BaseTransform, TRANSFORMS - -@TRANSFORMS.register_module() -class MyFlip(BaseTransform): - def __init__(self, direction: str): - super().__init__() - self.direction = direction - - def transform(self, results: dict) -> dict: - img = results['img'] - results['img'] = mmcv.imflip(img, direction=self.direction) - return results -``` - -从而,我们å¯ä»¥å®žä¾‹åŒ–一个 `MyFlip` 对象,并将之作为一个å¯è°ƒç”¨å¯¹è±¡ï¼Œæ¥å¤„ç†æˆ‘们的数æ®å­—典。 - -```python -import numpy as np - -transform = MyFlip(direction='horizontal') -data_dict = {'img': np.random.rand(224, 224, 3)} -data_dict = transform(data_dict) -processed_img = data_dict['img'] -``` - -åˆæˆ–者,在é…置文件的 pipeline 中使用 `MyFlip` å˜æ¢ - -```python -pipeline = [ - ... - dict(type='MyFlip', direction='horizontal'), - ... -] -``` - -éœ€è¦æ³¨æ„的是,如需在é…置文件中使用,需è¦ä¿è¯ `MyFlip` 类所在的文件在è¿è¡Œæ—¶èƒ½å¤Ÿè¢«å¯¼å…¥ã€‚ - -## å˜æ¢åŒ…装 - -å˜æ¢åŒ…装是一ç§ç‰¹æ®Šçš„æ•°æ®å˜æ¢ç±»ï¼Œä»–ä»¬æœ¬èº«å¹¶ä¸æ“作数æ®å­—典中的图åƒã€æ ‡ç­¾ç­‰ä¿¡æ¯ï¼Œè€Œæ˜¯å¯¹å…¶ä¸­å®šä¹‰çš„æ•°æ®å˜æ¢çš„行为进行增强。 - -### 字段映射(KeyMapper) - -字段映射包装(`KeyMapper`)用于对数æ®å­—典中的字段进行映射。例如,一般的图åƒå¤„ç†å˜æ¢éƒ½ä»Žæ•°æ®å­—典中的 `"img"` å­—æ®µèŽ·å¾—å€¼ã€‚ä½†æœ‰äº›æ—¶å€™ï¼Œæˆ‘ä»¬å¸Œæœ›è¿™äº›å˜æ¢å¤„ç†æ•°æ®å­—典中其他字段中的图åƒï¼Œæ¯”如 `"gt_img"` 字段。 - -如果é…åˆæ³¨å†Œå™¨å’Œé…置文件使用的è¯ï¼Œåœ¨é…置文件中数æ®é›†çš„ `pipeline` 中如下例使用字段映射包装: - -```python -pipeline = [ - ... - dict(type='KeyMapper', - mapping={ - 'img': 'gt_img', # å°† "gt_img" 字段映射至 "img" 字段 - 'mask': ..., # ä¸ä½¿ç”¨åŽŸå§‹æ•°æ®ä¸­çš„ "mask" 字段。å³å¯¹äºŽè¢«åŒ…装的数æ®å˜æ¢ï¼Œæ•°æ®ä¸­ä¸åŒ…å« "mask" 字段 - }, - auto_remap=True, # 在完æˆå˜æ¢åŽï¼Œå°† "img" 釿˜ å°„回 "gt_img" 字段 - transforms=[ - # 在 `RandomFlip` å˜æ¢ç±»ä¸­ï¼Œæˆ‘们åªéœ€è¦æ“作 "img" 字段å³å¯ - dict(type='RandomFlip'), - ]) - ... 
-] -``` - -利用字段映射包装,我们在实现数æ®å˜æ¢ç±»æ—¶ï¼Œä¸éœ€è¦è€ƒè™‘在 `transform` 方法中考虑å„ç§å¯èƒ½çš„输入字段å,åªéœ€è¦å¤„ç†é»˜è®¤çš„字段å³å¯ã€‚ - -### éšæœºé€‰æ‹©ï¼ˆRandomChoiceï¼‰å’Œéšæœºæ‰§è¡Œï¼ˆRandomApply) - -éšæœºé€‰æ‹©åŒ…装(`RandomChoice`)用于从一系列数æ®å˜æ¢ç»„åˆä¸­éšæœºåº”ç”¨ä¸€ä¸ªæ•°æ®å˜æ¢ç»„åˆã€‚利用这一包装,我们å¯ä»¥ç®€å•地实现一些数æ®å¢žå¼ºåŠŸèƒ½ï¼Œæ¯”å¦‚ AutoAugment。 - -如果é…åˆæ³¨å†Œå™¨å’Œé…置文件使用的è¯ï¼Œåœ¨é…置文件中数æ®é›†çš„ `pipeline` ä¸­å¦‚ä¸‹ä¾‹ä½¿ç”¨éšæœºé€‰æ‹©åŒ…装: - -```python -pipeline = [ - ... - dict(type='RandomChoice', - transforms=[ - [ - dict(type='Posterize', bits=4), - dict(type='Rotate', angle=30.) - ], # 第一ç§éšæœºå˜åŒ–ç»„åˆ - [ - dict(type='Equalize'), - dict(type='Rotate', angle=30) - ], # 第二ç§éšæœºå˜æ¢ç»„åˆ - ], - prob=[0.4, 0.6] # 两ç§éšæœºå˜æ¢ç»„åˆå„自的选用概率 - ) - ... -] -``` - -éšæœºæ‰§è¡ŒåŒ…装(`RandomApply`ï¼‰ç”¨äºŽä»¥æŒ‡å®šæ¦‚çŽ‡éšæœºæ‰§è¡Œæ•°æ®å˜æ¢ç»„åˆã€‚例如: - -```python -pipeline = [ - ... - dict(type='RandomApply', - transforms=[dict(type='Rotate', angle=30.)], - prob=0.3) # 以 0.3 的概率执行被包装的数æ®å˜æ¢ - ... -] -``` - -### 多目标扩展(TransformBroadcaster) - -通常,一个数æ®å˜æ¢ç±»åªä¼šä»Žä¸€ä¸ªå›ºå®šçš„å­—æ®µè¯»å–æ“作目标。虽然我们也å¯ä»¥ä½¿ç”¨ `KeyMapper` æ¥æ”¹å˜è¯»å–çš„å­—æ®µï¼Œä½†æ— æ³•å°†å˜æ¢ä¸€æ¬¡æ€§åº”用于多个字段的数æ®ã€‚为了实现这一功能,我们需è¦å€ŸåŠ©å¤šç›®æ ‡æ‰©å±•åŒ…è£…ï¼ˆ`TransformBroadcaster`)。 - -多目标扩展包装(`TransformBroadcaster`)有两个用法,一是将数æ®å˜æ¢ä½œç”¨äºŽæŒ‡å®šçš„多个字段,二是将数æ®å˜æ¢ä½œç”¨äºŽæŸä¸ªå­—段下的一组目标中。 - -1. 
应用于多个字段 - - å‡è®¾æˆ‘们需è¦å°†æ•°æ®å˜æ¢åº”用于 `"lq"` (low-quality) å’Œ `"gt"` (ground-truth) 两个字段中的图åƒä¸Šã€‚ - - ```python - pipeline = [ - dict(type='TransformBroadcaster', - # 分别应用于 "lq" å’Œ "gt" 两个字段,并将二者应设置 "img" 字段 - mapping={'img': ['lq', 'gt']}, - # 在完æˆå˜æ¢åŽï¼Œå°† "img" å­—æ®µé‡æ˜ å°„回原先的字段 - auto_remap=True, - # 是å¦åœ¨å¯¹å„ç›®æ ‡çš„å˜æ¢ä¸­å…±äº«éšæœºå˜é‡ - # 更多介ç»å‚加åŽç»­ç« èŠ‚ï¼ˆéšæœºå˜é‡å…±äº«ï¼‰ - share_random_params=True, - transforms=[ - # 在 `RandomFlip` å˜æ¢ç±»ä¸­ï¼Œæˆ‘们åªéœ€è¦æ“作 "img" 字段å³å¯ - dict(type='RandomFlip'), - ]) - ] - ``` - - 在多目标扩展的 `mapping` è®¾ç½®ä¸­ï¼Œæˆ‘ä»¬åŒæ ·å¯ä»¥ä½¿ç”¨ `...` æ¥å¿½ç•¥æŒ‡å®šçš„原始字段。如以下例å­ä¸­ï¼Œè¢«åŒ…裹的 `RandomCrop` 会对字段 `"img"` 中的图åƒè¿›è¡Œè£å‰ªï¼Œå¹¶ä¸”在字段 `"img_shape"` 存在时更新剪è£åŽçš„图åƒå¤§å°ã€‚å¦‚æžœæˆ‘ä»¬å¸Œæœ›åŒæ—¶å¯¹ä¸¤ä¸ªå›¾åƒå­—段 `"lq"` å’Œ `"gt"` 进行相åŒçš„éšæœºè£å‰ªï¼Œä½†åªæ›´æ–°ä¸€æ¬¡ `"img_shape"` 字段,å¯ä»¥é€šè¿‡ä¾‹å­ä¸­çš„æ–¹å¼å®žçŽ°ï¼š - - ```python - pipeline = [ - dict(type='TransformBroadcaster', - mapping={ - 'img': ['lq', 'gt'], - 'img_shape': ['img_shape', ...], - }, - # 在完æˆå˜æ¢åŽï¼Œå°† "img" å’Œ "img_shape" å­—æ®µé‡æ˜ å°„回原先的字段 - auto_remap=True, - # 是å¦åœ¨å¯¹å„ç›®æ ‡çš„å˜æ¢ä¸­å…±äº«éšæœºå˜é‡ - # 更多介ç»å‚加åŽç»­ç« èŠ‚ï¼ˆéšæœºå˜é‡å…±äº«ï¼‰ - share_random_params=True, - transforms=[ - # `RandomCrop` 类中会æ“作 "img" å’Œ "img_shape" 字段。若 "img_shape" 空缺, - # åˆ™åªæ“作 "img" - dict(type='RandomCrop'), - ]) - ] - ``` - -2. 
应用于一个字段的一组目标 - - å‡è®¾æˆ‘们需è¦å°†æ•°æ®å˜æ¢åº”用于 `"images"` 字段,该字段为一个图åƒç»„æˆçš„ list。 - - ```python - pipeline = [ - dict(type='TransformBroadcaster', - # å°† "images" 字段下的æ¯å¼ å›¾ç‰‡æ˜ å°„至 "img" 字段 - mapping={'img': 'images'}, - # 在完æˆå˜æ¢åŽï¼Œå°† "img" å­—æ®µä¸‹çš„å›¾ç‰‡é‡æ˜ å°„回 "images" 字段的列表中 - auto_remap=True, - # 是å¦åœ¨å¯¹å„ç›®æ ‡çš„å˜æ¢ä¸­å…±äº«éšæœºå˜é‡ - share_random_params=True, - transforms=[ - # 在 `RandomFlip` å˜æ¢ç±»ä¸­ï¼Œæˆ‘们åªéœ€è¦æ“作 "img" 字段å³å¯ - dict(type='RandomFlip'), - ]) - ] - ``` - -#### 装饰器 `cache_randomness` - -在 `TransformBroadcaster` 中,我们æä¾›äº† `share_random_params` é€‰é¡¹æ¥æ”¯æŒåœ¨å¤šæ¬¡æ•°æ®å˜æ¢ä¸­å…±äº«éšæœºçŠ¶æ€ã€‚ä¾‹å¦‚ï¼Œåœ¨è¶…åˆ†è¾¨çŽ‡ä»»åŠ¡ä¸­ï¼Œæˆ‘ä»¬å¸Œæœ›å°†éšæœºå˜æ¢**åŒæ­¥**作用于低分辨率图åƒå’ŒåŽŸå§‹å›¾åƒã€‚如果我们希望在自定义的数æ®å˜æ¢ç±»ä¸­ä½¿ç”¨è¿™ä¸€åŠŸèƒ½ï¼Œéœ€è¦åœ¨ç±»ä¸­æ ‡æ³¨å“ªäº›éšæœºå˜é‡æ˜¯æ”¯æŒå…±äº«çš„。这å¯ä»¥é€šè¿‡è£…饰器 `cache_randomness` æ¥å®žçŽ°ã€‚ - -以上文中的 `MyFlip` ä¸ºä¾‹ï¼Œæˆ‘ä»¬å¸Œæœ›ä»¥ä¸€å®šçš„æ¦‚çŽ‡éšæœºæ‰§è¡Œç¿»è½¬ï¼š - -```python -from mmcv.transforms.utils import cache_randomness - -@TRANSFORMS.register_module() -class MyRandomFlip(BaseTransform): - def __init__(self, prob: float, direction: str): - super().__init__() - self.prob = prob - self.direction = direction - - @cache_randomness # 标注该方法的输出为å¯å…±äº«çš„éšæœºå˜é‡ - def do_flip(self): - flip = True if random.random() > self.prob else False - return flip - - def transform(self, results: dict) -> dict: - img = results['img'] - if self.do_flip(): - results['img'] = mmcv.imflip(img, direction=self.direction) - return results -``` - -在上é¢çš„例å­ä¸­ï¼Œæˆ‘们用`cache_randomness` 装饰 `do_flip`方法,å³å°†è¯¥æ–¹æ³•返回值 `flip` 标注为一个支æŒå…±äº«çš„éšæœºå˜é‡ã€‚进而,在 `TransformBroadcaster` å¯¹å¤šä¸ªç›®æ ‡çš„å˜æ¢ä¸­ï¼Œè¿™ä¸€å˜é‡çš„å€¼éƒ½ä¼šä¿æŒä¸€è‡´ã€‚ - -#### 装饰器 `avoid_cache_randomness` - -在一些情况下,我们无法将数æ®å˜æ¢ä¸­äº§ç”Ÿéšæœºå˜é‡çš„过程å•独放在类方法中。例如数æ®å˜æ¢ä¸­ä½¿ç”¨çš„æ¥è‡ªç¬¬ä¸‰æ–¹åº“çš„æ¨¡å—,这些模å—å°†éšæœºå˜é‡ç›¸å…³çš„部分å°è£…在了内部,导致无法将其抽出为数æ®å˜æ¢çš„类方法。这样的数æ®å˜æ¢æ— æ³•通过装饰器 `cache_randomness` 
标注支æŒå…±äº«çš„éšæœºå˜é‡ï¼Œè¿›è€Œæ— æ³•åœ¨å¤šç›®æ ‡æ‰©å±•æ—¶å…±äº«éšæœºå˜é‡ã€‚ - -为了é¿å…在多目标扩展中误用此类数æ®å˜æ¢ï¼Œæˆ‘们æä¾›äº†å¦ä¸€ä¸ªè£…饰器 `avoid_cache_randomness`,用æ¥å¯¹æ­¤ç±»æ•°æ®å˜æ¢è¿›è¡Œæ ‡è®°ï¼š - -```python -from mmcv.transforms.utils import avoid_cache_randomness - -@TRANSFORMS.register_module() -@avoid_cache_randomness -class MyRandomTransform(BaseTransform): - - def transform(self, results: dict) -> dict: - ... -``` - -用 `avoid_cache_randomness` 标记的数æ®å˜æ¢ç±»ï¼Œå½“其实例被 `TransformBroadcaster` åŒ…è£…ä¸”å°†å‚æ•° `share_random_params` 设置为 True 时,会抛出异常,以此æé†’用户ä¸èƒ½è¿™æ ·ä½¿ç”¨ã€‚ - -在使用 `avoid_cache_randomness` æ—¶éœ€è¦æ³¨æ„以下几点: - -1. `avoid_cache_randomness` åªç”¨äºŽè£…饰数æ®å˜æ¢ç±»ï¼ˆBaseTransfrom çš„å­ç±»ï¼‰ï¼Œè€Œä¸èƒ½ç”¨ä¸Žè£…饰其他一般的类ã€ç±»æ–¹æ³•或函数 -2. 被 `avoid_cache_randomness` 修饰的数æ®å˜æ¢ä½œä¸ºåŸºç±»æ—¶ï¼Œå…¶å­ç±»å°†**ä¸ä¼šç»§æ‰¿**这一特性。如果å­ç±»ä»æ— æ³•å…±äº«éšæœºå˜é‡ï¼Œåˆ™åº”冿¬¡ä½¿ç”¨ `avoid_cache_randomness` 修饰 -3. åªæœ‰å½“一个数æ®å˜æ¢å…·æœ‰éšæœºæ€§ï¼Œä¸”æ— æ³•å…±äº«éšæœºå‚数时,æ‰éœ€è¦ä»¥ `avoid_cache_randomness` ä¿®é¥°ã€‚æ— éšæœºæ€§çš„æ•°æ®å˜æ¢ä¸éœ€è¦ä¿®é¥° diff --git a/docs/zh_cn/understand_mmcv/ops.md b/docs/zh_cn/understand_mmcv/ops.md deleted file mode 100644 index 11b885d..0000000 --- a/docs/zh_cn/understand_mmcv/ops.md +++ /dev/null @@ -1,66 +0,0 @@ -## ç®—å­ - -MMCV æä¾›äº†æ£€æµ‹ã€åˆ†å‰²ç­‰ä»»åŠ¡ä¸­å¸¸ç”¨çš„ç®—å­ - -| Device | CPU | CUDA | MLU | MPS | Ascend | -| ---------------------------- | --- | ---- | --- | --- | ------ | -| ActiveRotatedFilter | √ | √ | | | | -| AssignScoreWithK | | √ | | | | -| BallQuery | | √ | | | | -| BBoxOverlaps | | √ | √ | √ | √ | -| BorderAlign | | √ | | | | -| BoxIouRotated | √ | √ | | | | -| BoxIouQuadri | √ | √ | | | | -| CARAFE | | √ | √ | | | -| ChamferDistance | | √ | | | | -| CrissCrossAttention | | √ | | | | -| ContourExpand | √ | | | | | -| ConvexIoU | | √ | | | | -| CornerPool | | √ | | | | -| Correlation | | √ | | | | -| Deformable Convolution v1/v2 | √ | √ | | | √ | -| Deformable RoIPool | | √ | √ | | √ | -| 
DiffIoURotated | | √ | | | | -| DynamicScatter | | √ | | | | -| FurthestPointSample | | √ | | | | -| FurthestPointSampleWithDist | | √ | | | | -| FusedBiasLeakyrelu | | √ | | | √ | -| GatherPoints | | √ | | | √ | -| GroupPoints | | √ | | | | -| Iou3d | | √ | √ | | | -| KNN | | √ | | | | -| MaskedConv | | √ | √ | | √ | -| MergeCells | | √ | | | | -| MinAreaPolygon | | √ | | | | -| ModulatedDeformConv2d | √ | √ | | | √ | -| MultiScaleDeformableAttn | | √ | √ | | | -| NMS | √ | √ | √ | | √ | -| NMSRotated | √ | √ | | | √ | -| NMSQuadri | √ | √ | | | | -| PixelGroup | √ | | | | | -| PointsInBoxes | √ | √ | | | | -| PointsInPolygons | | √ | | | | -| PSAMask | √ | √ | √ | | √ | -| RotatedFeatureAlign | √ | √ | | | | -| RoIPointPool3d | | √ | √ | | | -| RoIPool | | √ | √ | | √ | -| RoIAlignRotated | √ | √ | √ | | | -| RiRoIAlignRotated | | √ | | | | -| RoIAlign | √ | √ | √ | | | -| RoIAwarePool3d | | √ | √ | | | -| SAConv2d | | √ | | | | -| SigmoidFocalLoss | | √ | √ | | √ | -| SoftmaxFocalLoss | | √ | | | √ | -| SoftNMS | | √ | | | | -| Sparse Convolution | | √ | | | | -| Synchronized BatchNorm | | √ | | | | -| ThreeInterpolate | | √ | | | | -| ThreeNN | | √ | √ | | | -| TINShift | | √ | √ | | | -| UpFirDn2d | | √ | | | | -| Voxelization | √ | √ | | | √ | -| PrRoIPool | | √ | | | | -| BezierAlign | √ | √ | | | | -| BiasAct | | √ | | | | -| FilteredLrelu | | √ | | | | -| Conv2dGradfix | | √ | | | | diff --git a/docs/zh_cn/Makefile b/docs_zh_CN/Makefile similarity index 100% rename from docs/zh_cn/Makefile rename to docs_zh_CN/Makefile diff --git a/docs/zh_cn/_static/css/readthedocs.css b/docs_zh_CN/_static/css/readthedocs.css similarity index 75% rename from docs/zh_cn/_static/css/readthedocs.css rename to docs_zh_CN/_static/css/readthedocs.css index 9e3a567..3f425fc 100644 --- a/docs/zh_cn/_static/css/readthedocs.css +++ b/docs_zh_CN/_static/css/readthedocs.css @@ -4,7 +4,3 @@ height: 40px; width: 85px; } - -table.colwidths-auto td { - width: 50% -} diff --git 
a/docs/zh_cn/_static/image/mmcv-logo.png b/docs_zh_CN/_static/image/mmcv-logo.png similarity index 100% rename from docs/zh_cn/_static/image/mmcv-logo.png rename to docs_zh_CN/_static/image/mmcv-logo.png diff --git a/docs_zh_CN/api.rst b/docs_zh_CN/api.rst new file mode 100644 index 0000000..8ca9118 --- /dev/null +++ b/docs_zh_CN/api.rst @@ -0,0 +1,44 @@ +fileio +------- +.. automodule:: mmcv.fileio + :members: + +image +------ +.. automodule:: mmcv.image + :members: + +video +------ +.. automodule:: mmcv.video + :members: + +arraymisc +--------- +.. automodule:: mmcv.arraymisc + :members: + +visualization +-------------- +.. automodule:: mmcv.visualization + :members: + +utils +----- +.. automodule:: mmcv.utils + :members: + +cnn +---- +.. automodule:: mmcv.cnn + :members: + +runner +------ +.. automodule:: mmcv.runner + :members: + +ops +------ +.. automodule:: mmcv.ops + :members: diff --git a/docs_zh_CN/community/contributing.md b/docs_zh_CN/community/contributing.md new file mode 100644 index 0000000..30bac87 --- /dev/null +++ b/docs_zh_CN/community/contributing.md @@ -0,0 +1,69 @@ +## è´¡çŒ®ä»£ç  + +欢迎任何类型的贡献,包括但ä¸é™äºŽ + +- 修改拼写错误或代ç é”™è¯¯ +- 添加文档或将文档翻译æˆå…¶ä»–语言 +- 添加新功能和新组件 + +### å·¥ä½œæµ +| 详细工作æµè§ [拉å–请求](pr.md) +1. å¤åˆ»å¹¶æ‹‰å–最新的 OpenMMLab 算法库 +2. 创建新的分支(ä¸å»ºè®®ä½¿ç”¨ä¸»åˆ†æ”¯ææ‹‰å–请求) +3. æäº¤ä½ çš„修改 +4. 
创建拉å–请求 + +```{note} +å¦‚æžœä½ è®¡åˆ’æ·»åŠ æ–°åŠŸèƒ½å¹¶ä¸”è¯¥åŠŸèƒ½åŒ…å«æ¯”较大的改动,建议先开 issue 讨论 +``` +### 代ç é£Žæ ¼ + +#### Python + +[PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代ç è§„范,我们使用以下工具检查和格å¼åŒ–ä»£ç  + +- [flake8](http://flake8.pycqa.org/en/latest/): Python 官方å‘布的代ç è§„范检查工具,是多个检查工具的å°è£… +- [yapf](https://github.com/google/yapf): Google å‘布的代ç è§„范检查工具 +- [isort](https://github.com/timothycrosley/isort): 自动调整模å—导入顺åºçš„工具 +- [markdownlint](https://github.com/markdownlint/markdownlint): 检查 markdown 文件的工具 +- [docformatter](https://github.com/myint/docformatter): æ ¼å¼åŒ– docstring 的工具 + +yapf å’Œ isort çš„é…ç½®å¯ä»¥åœ¨ [setup.cfg](./setup.cfg) 找到 + +通过é…ç½® [pre-commit hook](https://pre-commit.com/) ,我们å¯ä»¥åœ¨æäº¤ä»£ç æ—¶è‡ªåŠ¨æ£€æŸ¥å’Œæ ¼å¼åŒ– `flake8`ã€`yapf`ã€`isort`ã€`trailing whitespaces`ã€`markdown files`, +ä¿®å¤ `end-of-files`ã€`double-quoted-strings`ã€`python-encoding-pragma`ã€`mixed-line-ending`,调整 `requirments.txt` 的包顺åºã€‚ +pre-commit é’©å­çš„é…ç½®å¯ä»¥åœ¨ [.pre-commit-config](./.pre-commit-config.yaml) 找到。 + +在克隆算法库åŽï¼Œä½ éœ€è¦å®‰è£…å¹¶åˆå§‹åŒ– pre-commit é’©å­ + +```shell +pip install -U pre-commit +``` + +切æ¢ç®—法库根目录 + +```shell +pre-commit install +``` + +如果安装 markdownlint é‡åˆ°äº†é—®é¢˜ï¼Œå¯ä»¥å°è¯•使用以下的步骤安装 ruby + +```shell +# install rvm +curl -L https://get.rvm.io | bash -s -- --autolibs=read-fail +[[ -s "$HOME/.rvm/scripts/rvm" ]] && source "$HOME/.rvm/scripts/rvm" +rvm autolibs disable + +# install ruby +rvm install 2.7.1 +``` + +或者å‚考 [这个代ç åº“](https://github.com/innerlee/setup) å’Œ [`zzruby.sh`](https://github.com/innerlee/setup/blob/master/zzruby.sh)。 + +至此,æ¯ä¸€æ¬¡ commit ä¿®æ”¹éƒ½ä¼šè§¦å‘ pre-commit æ£€æŸ¥ä»£ç æ ¼å¼ã€‚ + +>æäº¤æ‹‰å–请求å‰ï¼Œè¯·ç¡®ä¿ä½ çš„代ç ç¬¦åˆ yapf çš„æ ¼å¼ + +#### C++ and CUDA + +C++ å’Œ CUDA 的代ç è§„范éµä»Ž [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) diff --git a/docs_zh_CN/community/pr.md b/docs_zh_CN/community/pr.md new file mode 100644 index 
0000000..219e01d --- /dev/null +++ b/docs_zh_CN/community/pr.md @@ -0,0 +1,90 @@ +## 拉å–请求 + +### 什么是拉å–请求? + +`拉å–请求` (Pull Request), [GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests)定义如下。 + +>拉å–请求是一ç§é€šçŸ¥æœºåˆ¶ã€‚你修改了他人的代ç ï¼Œå°†ä½ çš„修改通知原æ¥ä½œè€…,希望他åˆå¹¶ä½ çš„修改。 + +### 基本的工作æµï¼š + +1. èŽ·å–æœ€æ–°çš„代ç åº“ +2. ä»Žä¸»åˆ†æ”¯åˆ›å»ºæœ€æ–°çš„åˆ†æ”¯è¿›è¡Œå¼€å‘ +3. æäº¤ä¿®æ”¹ +4. 推é€ä½ çš„修改并创建一个`拉å–请求` +5. 讨论ã€å®¡æ ¸ä»£ç  +6. 将开å‘分支åˆå¹¶åˆ°ä¸»åˆ†æ”¯ + +### 具体步骤 + +1. èŽ·å–æœ€æ–°çš„代ç åº“ + + 当你第一次æ PR æ—¶ + - å¤åˆ» OpenMMLab 原代ç åº“,点击 GitHub 页é¢å³ä¸Šè§’çš„ **Fork** 按钮å³å¯ + ![avatar](../../docs/_static/community/1.png) + + - 克隆å¤åˆ»çš„代ç åº“到本地 + ```bash + git clone git@github.com:XXX/mmcv.git + ``` + + - 添加原代ç åº“为上游代ç åº“ + ```bash + git remote add upstream git@github.com:open-mmlab/mmcv + ``` + + 从第二个 PR èµ· + - 检出本地代ç åº“的主分支,然åŽä»Žæœ€æ–°çš„原代ç åº“çš„ä¸»åˆ†æ”¯æ‹‰å–æ›´æ–° + ```bash + git checkout master + git pull upstream master + ``` + +2. 从主分支创建一个新的开å‘分支 + ```bash + git checkout -b branchname + ``` + 注æ„:为了ä¿è¯æäº¤åކ岿¸…æ™°å¯è¯»ï¼Œæˆ‘ä»¬å¼ºçƒˆæŽ¨èæ‚¨å…ˆæ£€å‡ºä¸»åˆ†æ”¯ (master),å†åˆ›å»ºæ–°çš„分支。 + +3. æäº¤ä½ çš„修改 + ```bash + # coding + git add [files] + git commit -m 'messages' + ``` + +4. 推é€ä½ çš„修改到å¤åˆ»çš„代ç åº“,并创建一个`拉å–请求` + + 推é€å½“å‰åˆ†æ”¯åˆ°è¿œç«¯å¤åˆ»çš„代ç åº“ + ```bash + git push origin branchname + ``` + + + 创建一个`拉å–请求` + ![avatar](../../docs/_static/community/2.png) + + + 修改`拉å–请求`ä¿¡æ¯æ¨¡æ¿ï¼Œæè¿°ä¿®æ”¹åŽŸå› å’Œä¿®æ”¹å†…å®¹ã€‚è¿˜å¯ä»¥åœ¨ PR æè¿°ä¸­ï¼Œæ‰‹åЍ关è”到相关的`议题` (issue),(更多细节,请å‚考[官方文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。 + +5. è®¨è®ºå¹¶è¯„å®¡ä½ çš„ä»£ç  + + 创建`拉å–请求`时,å¯ä»¥å…³è”给相关人员进行评审 + ![avatar](../../docs/_static/community/3.png) + + + æ ¹æ®è¯„审人员的æ„è§ä¿®æ”¹ä»£ç ï¼Œå¹¶æŽ¨é€ä¿®æ”¹ + +6. 
`拉å–请求`åˆå¹¶ä¹‹åŽåˆ é™¤è¯¥åˆ†æ”¯ +```bash +git branch -d branchname # delete local branch +git push origin --delete branchname # delete remote branch +``` + +### PR 规范 + +1. 使用 [pre-commit hook](https://pre-commit.com),尽é‡å‡å°‘代ç é£Žæ ¼ç›¸å…³é—®é¢˜ +2. 一个PR对应一个短期分支 +3. 粒度è¦ç»†ï¼Œä¸€ä¸ªPRåªåšä¸€ä»¶äº‹æƒ…,é¿å…超大的PR + >- Bad:实现Faster R-CNN + >- Acceptable:ç»™ Faster R-CNN 添加一个 box head + >- Good:ç»™ box head å¢žåŠ ä¸€ä¸ªå‚æ•°æ¥æ”¯æŒè‡ªå®šä¹‰çš„ conv 层数 +4. æ¯æ¬¡ Commit æ—¶éœ€è¦æä¾›æ¸…æ™°ä¸”æœ‰æ„义 commit ä¿¡æ¯ +5. æä¾›æ¸…晰且有æ„义的`拉å–请求`æè¿° + >- 标题写明白任务å称,一般格å¼:[Prefix] Short description of the pull request (Suffix) + >- prefix: 新增功能 [Feature], ä¿® bug [Fix], 文档相关 [Docs], å¼€å‘中 [WIP] (暂时ä¸ä¼šè¢«review) + >- æè¿°é‡Œä»‹ç»`拉å–请求`的主è¦ä¿®æ”¹å†…容,结果,以åŠå¯¹å…¶ä»–部分的影å“, å‚考`拉å–请求`æ¨¡æ¿ + >- å…³è”相关的`议题` (issue) 和其他`拉å–请求` diff --git a/docs/zh_cn/compatibility.md b/docs_zh_CN/compatibility.md similarity index 100% rename from docs/zh_cn/compatibility.md rename to docs_zh_CN/compatibility.md diff --git a/docs/en/conf.py b/docs_zh_CN/conf.py similarity index 61% rename from docs/en/conf.py rename to docs_zh_CN/conf.py index 471bd22..e0c65d0 100644 --- a/docs/en/conf.py +++ b/docs_zh_CN/conf.py @@ -15,19 +15,21 @@ import os import sys import pytorch_sphinx_theme +from m2r import MdInclude +from recommonmark.transform import AutoStructify from sphinx.builders.html import StandaloneHTMLBuilder -sys.path.insert(0, os.path.abspath('../..')) +sys.path.insert(0, os.path.abspath('..')) -version_file = '../../mmcv/version.py' -with open(version_file) as f: +version_file = '../mmcv/version.py' +with open(version_file, 'r') as f: exec(compile(f.read(), version_file, 'exec')) __version__ = locals()['__version__'] # -- Project information ----------------------------------------------------- project = 'mmcv' -copyright = '2018-2022, OpenMMLab' +copyright = '2018-2021, OpenMMLab' author = 'MMCV Authors' # The short X.Y version @@ -47,28 +49,16 @@ release = __version__ extensions = [ 
'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', + 'sphinx.ext.autosectionlabel', 'sphinx_markdown_tables', 'myst_parser', 'sphinx_copybutton', ] # yapf: disable -myst_heading_anchors = 4 - -myst_enable_extensions = ['colon_fence'] - -# Configuration for intersphinx -intersphinx_mapping = { - 'python': ('https://docs.python.org/3', None), - 'numpy': ('https://numpy.org/doc/stable', None), - 'torch': ('https://pytorch.org/docs/stable/', None), - 'mmengine': ('https://mmengine.readthedocs.io/en/latest', None), -} - autodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision'] +autosectionlabel_prefix_document = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -89,7 +79,7 @@ master_doc = 'index' # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = 'zh_CN' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
@@ -118,9 +108,94 @@ html_theme_options = { 'name': 'GitHub', 'url': 'https://github.com/open-mmlab/mmcv' }, - ], - # Specify the language of shared menu - 'menu_lang': 'en', + { + 'name': + '文档', + 'children': [ + { + 'name': 'MMCV', + 'url': 'https://mmcv.readthedocs.io/zh_CN/latest/', + }, + { + 'name': 'MIM', + 'url': 'https://openmim.readthedocs.io/en/latest/' + }, + { + 'name': 'MMAction2', + 'url': 'https://mmaction2.readthedocs.io/zh_CN/latest/', + }, + { + 'name': 'MMClassification', + 'url': + 'https://mmclassification.readthedocs.io/zh_CN/latest/', + }, + { + 'name': 'MMDetection', + 'url': 'https://mmdetection.readthedocs.io/zh_CN/latest/', + }, + { + 'name': 'MMDetection3D', + 'url': + 'https://mmdetection3d.readthedocs.io/zh_CN/latest/', + }, + { + 'name': 'MMEditing', + 'url': 'https://mmediting.readthedocs.io/zh_CN/latest/', + }, + { + 'name': 'MMGeneration', + 'url': 'https://mmgeneration.readthedocs.io/en/latest/', + }, + { + 'name': 'MMOCR', + 'url': 'https://mmocr.readthedocs.io/zh_CN/latest/', + }, + { + 'name': 'MMPose', + 'url': 'https://mmpose.readthedocs.io/zh_CN/latest/', + }, + { + 'name': 'MMSegmentation', + 'url': + 'https://mmsegmentation.readthedocs.io/zh_CN/latest/', + }, + { + 'name': 'MMTracking', + 'url': 'https://mmtracking.readthedocs.io/zh_CN/latest/', + }, + { + 'name': 'MMFlow', + 'url': 'https://mmflow.readthedocs.io/en/latest/', + }, + { + 'name': 'MMFewShot', + 'url': 'https://mmfewshot.readthedocs.io/zh_CN/latest/', + }, + ] + }, + { + 'name': + 'OpenMMLab', + 'children': [ + { + 'name': '主页', + 'url': 'https://openmmlab.com/' + }, + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/' + }, + { + 'name': '推特', + 'url': 'https://twitter.com/OpenMMLab' + }, + { + 'name': '知乎', + 'url': 'https://zhihu.com/people/openmmlab' + }, + ] + }, + ] } # Add any paths that contain custom static files (such as style sheets) here, @@ -213,3 +288,16 @@ StandaloneHTMLBuilder.supported_image_types = [ # Ignore >>> when copying 
code copybutton_prompt_text = r'>>> |\.\.\. ' copybutton_prompt_is_regexp = True + + +def setup(app): + app.add_config_value('no_underscore_emphasis', False, 'env') + app.add_config_value('m2r_parse_relative_links', False, 'env') + app.add_config_value('m2r_anonymous_references', False, 'env') + app.add_config_value('m2r_disable_inline_math', False, 'env') + app.add_directive('mdinclude', MdInclude) + app.add_config_value('recommonmark_config', { + 'auto_toc_tree_section': 'Contents', + 'enable_eval_rst': True, + }, True) + app.add_transform(AutoStructify) diff --git a/docs_zh_CN/deployment/onnx.md b/docs_zh_CN/deployment/onnx.md new file mode 100644 index 0000000..c4e0041 --- /dev/null +++ b/docs_zh_CN/deployment/onnx.md @@ -0,0 +1,19 @@ +## MMCV中ONNX模å—简介 (实验性) + +### register_extra_symbolics + +在将PyTorch模型导出æˆONNXæ—¶ï¼Œéœ€è¦æ³¨å†Œé¢å¤–的符å·å‡½æ•° + +#### 范例 + +```python +import mmcv +from mmcv.onnx import register_extra_symbolics + +opset_version = 11 +register_extra_symbolics(opset_version) +``` + +#### 常è§é—®é¢˜ + +- æ—  diff --git a/docs_zh_CN/deployment/onnxruntime_custom_ops.md b/docs_zh_CN/deployment/onnxruntime_custom_ops.md new file mode 100644 index 0000000..594aefb --- /dev/null +++ b/docs_zh_CN/deployment/onnxruntime_custom_ops.md @@ -0,0 +1,333 @@ +## ONNX Runtimeè‡ªå®šä¹‰ç®—å­ + + + +- [ONNX Runtime自定义算å­](#onnx-runtime自定义算å­) + - [SoftNMS](#softnms) + - [æè¿°](#æè¿°) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°) + - [输入](#输入) + - [输出](#输出) + - [类型约æŸ](#类型约æŸ) + - [RoIAlign](#roialign) + - [æè¿°](#æè¿°-1) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-1) + - [输入](#输入-1) + - [输出](#输出-1) + - [类型约æŸ](#类型约æŸ-1) + - [NMS](#nms) + - [æè¿°](#æè¿°-2) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-2) + - [输入](#输入-2) + - [输出](#输出-2) + - [类型约æŸ](#类型约æŸ-2) + - [grid_sampler](#grid_sampler) + - [æè¿°](#æè¿°-3) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-3) + - [输入](#输入-3) + - [输出](#输出-3) + - [类型约æŸ](#类型约æŸ-3) + - [CornerPool](#cornerpool) + - [æè¿°](#æè¿°-4) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-4) + - [输入](#输入-4) + - 
[输出](#输出-4) + - [类型约æŸ](#类型约æŸ-4) + - [cummax](#cummax) + - [æè¿°](#æè¿°-5) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-5) + - [输入](#输入-5) + - [输出](#输出-5) + - [类型约æŸ](#类型约æŸ-5) + - [cummin](#cummin) + - [æè¿°](#æè¿°-6) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-6) + - [输入](#输入-6) + - [输出](#输出-6) + - [类型约æŸ](#类型约æŸ-6) + - [MMCVModulatedDeformConv2d](#mmcvmodulateddeformconv2d) + - [æè¿°](#æè¿°-7) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-7) + - [输入](#输入-7) + - [输出](#输出-7) + - [类型约æŸ](#类型约æŸ-7) + + + +### SoftNMS + +#### æè¿° + +æ ¹æ®`scores`计算`boxes`çš„soft NMS。 请阅读[Soft-NMS -- Improving Object Detection With One Line of Code](https://arxiv.org/abs/1704.04503)了解细节。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ------- | --------------- | ------------------------------------------------------- | +| `float` | `iou_threshold` | 用æ¥åˆ¤æ–­å€™é€‰æ¡†é‡åˆåº¦çš„阈值,å–值范围[0, 1]。默认值为0 | +| `float` | `sigma` | é«˜æ–¯æ–¹æ³•çš„è¶…å‚æ•° | +| `float` | `min_score` | NMSçš„score阈值 | +| `int` | `method` | NMS的计算方å¼, (0: `naive`, 1: `linear`, 2: `gaussian`) | +| `int` | `offset` | 用æ¥è®¡ç®—候选框的宽高(x2 - x1 + offset)。å¯é€‰å€¼0或1 | + +#### 输入 + +
+
boxes: T
+
输入候选框。形状为(N, 4)的二维张é‡ï¼ŒN为候选框数é‡ã€‚
+
scores: T
+
输入得分。形状为(N, )的一维张é‡ã€‚
+
+ +#### 输出 + +
+
dets: T
+
输出的检测框与得分。形状为(num_valid_boxes, 5)的二维张é‡ï¼Œå†…容为[[x1, y1, x2, y2, score], ...]。num_valid_boxesæ˜¯åˆæ³•的检测框数é‡ã€‚
+
indices: tensor(int64)
+
输出åºå·ã€‚形状为(num_valid_boxes, )的一维张é‡ã€‚
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32) + +### RoIAlign + +#### æè¿° + +在特å¾å›¾ä¸Šè®¡ç®—RoIAlign,通常在åŒé˜¶æ®µç›®æ ‡æ£€æµ‹æ¨¡åž‹çš„bbox_head中使用 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ------- | ---------------- | ------------------------------------------------------- | +| `int` | `output_height` | roi特å¾çš„输出高度 | +| `int` | `output_width` | roi特å¾çš„输出宽度 | +| `float` | `spatial_scale` | 输入检测框的缩放系数 | +| `int` | `sampling_ratio` | 输出的采样率。`0`表示使用密集采样 | +| `str` | `mode` | 池化方å¼ã€‚ `avg`或`max` | +| `int` | `aligned` | 如果`aligned=1`,则åƒç´ ä¼šè¿›è¡Œ-0.5çš„åç§»ä»¥è¾¾åˆ°æ›´å¥½çš„å¯¹é½ | + +#### 输入 + +
+
input: T
+
输入特å¾å›¾ï¼›å½¢çŠ¶ä¸º(N, C, H, W)的四维张é‡ï¼Œå…¶ä¸­N为batch大å°ï¼ŒCä¸ºè¾“å…¥é€šé“æ•°ï¼ŒHå’ŒW为输入特å¾å›¾çš„高和宽。
+
rois: T
+
需è¦è¿›è¡Œæ± åŒ–的感兴趣区域;形状为(num_rois, 5)的二维张é‡ï¼Œå†…容为[[batch_index, x1, y1, x2, y2], ...]。roisçš„åæ ‡ä¸ºè¾“入特å¾å›¾çš„åæ ‡ç³»ã€‚
+
+ +#### 输出 + +
+
feat: T
+
池化的输出;形状为(num_rois, C, output_height, output_width)的四维张é‡ã€‚æ¯ä¸ªè¾“出特å¾feat[i]都与输入感兴趣区域rois[i]一一对应。
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32) + +### NMS + +#### æè¿° + +æ ¹æ®IoUé˜ˆå€¼å¯¹å€™é€‰æ¡†è¿›è¡Œéžæžå¤§å€¼æŠ‘制。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ------- | --------------- | ----------------------------------------------------- | +| `float` | `iou_threshold` | 用æ¥åˆ¤æ–­å€™é€‰æ¡†é‡åˆåº¦çš„阈值,å–值范围[0, 1]。默认值为0 | +| `int` | `offset` | 用æ¥è®¡ç®—候选框的宽高(x2 - x1 + offset)。å¯é€‰å€¼0或1 | + +#### 输入 + +
+
boxes: T
+
输入候选框。形状为(N, 4)的二维张é‡ï¼ŒN为候选框数é‡ã€‚
+
scores: T
+
输入得分。形状为(N, )的一维张é‡ã€‚
+
+ +#### 输出 + +
+
indices: tensor(int32, Linear)
+
被选中的候选框索引。形状为(num_valid_boxes, )的一维张é‡ï¼Œnum_valid_boxes表示被选上的候选框数é‡ã€‚
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32) + +### grid_sampler + +#### æè¿° + +æ ¹æ®`grid`çš„åƒç´ ä½ç½®å¯¹`input`进行网格采样。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ----- | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | +| `int` | `interpolation_mode` | 计算输出使用的æ’值模å¼ã€‚(0: `bilinear` , 1: `nearest`) | +| `int` | `padding_mode` | 边缘填充模å¼ã€‚(0: `zeros`, 1: `border`, 2: `reflection`) | +| `int` | `align_corners` | 如果`align_corners=1`,则æžå€¼(`-1`å’Œ`1`)会被当åšè¾“入边缘åƒç´ çš„中心点。如果`align_corners=0`ï¼Œåˆ™å®ƒä»¬ä¼šè¢«çœ‹åšæ˜¯è¾¹ç¼˜åƒç´ çš„边缘点,å‡å°åˆ†è¾¨çŽ‡å¯¹é‡‡æ ·çš„å½±å“ | + +#### 输入 + +
+
input: T
+
输入特å¾ï¼›å½¢çŠ¶ä¸º(N, C, inH, inW)的四维张é‡ï¼Œå…¶ä¸­N为batch大å°ï¼ŒCä¸ºè¾“å…¥é€šé“æ•°ï¼ŒinHå’ŒinW为输入特å¾å›¾çš„高和宽。
+
grid: T
+
输入网格;形状为(N, outH, outW, 2)的四维张é‡ï¼ŒoutHå’ŒoutW为输出的高和宽。
+
+ +#### 输出 + +
+
output: T
+
输出特å¾ï¼›å½¢çŠ¶ä¸º(N, C, outH, outW)的四维张é‡ã€‚
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear) + +### CornerPool + +#### æè¿° + +对`input`计算CornerPool。请阅读[CornerNet -- Detecting Objects as Paired Keypoints](https://arxiv.org/abs/1808.01244)了解更多细节。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ----- | ------ | -------------------------------------------------------- | +| `int` | `mode` | 池化模å¼ã€‚(0: `top`, 1: `bottom`, 2: `left`, 3: `right`) | + +#### 输入 + +
+
input: T
+
输入特å¾ï¼›å½¢çŠ¶ä¸º(N, C, H, W)的四维张é‡ï¼Œå…¶ä¸­N为batch大å°ï¼ŒCä¸ºè¾“å…¥é€šé“æ•°ï¼ŒHå’ŒW为输入特å¾å›¾çš„高和宽。
+
+ +#### 输出 + +
+
output: T
+
输出特å¾ï¼›å½¢çŠ¶ä¸º(N, C, H, W)的四维张é‡ã€‚
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32) + +### cummax + +#### æè¿° + +返回一个元组(`values`, `indices`),其中`values`为`input`第`dim`维的累计最大值,`indices`为第`dim`维最大值ä½ç½®ã€‚请阅读[torch.cummax](https://pytorch.org/docs/stable/generated/torch.cummax.html)了解更多细节。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ----- | ------ | ------------------ | +| `int` | `dim` | 进行累计计算的维度 | + +#### 输入 + +
+
input: T
+
输入张é‡ï¼›å¯ä»¥ä½¿ä»»æ„形状;也支æŒç©ºTensor
+
+ +#### 输出 + +
+
output: T
+
`input`第`dim`维的累计最大值,形状与`input`相åŒã€‚类型和`input`一致
+
indices: tensor(int64)
+
第`dim`维最大值ä½ç½®ï¼Œå½¢çŠ¶ä¸Ž`input`相åŒã€‚
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32) + +### cummin + +#### æè¿° + +返回一个元组(`values`, `indices`),其中`values`为`input`第`dim`维的累计最å°å€¼ï¼Œ`indices`为第`dim`维最å°å€¼ä½ç½®ã€‚请阅读[torch.cummin](https://pytorch.org/docs/stable/generated/torch.cummin.html)了解更多细节。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ----- | ------ | ------------------ | +| `int` | `dim` | 进行累计计算的维度 | + +#### 输入 + +
+
input: T
+
输入张é‡ï¼›å¯ä»¥æ˜¯ä»»æ„形状;也支æŒç©ºTensor
+
+ +#### 输出 + +
+
output: T
+
`input`第`dim`维的累计最å°å€¼ï¼Œå½¢çŠ¶ä¸Ž`input`相åŒã€‚类型和`input`一致
+
indices: tensor(int64)
+
第`dim`维最å°å€¼ä½ç½®ï¼Œå½¢çŠ¶ä¸Ž`input`相åŒã€‚
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32) + +### MMCVModulatedDeformConv2d + +#### æè¿° + +在输入特å¾ä¸Šè®¡ç®—Modulated Deformable Convolution,请阅读[Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/abs/1811.11168?from=timeline)了解更多细节。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| -------------- | ------------------- | ------------------------------------------------------------- | +| `list of ints` | `stride` | å·ç§¯çš„æ­¥é•¿ (sH, sW) | +| `list of ints` | `padding` | 输入特å¾å¡«å……å¤§å° (padH, padW) | +| `list of ints` | `dilation` | å·ç§¯æ ¸å„元素间隔 (dH, dW) | +| `int` | `deformable_groups` | å¯å˜åç§»é‡çš„分组,通常置ä½1å³å¯ | +| `int` | `groups` | å·ç§¯åˆ†ç»„数,`input_channel`会根æ®è¿™ä¸ªå€¼è¢«åˆ†ä¸ºæ•°ä¸ªåˆ†ç»„进行计算 | + +#### 输入 + +
+
inputs[0]: T
+
输入特å¾ï¼›å½¢çŠ¶ä¸º(N, C, inH, inW)的四维张é‡ï¼Œå…¶ä¸­N为batch大å°ï¼ŒCä¸ºè¾“å…¥é€šé“æ•°ï¼ŒinHå’ŒinW为输入特å¾å›¾çš„高和宽。
+
inputs[1]: T
+
输入åç§»é‡ï¼›å½¢çŠ¶ä¸º(N, deformable_group* 2* kH* kW, outH, outW)的四维张é‡ï¼ŒkHå’ŒkW为输入特å¾å›¾çš„高和宽,outHå’ŒoutW为输入特å¾å›¾çš„高和宽。
+
inputs[2]: T
+
输入掩ç ï¼›å½¢çŠ¶ä¸º(N, deformable_group* kH* kW, outH, outW)的四维张é‡ã€‚
+
inputs[3]: T
+
输入æƒé‡ï¼›å½¢çŠ¶ä¸º(output_channel, input_channel, kH, kW)的四维张é‡ã€‚
+
inputs[4]: T, optional
+
输入åç§»é‡ï¼›å½¢çŠ¶ä¸º(output_channel)的一维张é‡ã€‚
+
+ +#### 输出 + +
+
outputs[0]: T
+
输出特å¾ï¼›å½¢çŠ¶ä¸º(N, output_channel, outH, outW)的四维张é‡ã€‚
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear) diff --git a/docs_zh_CN/deployment/onnxruntime_op.md b/docs_zh_CN/deployment/onnxruntime_op.md new file mode 100644 index 0000000..3898aa1 --- /dev/null +++ b/docs_zh_CN/deployment/onnxruntime_op.md @@ -0,0 +1,127 @@ +## MMCV中的ONNX Runtimeè‡ªå®šä¹‰ç®—å­ + +### ONNX Runtimeä»‹ç» + +**ONNX Runtime**是一个跨平å°çš„æŽ¨ç†ä¸Žè®­ç»ƒåŠ é€Ÿå™¨ï¼Œé€‚é…许多常用的机器学习/深度神ç»ç½‘络框架。请访问[github](https://github.com/microsoft/onnxruntime)了解更多信æ¯ã€‚ + +### ONNXä»‹ç» + +**ONNX**是**Open Neural Network Exchange**的缩写,是许多机器学习/深度神ç»ç½‘络框架使用的*中间表示(IR)*。请访问[github](https://github.com/onnx/onnx)了解更多信æ¯ã€‚ + +### 为什么è¦åœ¨MMCV中添加ONNX自定义算å­ï¼Ÿ + +- 为了验è¯ONNX模型在ONNX Runtime下的推ç†çš„æ­£ç¡®æ€§ã€‚ +- 为了方便使用了`mmcv.ops`自定义算å­çš„æ¨¡åž‹çš„部署工作。 + +### MMCV已支æŒçš„ç®—å­ + +| ç®—å­ | CPU | GPU | MMCV版本 | +| :------------------------------------------------------------------------------: | :---: | :---: | :------: | +| [SoftNMS](onnxruntime_custom_ops.md#softnms) | Y | N | 1.2.3 | +| [RoIAlign](onnxruntime_custom_ops.md#roialign) | Y | N | 1.2.5 | +| [NMS](onnxruntime_custom_ops.md#nms) | Y | N | 1.2.7 | +| [grid_sampler](onnxruntime_custom_ops.md#grid_sampler) | Y | N | 1.3.1 | +| [CornerPool](onnxruntime_custom_ops.md#cornerpool) | Y | N | 1.3.4 | +| [cummax](onnxruntime_custom_ops.md#cummax) | Y | N | 1.3.4 | +| [cummin](onnxruntime_custom_ops.md#cummin) | Y | N | 1.3.4 | +| [MMCVModulatedDeformConv2d](onnxruntime_custom_ops.md#mmcvmodulateddeformconv2d) | Y | N | 1.3.12 | + +### 如何编译ONNX Runtime自定义算å­ï¼Ÿ + +*è¯·æ³¨æ„æˆ‘们仅在**onnxruntime>=1.8.1**çš„Linux x86-64 cpuå¹³å°ä¸Šè¿›è¡Œè¿‡æµ‹è¯•* + +#### 准备工作 + +- 克隆代ç ä»“库 + +```bash +git clone https://github.com/open-mmlab/mmcv.git +``` + +- 从ONNX Runtime下载`onnxruntime-linux`:[releases](https://github.com/microsoft/onnxruntime/releases/tag/v1.8.1),解压缩,根æ®è·¯å¾„创建å˜é‡`ONNXRUNTIME_DIR`并把路径下的lib目录添加到`LD_LIBRARY_PATH`,步骤如下: + +```bash +wget 
https://github.com/microsoft/onnxruntime/releases/download/v1.8.1/onnxruntime-linux-x64-1.8.1.tgz + +tar -zxvf onnxruntime-linux-x64-1.8.1.tgz +cd onnxruntime-linux-x64-1.8.1 +export ONNXRUNTIME_DIR=$(pwd) +export LD_LIBRARY_PATH=$ONNXRUNTIME_DIR/lib:$LD_LIBRARY_PATH +``` + +#### Linux系统下编译 + +```bash +cd mmcv ## to MMCV root directory +MMCV_WITH_OPS=1 MMCV_WITH_ORT=1 python setup.py develop +``` + +### 如何在python下使用ONNX Runtime对导出的ONNX模型åšç¼–译 + +使用`pip`安装ONNX Runtime + +```bash +pip install onnxruntime==1.8.1 +``` + +推ç†èŒƒä¾‹ + +```python +import os + +import numpy as np +import onnxruntime as ort + +from mmcv.ops import get_onnxruntime_op_path + +ort_custom_op_path = get_onnxruntime_op_path() +assert os.path.exists(ort_custom_op_path) +session_options = ort.SessionOptions() +session_options.register_custom_ops_library(ort_custom_op_path) +## exported ONNX model with custom operators +onnx_file = 'sample.onnx' +input_data = np.random.randn(1, 3, 224, 224).astype(np.float32) +sess = ort.InferenceSession(onnx_file, session_options) +onnx_results = sess.run(None, {'input' : input_data}) +``` + +### 如何为MMCV添加ONNX Runtimeçš„è‡ªå®šä¹‰ç®—å­ + +#### å¼€å‘剿醒 + +- 该算å­çš„ONNX Runtime实现尚未在MMCV中支æŒ[已实现算å­åˆ—表](https://github.com/microsoft/onnxruntime/blob/master/docs/OperatorKernels.md)。 +- ç¡®ä¿è¯¥è‡ªå®šä¹‰ç®—å­å¯ä»¥è¢«ONNX导出。 + +#### 添加方法 + +以`soft_nms`为例: + +1. 在ONNX Runtime头文件目录`mmcv/ops/csrc/onnxruntime/`下添加头文件`soft_nms.h` +2. 在ONNX Runtimeæºç ç›®å½•`mmcv/ops/csrc/onnxruntime/cpu/`下添加算å­å®žçް`soft_nms.cpp` +3. 在[onnxruntime_register.cpp](../../mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp)中注册实现的算å­`soft_nms` + + ```c++ + #include "soft_nms.h" + + SoftNmsOp c_SoftNmsOp; + + if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) { + return status; + } + ``` + +4. 
在`tests/test_ops/test_onnx.py`添加å•元测试, + å¯ä»¥å‚考[here](../../tests/test_ops/test_onnx.py)。 + +**最åŽï¼Œæ¬¢è¿Žä¸ºMMCV添加ONNX Runtime自定义算å­** :nerd_face: + +### 已知问题 + +- "RuntimeError: tuple appears in op that does not forward tuples, unsupported kind: `prim::PythonOp`." + 1. 请注æ„`cummax`å’Œ`cummin`ç®—å­æ˜¯åœ¨torch >= 1.5.0被添加的。但他们需è¦åœ¨torch version >= 1.7.0æ‰èƒ½æ­£ç¡®å¯¼å‡ºã€‚å¦åˆ™ä¼šåœ¨å¯¼å‡ºæ—¶å‘生上é¢çš„错误。 + 2. 解决方法:å‡çº§PyTorch到1.7.0以上版本 + +### 引用 + +- [How to export Pytorch model with custom op to ONNX and run it in ONNX Runtime](https://github.com/onnx/tutorials/blob/master/PyTorchCustomOperator/README.md) +- [How to add a custom operator/kernel in ONNX Runtime](https://github.com/microsoft/onnxruntime/blob/master/docs/AddingCustomOp.md) diff --git a/docs_zh_CN/deployment/tensorrt_custom_ops.md b/docs_zh_CN/deployment/tensorrt_custom_ops.md new file mode 100644 index 0000000..123f288 --- /dev/null +++ b/docs_zh_CN/deployment/tensorrt_custom_ops.md @@ -0,0 +1,391 @@ +## TensorRTè‡ªå®šä¹‰ç®—å­ + + + +- [TensorRT自定义算å­](#tensorrt自定义算å­) + - [MMCVRoIAlign](#mmcvroialign) + - [æè¿°](#æè¿°) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°) + - [输入](#输入) + - [输出](#输出) + - [类型约æŸ](#类型约æŸ) + - [ScatterND](#scatternd) + - [æè¿°](#æè¿°-1) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-1) + - [输入](#输入-1) + - [输出](#输出-1) + - [类型约æŸ](#类型约æŸ-1) + - [NonMaxSuppression](#nonmaxsuppression) + - [æè¿°](#æè¿°-2) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-2) + - [输入](#输入-2) + - [输出](#输出-2) + - [类型约æŸ](#类型约æŸ-2) + - [MMCVDeformConv2d](#mmcvdeformconv2d) + - [æè¿°](#æè¿°-3) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-3) + - [输入](#输入-3) + - [输出](#输出-3) + - [类型约æŸ](#类型约æŸ-3) + - [grid_sampler](#grid_sampler) + - [æè¿°](#æè¿°-4) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-4) + - [输入](#输入-4) + - [输出](#输出-4) + - [类型约æŸ](#类型约æŸ-4) + - [cummax](#cummax) + - [æè¿°](#æè¿°-5) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-5) + - [输入](#输入-5) + - [输出](#输出-5) + - [类型约æŸ](#类型约æŸ-5) + - [cummin](#cummin) + - [æè¿°](#æè¿°-6) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-6) + - [输入](#输入-6) + - 
[输出](#输出-6) + - [类型约æŸ](#类型约æŸ-6) + - [MMCVInstanceNormalization](#mmcvinstancenormalization) + - [æè¿°](#æè¿°-7) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-7) + - [输入](#输入-7) + - [输出](#输出-7) + - [类型约æŸ](#类型约æŸ-7) + - [MMCVModulatedDeformConv2d](#mmcvmodulateddeformconv2d) + - [æè¿°](#æè¿°-8) + - [æ¨¡åž‹å‚æ•°](#æ¨¡åž‹å‚æ•°-8) + - [输入](#输入-8) + - [输出](#输出-8) + - [类型约æŸ](#类型约æŸ-8) + + + +### MMCVRoIAlign + +#### æè¿° + +在特å¾å›¾ä¸Šè®¡ç®—RoIAlign,在多数åŒé˜¶æ®µç›®æ ‡æ£€æµ‹æ¨¡åž‹çš„bbox_head中使用 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ------- | ---------------- | ------------------------------------------------------- | +| `int` | `output_height` | roi特å¾çš„输出高度 | +| `int` | `output_width` | roi特å¾çš„输出宽度 | +| `float` | `spatial_scale` | 输入检测框的缩放系数 | +| `int` | `sampling_ratio` | 输出的采样率。`0`表示使用密集采样 | +| `str` | `mode` | 池化方å¼ã€‚ `avg`或`max` | +| `int` | `aligned` | 如果`aligned=1`,则åƒç´ ä¼šè¿›è¡Œ-0.5çš„åç§»ä»¥è¾¾åˆ°æ›´å¥½çš„å¯¹é½ | + +#### 输入 + +
+
inputs[0]: T
+
输入特å¾å›¾ï¼›å½¢çŠ¶ä¸º(N, C, H, W)的四维张é‡ï¼Œå…¶ä¸­N为batch大å°ï¼ŒCä¸ºè¾“å…¥é€šé“æ•°ï¼ŒHå’ŒW为输入特å¾å›¾çš„高和宽。
+
inputs[1]: T
+
需è¦è¿›è¡Œæ± åŒ–的感兴趣区域;形状为(num_rois, 5)的二维张é‡ï¼Œå†…容为[[batch_index, x1, y1, x2, y2], ...]。roisçš„åæ ‡ä¸ºè¾“入特å¾å›¾çš„åæ ‡ç³»ã€‚
+
+ +#### 输出 + +
+
outputs[0]: T
+
池化的输出;形状为(num_rois, C, output_height, output_width)的四维张é‡ã€‚æ¯ä¸ªè¾“出特å¾feat[i]都与输入感兴趣区域rois[i]一一对应。
+
+#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear) + +### ScatterND + +#### æè¿° + +ScatterND接收三个输入,分别为秩为r >= 1çš„`data`,秩为q >= 1çš„`indices`以åŠç§©ä¸º q + r - indices.shape[-1] -1 çš„`update`。输出的计算方å¼ä¸ºï¼šé¦–先创建一个`data`的拷è´ï¼Œç„¶åŽæ ¹æ®`indces`的值使用`update`对拷è´çš„`data`进行更新。注æ„`indices`中ä¸åº”该存在相åŒçš„æ¡ç›®ï¼Œä¹Ÿå°±æ˜¯è¯´å¯¹åŒä¸€ä¸ªä½ç½®è¿›è¡Œä¸€æ¬¡ä»¥ä¸Šçš„æ›´æ–°æ˜¯ä¸å…许的。 + +输出的计算方å¼å¯ä»¥å‚考如下代ç ï¼š + +```python + output = np.copy(data) + update_indices = indices.shape[:-1] + for idx in np.ndindex(update_indices): + output[indices[idx]] = updates[idx] +``` + +#### æ¨¡åž‹å‚æ•° + +æ—  + +#### 输入 + +
+
inputs[0]: T
+
秩为r >= 1的输入`data`
+ +
inputs[1]: tensor(int32, Linear)
+
秩为q >= 1的输入`update`
+ +
inputs[2]: T
+
秩为 q + r - indices.shape[-1] -1 的输入`update`
+
+ +#### 输出 + +
+
outputs[0]: T
+
秩为r >= 1的输出张é‡
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear), tensor(int32, Linear) + +### NonMaxSuppression + +#### æè¿° + +æ ¹æ®IoUé˜ˆå€¼å¯¹å€™é€‰æ¡†è¿›è¡Œéžæžå¤§å€¼æŠ‘制。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ------- | ---------------------------- | ---------------------------------------------------------------------------------------- | +| `int` | `center_point_box` | 0 - 候选框的格å¼ä¸º[y1, x1, y2, x2], 1-候选框的格å¼ä¸º[x_center, y_center, width, height] | +| `int` | `max_output_boxes_per_class` | æ¯ä¸€ç±»æœ€å¤§çš„输出检测框个数。默认为0,输出检测框个数等于输入候选框数 | +| `float` | `iou_threshold` | 用æ¥åˆ¤æ–­å€™é€‰æ¡†é‡åˆåº¦çš„阈值,å–值范围[0, 1]。默认值为0 | +| `float` | `score_threshold` | 用æ¥åˆ¤æ–­å€™é€‰æ¡†æ˜¯å¦åˆæ³•的阈值 | +| `int` | `offset` | 检测框长宽计算方å¼ä¸º(x2 - x1 + offset),å¯é€‰å€¼0或1 | + +#### 输入 + +
+
inputs[0]: T
+
输入候选框。形状为(num_batches, spatial_dimension, 4)的三维张é‡
+
inputs[1]: T
+
输入得分。形状为(num_batches, num_classes, spatial_dimension)的三维张é‡
+
+ +#### 输出 + +
+
outputs[0]: tensor(int32, Linear)
+
被选中的候选框索引。形状为(num_selected_indices, 3)的二维张é‡ã€‚æ¯ä¸€è¡Œå†…容为[batch_index, class_index, box_index]。
+
其中 num_selected_indices=num_batches* num_classes* min(max_output_boxes_per_class, spatial_dimension)。
+
所有未被选中的候选框索引都会被填充为-1
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear) + +### MMCVDeformConv2d + +#### æè¿° + +在输入特å¾ä¸Šè®¡ç®—Deformable Convolution,请阅读[Deformable Convolutional Network](https://arxiv.org/abs/1703.06211)了解更多细节。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| -------------- | ------------------ | --------------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | å·ç§¯çš„æ­¥é•¿ (sH, sW) | +| `list of ints` | `padding` | 输入特å¾å¡«å……å¤§å° (padH, padW) | +| `list of ints` | `dilation` | å·ç§¯æ ¸å„元素间隔 (dH, dW) | +| `int` | `deformable_group` | å¯å˜åç§»é‡çš„分组 | +| `int` | `group` | å·ç§¯åˆ†ç»„数,`input_channel`会根æ®è¿™ä¸ªå€¼è¢«åˆ†ä¸ºæ•°ä¸ªåˆ†ç»„进行计算 | +| `int` | `im2col_step` | å¯å˜å·ç§¯ä½¿ç”¨im2col计算å·ç§¯ã€‚输入与åç§»é‡ä¼šä»¥im2col_step为步长分å—计算,å‡å°‘临时空间的使用é‡ã€‚ | + +#### 输入 + +
+
inputs[0]: T
+
输入特å¾ï¼›å½¢çŠ¶ä¸º(N, C, inH, inW)的四维张é‡ï¼Œå…¶ä¸­N为batch大å°ï¼ŒCä¸ºè¾“å…¥é€šé“æ•°ï¼ŒinHå’ŒinW为输入特å¾å›¾çš„高和宽
+
inputs[1]: T
+
输入åç§»é‡ï¼›å½¢çŠ¶ä¸º(N, deformable_group* 2* kH* kW, outH, outW)的四维张é‡ï¼ŒkHå’ŒkW为输入特å¾å›¾çš„高和宽,outHå’ŒoutW为输入特å¾å›¾çš„高和宽
+
inputs[2]: T
+
输入æƒé‡ï¼›å½¢çŠ¶ä¸º(output_channel, input_channel, kH, kW)的四维张é‡
+
+ +#### 输出 + +
+
outputs[0]: T
+
输出特å¾ï¼›å½¢çŠ¶ä¸º(N, output_channel, outH, outW)的四维张é‡
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear) + +### grid_sampler + +#### æè¿° + +æ ¹æ®`grid`çš„åƒç´ ä½ç½®å¯¹`input`进行网格采样。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ----- | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | +| `int` | `interpolation_mode` | 计算输出使用的æ’值模å¼ã€‚(0: `bilinear` , 1: `nearest`) | +| `int` | `padding_mode` | 边缘填充模å¼ã€‚(0: `zeros`, 1: `border`, 2: `reflection`) | +| `int` | `align_corners` | 如果`align_corners=1`,则æžå€¼(`-1`å’Œ`1`)会被当åšè¾“入边缘åƒç´ çš„中心点。如果`align_corners=0`ï¼Œåˆ™å®ƒä»¬ä¼šè¢«çœ‹åšæ˜¯è¾¹ç¼˜åƒç´ çš„边缘点,å‡å°åˆ†è¾¨çŽ‡å¯¹é‡‡æ ·çš„å½±å“ | + +#### 输入 + +
+
inputs[0]: T
+
输入特å¾ï¼›å½¢çŠ¶ä¸º(N, C, inH, inW)的四维张é‡ï¼Œå…¶ä¸­N为batch大å°ï¼ŒCä¸ºè¾“å…¥é€šé“æ•°ï¼ŒinHå’ŒinW为输入特å¾å›¾çš„高和宽
+
inputs[1]: T
+
输入网格;形状为(N, outH, outW, 2)的四维张é‡ï¼ŒoutHå’ŒoutW为输出的高和宽
+
+ +#### 输出 + +
+
outputs[0]: T
+
输出特å¾ï¼›å½¢çŠ¶ä¸º(N, C, outH, outW)的四维张é‡
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear) + +### cummax + +#### æè¿° + +返回一个元组(`values`, `indices`),其中`values`为`input`第`dim`维的累计最大值,`indices`为第`dim`维最大值ä½ç½®ã€‚请阅读[torch.cummax](https://pytorch.org/docs/stable/generated/torch.cummax.html)了解更多细节。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ----- | ------ | ------------------ | +| `int` | `dim` | 进行累计计算的维度 | + +#### 输入 + +
+
inputs[0]: T
+
输入张é‡ï¼›å¯ä»¥ä½¿ä»»æ„形状
+
+ +#### 输出 + +
+
outputs[0]: T
+
`input`第`dim`维的累计最大值,形状与`input`相åŒã€‚类型和`input`一致
+
outputs[1]: (int32, Linear)
+
第`dim`维最大值ä½ç½®ï¼Œå½¢çŠ¶ä¸Ž`input`相åŒ
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear) + +### cummin + +#### æè¿° + +返回一个元组(`values`, `indices`),其中`values`为`input`第`dim`维的累计最å°å€¼ï¼Œ`indices`为第`dim`维最å°å€¼ä½ç½®ã€‚请阅读[torch.cummin](https://pytorch.org/docs/stable/generated/torch.cummin.html)了解更多细节。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ----- | ------ | ------------------ | +| `int` | `dim` | 进行累计计算的维度 | + +#### 输入 + +
+
inputs[0]: T
+
输入张é‡ï¼›å¯ä»¥ä½¿ä»»æ„形状
+
+ +#### 输出 + +
+
outputs[0]: T
+
`input`第`dim`维的累计最å°å€¼ï¼Œå½¢çŠ¶ä¸Ž`input`相åŒã€‚类型和`input`一致
+
outputs[1]: (int32, Linear)
+
第`dim`维最å°å€¼ä½ç½®ï¼Œå½¢çŠ¶ä¸Ž`input`相åŒ
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear) + +### MMCVInstanceNormalization + +#### æè¿° + +对特å¾è®¡ç®—instance normalization,请阅读[Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022)了解更多详细信æ¯ã€‚ + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| ------- | --------- | ---------------------------- | +| `float` | `epsilon` | 用æ¥é¿å…除0错误。默认为1e-05 | + +#### 输入 + +
+
inputs[0]: T
+
输入特å¾ã€‚形状为(N, C, H, W)的四维张é‡ï¼Œå…¶ä¸­N为batch大å°ï¼ŒCä¸ºè¾“å…¥é€šé“æ•°ï¼ŒHå’ŒW为输入特å¾å›¾çš„高和宽
+
inputs[1]: T
+
输入缩放系数。形状为(C,)的一维张é‡
+
inputs[2]: T
+
输入åç§»é‡ã€‚形状为(C,)的一维张é‡
+
+ +#### 输出 + +
+
outputs[0]: T
+
输出特å¾ã€‚形状为(N, C, H, W)的四维张é‡
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear) + +### MMCVModulatedDeformConv2d + +#### æè¿° + +在输入特å¾ä¸Šè®¡ç®—Modulated Deformable Convolution,请阅读[Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/abs/1811.11168?from=timeline)了解更多细节。 + +#### æ¨¡åž‹å‚æ•° + +| 类型 | 傿•°å | æè¿° | +| -------------- | ------------------- | ------------------------------------------------------------- | +| `list of ints` | `stride` | å·ç§¯çš„æ­¥é•¿ (sH, sW) | +| `list of ints` | `padding` | 输入特å¾å¡«å……å¤§å° (padH, padW) | +| `list of ints` | `dilation` | å·ç§¯æ ¸å„元素间隔 (dH, dW) | +| `int` | `deformable_groups` | å¯å˜åç§»é‡çš„分组,通常置ä½1å³å¯ | +| `int` | `groups` | å·ç§¯åˆ†ç»„数,`input_channel`会根æ®è¿™ä¸ªå€¼è¢«åˆ†ä¸ºæ•°ä¸ªåˆ†ç»„进行计算 | + +#### 输入 + +
+
inputs[0]: T
+
输入特å¾ï¼›å½¢çŠ¶ä¸º(N, C, inH, inW)的四维张é‡ï¼Œå…¶ä¸­N为batch大å°ï¼ŒCä¸ºè¾“å…¥é€šé“æ•°ï¼ŒinHå’ŒinW为输入特å¾å›¾çš„高和宽
+
inputs[1]: T
+
输入åç§»é‡ï¼›å½¢çŠ¶ä¸º(N, deformable_group* 2* kH* kW, outH, outW)的四维张é‡ï¼ŒkHå’ŒkW为输入特å¾å›¾çš„高和宽,outHå’ŒoutW为输入特å¾å›¾çš„高和宽
+
inputs[2]: T
+
输入掩ç ï¼›å½¢çŠ¶ä¸º(N, deformable_group* kH* kW, outH, outW)的四维张é‡
+
inputs[3]: T
+
输入æƒé‡ï¼›å½¢çŠ¶ä¸º(output_channel, input_channel, kH, kW)的四维张é‡
+
inputs[4]: T, optional
+
输入åç§»é‡ï¼›å½¢çŠ¶ä¸º(output_channel)的一维张é‡
+
+ +#### 输出 + +
+
outputs[0]: T
+
输出特å¾ï¼›å½¢çŠ¶ä¸º(N, output_channel, outH, outW)的四维张é‡
+
+ +#### ç±»åž‹çº¦æŸ + +- T:tensor(float32, Linear) diff --git a/docs_zh_CN/deployment/tensorrt_plugin.md b/docs_zh_CN/deployment/tensorrt_plugin.md new file mode 100644 index 0000000..0f385b8 --- /dev/null +++ b/docs_zh_CN/deployment/tensorrt_plugin.md @@ -0,0 +1,177 @@ +## MMCV中的TensorRTè‡ªå®šä¹‰ç®—å­ (实验性) + + + +- [MMCV中的TensorRTè‡ªå®šä¹‰ç®—å­ (实验性)](#mmcv中的tensorrt自定义算å­-实验性) + - [介ç»](#介ç») + - [MMCV中的TensorRTæ’件列表](#mmcv中的tensorrtæ’件列表) + - [如何编译MMCV中的TensorRTæ’ä»¶](#如何编译mmcv中的tensorrtæ’ä»¶) + - [准备](#准备) + - [在Linux上编译](#在linux上编译) + - [创建TensorRT推ç†å¼•擎并在python下进行推ç†](#创建tensorrt推ç†å¼•擎并在python下进行推ç†) + - [如何在MMCV中添加新的TensorRT自定义算å­](#如何在mmcv中添加新的tensorrt自定义算å­) + - [ä¸»è¦æµç¨‹](#ä¸»è¦æµç¨‹) + - [注æ„](#注æ„) + - [已知问题](#已知问题) + - [引用](#引用) + + + +### ä»‹ç» + +**NVIDIA TensorRT**是一个为深度学习模型高性能推ç†å‡†å¤‡çš„软件开å‘工具(SDK)。它包括深度学习推ç†ä¼˜åŒ–器和è¿è¡Œæ—¶ï¼Œå¯ä¸ºæ·±åº¦å­¦ä¹ æŽ¨ç†åº”用æä¾›ä½Žå»¶è¿Ÿå’Œé«˜åžåé‡ã€‚请访问[developer's website](https://developer.nvidia.com/tensorrt)了解更多信æ¯ã€‚ +为了简化TensorRT部署带有MMCV自定义算å­çš„æ¨¡åž‹çš„æµç¨‹ï¼ŒMMCV中添加了一系列TensorRTæ’件。 + +### MMCV中的TensorRTæ’件列表 + +| ONNXç®—å­ | TensorRTæ’ä»¶ | MMCV版本 | +| :-----------------------: | :-----------------------------------------------------------------------------: | :------: | +| MMCVRoiAlign | [MMCVRoiAlign](./tensorrt_custom_ops.md#mmcvroialign) | 1.2.6 | +| ScatterND | [ScatterND](./tensorrt_custom_ops.md#scatternd) | 1.2.6 | +| NonMaxSuppression | [NonMaxSuppression](./tensorrt_custom_ops.md#nonmaxsuppression) | 1.3.0 | +| MMCVDeformConv2d | [MMCVDeformConv2d](./tensorrt_custom_ops.md#mmcvdeformconv2d) | 1.3.0 | +| grid_sampler | [grid_sampler](./tensorrt_custom_ops.md#grid-sampler) | 1.3.1 | +| cummax | [cummax](./tensorrt_custom_ops.md#cummax) | 1.3.5 | +| cummin | [cummin](./tensorrt_custom_ops.md#cummin) | 1.3.5 | +| MMCVInstanceNormalization | [MMCVInstanceNormalization](./tensorrt_custom_ops.md#mmcvinstancenormalization) | 1.3.5 | +| MMCVModulatedDeformConv2d | 
[MMCVModulatedDeformConv2d](./tensorrt_custom_ops.md#mmcvmodulateddeformconv2d) | master | + +æ³¨æ„ + +- 以上所有算å­å‡åœ¨ TensorRT-7.2.1.6.Ubuntu-16.04.x86_64-gnu.cuda-10.2.cudnn8.0 环境下开å‘。 + +### 如何编译MMCV中的TensorRTæ’ä»¶ + +#### 准备 + +- 克隆代ç ä»“库 + +```bash +git clone https://github.com/open-mmlab/mmcv.git +``` + +- 安装TensorRT + +从 [NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-tensorrt-download) 下载åˆé€‚çš„TensorRT版本。 + +比如,对安装了cuda-10.2çš„x86-64çš„Ubuntu 16.04,下载文件为`TensorRT-7.2.1.6.Ubuntu-16.04.x86_64-gnu.cuda-10.2.cudnn8.0.tar.gz`. + +ç„¶åŽä½¿ç”¨ä¸‹é¢æ–¹å¼å®‰è£…å¹¶é…置环境 + +```bash +cd ~/Downloads +tar -xvzf TensorRT-7.2.1.6.Ubuntu-16.04.x86_64-gnu.cuda-10.2.cudnn8.0.tar.gz +export TENSORRT_DIR=`pwd`/TensorRT-7.2.1.6 +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TENSORRT_DIR/lib +``` + +安装pythonä¾èµ–: tensorrt, graphsurgeon, onnx-graphsurgeon + +```bash +pip install $TENSORRT_DIR/python/tensorrt-7.2.1.6-cp37-none-linux_x86_64.whl +pip install $TENSORRT_DIR/onnx_graphsurgeon/onnx_graphsurgeon-0.2.6-py2.py3-none-any.whl +pip install $TENSORRT_DIR/graphsurgeon/graphsurgeon-0.4.5-py2.py3-none-any.whl +``` + +想了解更多通过tar包安装TensorRT,请访问[Nvidia' website](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-721/install-guide/index.html#installing-tar). + +#### 在Linux上编译 + +```bash +cd mmcv ## to MMCV root directory +MMCV_WITH_OPS=1 MMCV_WITH_TRT=1 pip install -e . 
+``` + +### 创建TensorRT推ç†å¼•擎并在pythonä¸‹è¿›è¡ŒæŽ¨ç† + +范例如下: + +```python +import torch +import onnx + +from mmcv.tensorrt import (TRTWrapper, onnx2trt, save_trt_engine, + is_tensorrt_plugin_loaded) + +assert is_tensorrt_plugin_loaded(), 'Requires to complie TensorRT plugins in mmcv' + +onnx_file = 'sample.onnx' +trt_file = 'sample.trt' +onnx_model = onnx.load(onnx_file) + +## Model input +inputs = torch.rand(1, 3, 224, 224).cuda() +## Model input shape info +opt_shape_dict = { + 'input': [list(inputs.shape), + list(inputs.shape), + list(inputs.shape)] +} + +## Create TensorRT engine +max_workspace_size = 1 << 30 +trt_engine = onnx2trt( + onnx_model, + opt_shape_dict, + max_workspace_size=max_workspace_size) + +## Save TensorRT engine +save_trt_engine(trt_engine, trt_file) + +## Run inference with TensorRT +trt_model = TRTWrapper(trt_file, ['input'], ['output']) + +with torch.no_grad(): + trt_outputs = trt_model({'input': inputs}) + output = trt_outputs['output'] + +``` + +### 如何在MMCV中添加新的TensorRTè‡ªå®šä¹‰ç®—å­ + +#### ä¸»è¦æµç¨‹ + +䏋颿˜¯ä¸»è¦çš„æ­¥éª¤ï¼š + +1. 添加c++头文件 +2. 添加c++æºæ–‡ä»¶ +3. 添加cuda kernel文件 +4. 在`trt_plugin.cpp`中注册æ’ä»¶ +5. 在`tests/test_ops/test_tensorrt.py`中添加å•元测试 + +**以RoIAlignç®—å­æ’ä»¶`roi_align`举例。** + +1. 在TensorRT包å«ç›®å½•`mmcv/ops/csrc/tensorrt/`中添加头文件`trt_roi_align.hpp` +2. 在TensorRTæºç ç›®å½•`mmcv/ops/csrc/tensorrt/plugins/`中添加头文件`trt_roi_align.cpp` +3. 在TensorRTæºç ç›®å½•`mmcv/ops/csrc/tensorrt/plugins/`中添加cuda kernel文件`trt_roi_align_kernel.cu` +4. 在[trt_plugin.cpp](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/tensorrt/plugins/trt_plugin.cpp)中注册`roi_align`æ’ä»¶ + + ```c++ + #include "trt_plugin.hpp" + + #include "trt_roi_align.hpp" + + REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); + + extern "C" { + bool initLibMMCVInferPlugins() { return true; } + } // extern "C" + ``` + +5. 
在`tests/test_ops/test_tensorrt.py`中添加å•元测试 + +#### æ³¨æ„ + +- 部分MMCV中的自定义算å­å­˜åœ¨å¯¹åº”çš„cuda实现,在进行TensorRTæ’ä»¶å¼€å‘的时候å¯ä»¥å‚考。 + +### 已知问题 + +- æ—  + +### 引用 + +- [Developer guide of Nvidia TensorRT](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html) +- [TensorRT Open Source Software](https://github.com/NVIDIA/TensorRT) +- [onnx-tensorrt](https://github.com/onnx/onnx-tensorrt) +- [TensorRT python API](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/index.html) +- [TensorRT c++ plugin API](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_plugin.html) diff --git a/docs_zh_CN/faq.md b/docs_zh_CN/faq.md new file mode 100644 index 0000000..e5d6395 --- /dev/null +++ b/docs_zh_CN/faq.md @@ -0,0 +1,37 @@ +## 常è§é—®é¢˜ + +在这里我们列出了用户ç»å¸¸é‡åˆ°çš„问题以åŠå¯¹åº”的解决方法。如果您é‡åˆ°äº†å…¶ä»–常è§çš„问题,并且知é“å¯ä»¥å¸®åˆ°å¤§å®¶çš„解决办法, +æ¬¢è¿Žéšæ—¶ä¸°å¯Œè¿™ä¸ªåˆ—表。 + +- MMCV å’Œ MMDetection 的兼容性问题;"ConvWS is already registered in conv layer" + + 请按照上述说明为您的 MMDetection 版本安装正确版本的 MMCV。 + +- "No module named 'mmcv.ops'"; "No module named 'mmcv._ext'" + + 1. 使用 `pip uninstall mmcv` å¸è½½æ‚¨çŽ¯å¢ƒä¸­çš„ mmcv + 2. 按照上述说明安装 mmcv-full + +- "invalid device function" 或者 "no kernel image is available for execution" + + 1. 检查 GPU çš„ CUDA 计算能力 + 2. è¿è¡Œ `python mmdet/utils/collect_env.py` æ¥æ£€æŸ¥ PyTorchã€torchvision å’Œ MMCV æ˜¯å¦æ˜¯é’ˆå¯¹æ­£ç¡®çš„ GPU 架构构建的 + 您å¯èƒ½éœ€è¦åŽ»è®¾ç½® `TORCH_CUDA_ARCH_LIST` æ¥é‡æ–°å®‰è£… MMCV + 兼容性问题的å¯èƒ½ä¼šå‡ºçŽ°åœ¨ä½¿ç”¨æ—§ç‰ˆçš„ GPUs,如:colab 上的 Tesla K80 (3.7) + 3. 检查è¿è¡ŒçŽ¯å¢ƒæ˜¯å¦å’Œ mmcv/mmdet 编译时的环境相åŒã€‚例如,您å¯èƒ½ä½¿ç”¨ CUDA 10.0 编译 mmcv,但在 CUDA 9.0 的环境中è¿è¡Œå®ƒ + +- "undefined symbol" 或者 "cannot open xxx.so"。 + + 1. 如果符å·å’Œ CUDA/C++ 相关(例如:libcudart.so 或者 GLIBCXX),请检查 CUDA/GCC è¿è¡Œæ—¶çš„版本是å¦å’Œç¼–译 mmcv 的一致 + 2. 如果符å·å’Œ PyTorch 相关(例如:符å·åŒ…å« caffeã€aten å’Œ TH),请检查 PyTorch è¿è¡Œæ—¶çš„版本是å¦å’Œç¼–译 mmcv 的一致 + 3. 
è¿è¡Œ `python mmdet/utils/collect_env.py` 以检查 PyTorchã€torchvision å’Œ MMCV 构建和è¿è¡Œçš„环境是å¦ç›¸åŒ + +- "RuntimeError: CUDA error: invalid configuration argument"。 + + 这个错误å¯èƒ½æ˜¯ç”±äºŽæ‚¨çš„ GPU 性能ä¸ä½³é€ æˆçš„。å°è¯•é™ä½Ž[THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) + çš„å€¼å¹¶é‡æ–°ç¼–译 mmcv。 + +- "RuntimeError: nms is not compiled with GPU support"。 + + 这个错误是由于您的 CUDA 环境没有正确安装。 + 您å¯ä»¥å°è¯•釿–°å®‰è£…您的 CUDA 环境,然åŽåˆ é™¤ mmcv/build æ–‡ä»¶å¤¹å¹¶é‡æ–°ç¼–译 mmcv。 diff --git a/docs_zh_CN/get_started/build.md b/docs_zh_CN/get_started/build.md new file mode 100644 index 0000000..77fb86e --- /dev/null +++ b/docs_zh_CN/get_started/build.md @@ -0,0 +1,222 @@ +## 从æºç ç¼–译 MMCV + +### 在 Linux 或者 macOS 上编译 MMCV + +克隆算法库 + +```bash +git clone https://github.com/open-mmlab/mmcv.git +cd mmcv +``` + +ä½ å¯ä»¥å®‰è£… lite 版本 + +```bash +pip install -e . +``` + +也å¯ä»¥å®‰è£… full 版本 + +```bash +MMCV_WITH_OPS=1 pip install -e . +``` + +如果是在 macOS 上编译,则需è¦åœ¨å®‰è£…命令剿·»åŠ ä¸€äº›çŽ¯å¢ƒå˜é‡ + +```bash +CC=clang CXX=clang++ CFLAGS='-stdlib=libc++' +``` + +例如 + +```bash +CC=clang CXX=clang++ CFLAGS='-stdlib=libc++' MMCV_WITH_OPS=1 pip install -e . 
+``` + +```{note} +如果你打算使用 `opencv-python-headless` è€Œä¸æ˜¯ `opencv-python`,例如在一个很å°çš„容器环境或者没有图形用户界é¢çš„æœåŠ¡å™¨ä¸­ï¼Œä½ å¯ä»¥å…ˆå®‰è£… `opencv-python-headless`,这样在安装 mmcv ä¾èµ–的过程中会跳过 `opencv-python` +``` +### 在 Windows 上编译 MMCV + +在 Windows 上编译 MMCV 比 Linux 夿‚,本节将一步步介ç»å¦‚何在 Windows 上编译 MMCV。 + +#### ä¾èµ–项 + +请首先安装以下的ä¾èµ–项: + +- [Git](https://git-scm.com/download/win):安装期间,请选择 **add git to Path** +- [Visual Studio Community 2019](https://visualstudio.microsoft.com):用于编译 C++ å’Œ CUDA ä»£ç  +- [Miniconda](https://docs.conda.io/en/latest/miniconda.html):包管ç†å·¥å…· +- [CUDA 10.2](https://developer.nvidia.com/cuda-10.2-download-archive):如果åªéœ€è¦ CPU 版本å¯ä»¥ä¸å®‰è£… CUDA,安装CUDAæ—¶ï¼Œå¯æ ¹æ®éœ€è¦è¿›è¡Œè‡ªå®šä¹‰å®‰è£…。如果已ç»å®‰è£…新版本的显å¡é©±åŠ¨ï¼Œå»ºè®®å–æ¶ˆé©±åŠ¨ç¨‹åºçš„安装 + +```{note} +您需è¦çŸ¥é“如何在 Windows 上设置å˜é‡çŽ¯å¢ƒï¼Œå°¤å…¶æ˜¯ "PATH" 的设置,以下安装过程都会用到。 +``` + +#### 设置 Python 环境 + +1. 从 Windows èœå•å¯åЍ Anaconda 命令行 + +```{note} +如 Miniconda 安装程åºå»ºè®®ï¼Œä¸è¦ä½¿ç”¨åŽŸå§‹çš„ `cmd.exe` 或是 `powershell.exe`。命令行有两个版本,一个基于 PowerShell,一个基于传统的 `cmd.exe`。请注æ„以下说明都是使用的基于 PowerShell +``` + +2. 创建一个新的 Conda 环境 + + ```shell + conda create --name mmcv python=3.7 # ç»æµ‹è¯•,3.6, 3.7, 3.8 也能通过 + conda activate mmcv # ç¡®ä¿åšä»»ä½•æ“作å‰å…ˆæ¿€æ´»çŽ¯å¢ƒ + ``` + +3. 安装 PyTorch 时,å¯ä»¥æ ¹æ®éœ€è¦å®‰è£…æ”¯æŒ CUDA æˆ–ä¸æ”¯æŒ CUDA 的版本 + + ```shell + # CUDA version + conda install pytorch torchvision cudatoolkit=10.2 -c pytorch + # CPU version + conda install pytorch torchvision cpuonly -c pytorch + ``` + +4. 准备 MMCV æºä»£ç  + + ```shell + git clone https://github.com/open-mmlab/mmcv.git + cd mmcv + ``` + +5. 安装所需 Python ä¾èµ–包 + + ```shell + pip3 install -r requirements.txt + ``` + +#### 编译与安装 MMCV + +MMCV 有三ç§å®‰è£…的模å¼ï¼š + +1. Lite 版本(ä¸åŒ…å«ç®—å­ï¼‰ + + è¿™ç§æ–¹å¼ä¸‹ï¼Œæ²¡æœ‰ç®—å­è¢«ç¼–è¯‘ï¼Œè¿™ç§æ¨¡å¼çš„ mmcv 是原生的 python 包 + +2. Full 版本(åªåŒ…å« CPU ç®—å­ï¼‰ + + 编译 CPU ç®—å­ï¼Œä½†åªæœ‰ x86 将会被编译,并且编译版本åªèƒ½åœ¨ CPU only 情况下è¿è¡Œ + +3. 
Full ç‰ˆæœ¬ï¼ˆæ—¢åŒ…å« CPU ç®—å­ï¼ŒåˆåŒ…å« CUDA ç®—å­ï¼‰ + + åŒæ—¶ç¼–译 CPU å’Œ CUDA ç®—å­ï¼Œ`ops` 模å—çš„ x86 与 CUDA 的代ç éƒ½å¯ä»¥è¢«ç¼–è¯‘ã€‚åŒæ—¶ç¼–译的版本å¯ä»¥åœ¨ CUDA 上调用 GPU + +##### 通用步骤 + +1. 设置 MSVC 编译器 + + 设置环境å˜é‡ã€‚添加 `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` 到 `PATH`,则 `cl.exe` å¯ä»¥åœ¨å‘½ä»¤è¡Œä¸­è¿è¡Œï¼Œå¦‚下所示。 + + ```none + (base) PS C:\Users\xxx> cl + Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 + Copyright (C) Microsoft Corporation. All rights reserved. + + usage: cl [ option... ] filename... [ / link linkoption... ] + ``` + + 为了兼容性,我们使用 x86-hosted ä»¥åŠ x64-targeted 版本,å³è·¯å¾„中的 `Hostx86\x64` 。 + + 因为 PyTorch å°†è§£æž `cl.exe` çš„è¾“å‡ºä»¥æ£€æŸ¥å…¶ç‰ˆæœ¬ï¼Œåªæœ‰ utf-8 将会被识别,你å¯èƒ½éœ€è¦å°†ç³»ç»Ÿè¯­è¨€æ›´æ”¹ä¸ºè‹±è¯­ã€‚æŽ§åˆ¶é¢æ¿ -> 地区-> 管ç†-> éž Unicode æ¥è¿›è¡Œè¯­è¨€è½¬æ¢ã€‚ + +##### 安装方å¼ä¸€ï¼šLite version(ä¸åŒ…å«ç®—å­ï¼‰ + +在完æˆä¸Šè¿°çš„公共步骤åŽï¼Œä»Žèœå•打开 Anaconda 命令框,输入以下命令 + +```shell +# 激活环境 +conda activate mmcv +# 切æ¢åˆ° mmcv 根目录 +cd mmcv +# 安装 +python setup.py develop +# 检查是å¦å®‰è£…æˆåŠŸ +pip list +``` + +##### 安装方å¼äºŒï¼šFull version(åªç¼–译 CPU ç®—å­ï¼‰ + +1. 完æˆä¸Šè¿°çš„公共步骤 + +2. 设置环境å˜é‡ + + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # æ ¹æ®ä½ å¯ç”¨CPU以åŠå†…å­˜é‡è¿›è¡Œè®¾ç½® + ``` + +3. 编译安装 + + ```shell + conda activate mmcv # 激活环境 + cd mmcv # 改å˜è·¯å¾„ + python setup.py build_ext # 如果æˆåŠŸ, cl 将被å¯åŠ¨ç”¨äºŽç¼–è¯‘ç®—å­ + python setup.py develop # 安装 + pip list # 检查是å¦å®‰è£…æˆåŠŸ + ``` + +##### 安装方å¼ä¸‰ï¼šFull version(既编译 CPU ç®—å­åˆç¼–译 CUDA ç®—å­ï¼‰ + +1. 完æˆä¸Šè¿°çš„公共步骤 + +2. 设置环境å˜é‡ + + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # æ ¹æ®ä½ å¯ç”¨CPU以åŠå†…å­˜é‡è¿›è¡Œè®¾ç½® + ``` + +3. 
检查 `CUDA_PATH` 或者 `CUDA_HOME` 环境å˜é‡å·²ç»å­˜åœ¨åœ¨ `envs` 之中 + + ```none + (base) PS C:\Users\WRH> ls env: + + Name Value + ---- ----- + CUDA_PATH C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 + CUDA_PATH_V10_1 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 + CUDA_PATH_V10_2 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 + ``` + + 如果没有,你å¯ä»¥æŒ‰ç…§ä¸‹é¢çš„æ­¥éª¤è®¾ç½® + + ```shell + $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" + # 或者 + $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # CUDA_PATH_V10_2 å·²ç»åœ¨çŽ¯å¢ƒå˜é‡ä¸­ + ``` + +4. 设置 CUDA 的目标架构 + + ```shell + $env:TORCH_CUDA_ARCH_LIST="6.1" # æ”¯æŒ GTX 1080 + # 或者用所有支æŒçš„版本,但å¯èƒ½ä¼šå˜å¾—很慢 + $env:TORCH_CUDA_ARCH_LIST="3.5 3.7 5.0 5.2 6.0 6.1 7.0 7.5" + ``` + +```{note} +我们å¯ä»¥åœ¨ [here](https://developer.nvidia.com/cuda-gpus) 查看 GPU 的计算能力 +``` + +5. 编译安装 + + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # æ ¹æ®ä½ å¯ç”¨CPU以åŠå†…å­˜é‡è¿›è¡Œè®¾ç½® + conda activate mmcv # 激活环境 + cd mmcv # 改å˜è·¯å¾„ + python setup.py build_ext # 如果æˆåŠŸ, cl 将被å¯åŠ¨ç”¨äºŽç¼–è¯‘ç®—å­ + python setup.py develop # 安装 + pip list # 检查是å¦å®‰è£…æˆåŠŸ + ``` + +```{note} +如果你的 PyTorch 版本是 1.6.0,你å¯èƒ½ä¼šé‡åˆ°ä¸€äº›è¿™ä¸ª [issue](https://github.com/pytorch/pytorch/issues/42467) æåˆ°çš„错误,则å¯ä»¥å‚考这个 [pull request](https://github.com/pytorch/pytorch/pull/43380/files) 修改 本地环境的 PyTorch æºä»£ç  +``` + +如果编译安装 mmcv 的过程中é‡åˆ°äº†é—®é¢˜ï¼Œä½ ä¹Ÿè®¸å¯ä»¥åœ¨ [Frequently Asked Question](../faq.html) 找到解决方法 diff --git a/docs_zh_CN/get_started/installation.md b/docs_zh_CN/get_started/installation.md new file mode 100644 index 0000000..20e8cd5 --- /dev/null +++ b/docs_zh_CN/get_started/installation.md @@ -0,0 +1,158 @@ +## 安装 MMCV + +MMCV 有两个版本: + +- **mmcv-full**: å®Œæ•´ç‰ˆï¼ŒåŒ…å«æ‰€æœ‰çš„特性以åŠä¸°å¯Œçš„开箱å³ç”¨çš„ CUDA ç®—å­ã€‚注æ„完整版本å¯èƒ½éœ€è¦æ›´é•¿æ—¶é—´æ¥ç¼–译。 +- **mmcv**: 精简版,ä¸åŒ…å« CUDA ç®—å­ä½†åŒ…å«å…¶ä½™æ‰€æœ‰ç‰¹æ€§å’ŒåŠŸèƒ½ï¼Œç±»ä¼¼ MMCV 1.0 之å‰çš„版本。如果你ä¸éœ€è¦ä½¿ç”¨ CUDA 
ç®—å­çš„è¯ï¼Œç²¾ç®€ç‰ˆå¯ä»¥ä½œä¸ºä¸€ä¸ªè€ƒè™‘选项。 + +```{warning} +请ä¸è¦åœ¨åŒä¸€ä¸ªçŽ¯å¢ƒä¸­å®‰è£…ä¸¤ä¸ªç‰ˆæœ¬ï¼Œå¦åˆ™å¯èƒ½ä¼šé‡åˆ°ç±»ä¼¼ `ModuleNotFound` 的错误。在安装一个版本之å‰ï¼Œéœ€è¦å…ˆå¸è½½å¦ä¸€ä¸ªã€‚`如果CUDAå¯ç”¨ï¼Œå¼ºçƒˆæŽ¨è安装mmcv-full`。 +``` + +a. 安装完整版 + +在安装 mmcv-full 之å‰ï¼Œè¯·ç¡®ä¿ PyTorch å·²ç»æˆåŠŸå®‰è£…åœ¨çŽ¯å¢ƒä¸­ï¼Œå¯ä»¥å‚考 PyTorch 官方[文档](https://pytorch.org/)。 + +我们æä¾›äº†ä¸åŒ PyTorch å’Œ CUDA 版本的 mmcv-full 预编译包,å¯ä»¥å¤§å¤§ç®€åŒ–用户安装编译过程。强烈推è通过预编译包æ¥å®‰è£…。å¦å¤–,安装完æˆåŽå¯ä»¥è¿è¡Œ [check_installation.py](https://github.com/open-mmlab/mmcv/.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是å¦å®‰è£…æˆåŠŸã€‚ + +i. 安装最新版本 + +如下是安装最新版 ``mmcv-full`` 的命令 + +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html +``` + +请将链接中的 ``{cu_version}`` å’Œ ``{torch_version}`` æ ¹æ®è‡ªèº«éœ€æ±‚æ›¿æ¢æˆå®žé™…的版本å·ï¼Œä¾‹å¦‚想安装和 ``CUDA 11.1``ã€``PyTorch 1.9.0`` 兼容的最新版 ``mmcv-full``,使用如下替æ¢è¿‡çš„命令 + +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html +``` + +```{note} +PyTorch 在 1.x.0 å’Œ 1.x.1 之间通常是兼容的,故 mmcv-full åªæä¾› 1.x.0 的编译包。如果你 +çš„ PyTorch 版本是 1.x.1,你å¯ä»¥æ”¾å¿ƒåœ°å®‰è£…在 1.x.0 版本编译的 mmcv-full。例如,如果你的 +PyTorch 版本是 1.8.1ã€CUDA 版本是 11.1,你å¯ä»¥ä½¿ç”¨ä»¥ä¸‹å‘½ä»¤å®‰è£… mmcv-full。 + +`pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html` +``` + +å¦‚æžœæƒ³çŸ¥é“æ›´å¤š CUDA å’Œ PyTorch 版本的命令,å¯ä»¥å‚考下é¢çš„表格,将链接中的 ``=={mmcv_version}`` 删去å³å¯ã€‚ + +ii. 
安装特定的版本 + +如下是安装特定版本 ``mmcv-full`` 的命令 + +```shell +pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html +``` + +首先请å‚考版本å‘å¸ƒä¿¡æ¯æ‰¾åˆ°æƒ³è¦å®‰è£…的版本å·ï¼Œå°† ``{mmcv_version}`` æ›¿æ¢æˆè¯¥ç‰ˆæœ¬å·ï¼Œä¾‹å¦‚ ``1.3.9``。 +ç„¶åŽå°†é“¾æŽ¥ä¸­çš„ ``{cu_version}`` å’Œ ``{torch_version}`` æ ¹æ®è‡ªèº«éœ€æ±‚æ›¿æ¢æˆå®žé™…的版本å·ï¼Œä¾‹å¦‚想安装和 ``CUDA 11.1``ã€``PyTorch 1.9.0`` 兼容的 ``mmcv-full`` 1.3.9 版本,使用如下替æ¢è¿‡çš„命令 + +```shell +pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html +``` + +对于更多的 PyTorch å’Œ CUDA 版本组åˆï¼Œè¯·å‚考下表: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUDA torch 1.10torch 1.9torch 1.8torch 1.7torch 1.6torch 1.5
11.3
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
11.1
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
11.0
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.7.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.6.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.5.0/index.html
10.1
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.5.0/index.html
9.2
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.6.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.5.0/index.html
+ +```{note} +以上æä¾›çš„预编译包并ä¸å›Šæ‹¬æ‰€æœ‰çš„ mmcv-full 版本,我们å¯ä»¥ç‚¹å‡»å¯¹åº”链接查看支æŒçš„版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),å¯ä»¥çœ‹åˆ° `cu102-torch1.8.0` åªæä¾›äº† 1.3.0 åŠä»¥ä¸Šçš„ mmcv-full 版本。å¦å¤–,从 `mmcv v1.3.17` 开始,我们ä¸å†æä¾›`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你å¯ä»¥åœ¨ [è¿™](./previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编译包。虽然我们ä¸å†æä¾› `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们ä¾ç„¶åœ¨ CI 中ä¿è¯å¯¹å®ƒä»¬çš„兼容æŒç»­åˆ°ä¸‹ä¸€å¹´ã€‚ +``` + +除了使用预编译包之外,å¦ä¸€ç§æ–¹å¼æ˜¯åœ¨æœ¬åœ°è¿›è¡Œç¼–译,直接è¿è¡Œä¸‹è¿°å‘½ä»¤ + +```python +pip install mmcv-full +``` + +ä½†æ³¨æ„æœ¬åœ°ç¼–译å¯èƒ½ä¼šè€—æ—¶ 10 分钟以上。 + +b. 安装精简版 + +```python +pip install mmcv +``` + +c. 安装完整版并且编译 onnxruntime çš„è‡ªå®šä¹‰ç®—å­ + +- 详细的指å—请查看 [这里](https://mmcv.readthedocs.io/zh_CN/latest/deployment/onnxruntime_custom_ops.html)。 + +如果想从æºç ç¼–译 MMCV,请å‚考[该文档](https://mmcv.readthedocs.io/zh_CN/latest/get_started/build.html)。 diff --git a/docs_zh_CN/get_started/introduction.md b/docs_zh_CN/get_started/introduction.md new file mode 100644 index 0000000..0082ae8 --- /dev/null +++ b/docs_zh_CN/get_started/introduction.md @@ -0,0 +1,30 @@ +## ä»‹ç» MMCV + +MMCV 是一个é¢å‘计算机视觉的基础库,它支æŒäº†å¾ˆå¤šå¼€æºé¡¹ç›®ï¼Œä¾‹å¦‚: + +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图åƒåˆ†ç±»å·¥å…·ç®± +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D ç›®æ ‡æ£€æµ‹å¹³å° +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频ç†è§£å·¥å…·ç®± +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab ä¸€ä½“åŒ–è§†é¢‘ç›®æ ‡æ„ŸçŸ¥å¹³å° +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab å§¿æ€ä¼°è®¡å·¥å…·ç®± +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图åƒè§†é¢‘编辑工具箱 +- 
[MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab å…¨æµç¨‹æ–‡å­—检测识别ç†è§£å·¥å…·åŒ… +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab å›¾ç‰‡è§†é¢‘ç”Ÿæˆæ¨¡åž‹å·¥å…·ç®± + +MMCV æä¾›äº†å¦‚下众多功能: + +- 通用的 IO æŽ¥å£ +- 图åƒå’Œè§†é¢‘å¤„ç† +- 图åƒå’Œæ ‡æ³¨ç»“æžœå¯è§†åŒ– +- 常用å°å·¥å…·ï¼ˆè¿›åº¦æ¡ï¼Œè®¡æ—¶å™¨ç­‰ï¼‰ +- 基于 PyTorch 的通用训练框架 +- å¤šç§ CNN 网络结构 +- 高质é‡å®žçŽ°çš„å¸¸è§ CUDA ç®—å­ + +如想了解更多特性和使用,请å‚考[文档](https://mmcv.readthedocs.io/zh_CN/latest)。 + +```{note} +MMCV éœ€è¦ Python 3.6 以上版本。 +``` diff --git a/docs/zh_cn/get_started/previous_versions.md b/docs_zh_CN/get_started/previous_versions.md similarity index 93% rename from docs/zh_cn/get_started/previous_versions.md rename to docs_zh_CN/get_started/previous_versions.md index d543818..56679d4 100644 --- a/docs/zh_cn/get_started/previous_versions.md +++ b/docs_zh_CN/get_started/previous_versions.md @@ -1,10 +1,11 @@ + ## 其他版本的 PyTorch 我们ä¸å†æä¾›åœ¨è¾ƒä½Žçš„ `PyTorch` 版本下编译的 `mmcv-full` 包,但为了您的方便,您å¯ä»¥åœ¨ä¸‹é¢æ‰¾åˆ°å®ƒä»¬ã€‚ ### PyTorch 1.4 -| 1.0.0 \<= mmcv_version \<= 1.2.1 +| 1.0.0 <= mmcv_version <= 1.2.1 #### CUDA 10.1 @@ -26,7 +27,7 @@ pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dis ### PyTorch v1.3 -| 1.0.0 \<= mmcv_version \<= 1.3.16 +| 1.0.0 <= mmcv_version <= 1.3.16 #### CUDA 10.1 diff --git a/docs/zh_cn/index.rst b/docs_zh_CN/index.rst similarity index 65% rename from docs/zh_cn/index.rst rename to docs_zh_CN/index.rst index 98cf088..b6d00a5 100644 --- a/docs/zh_cn/index.rst +++ b/docs_zh_CN/index.rst @@ -10,22 +10,30 @@ get_started/introduction.md get_started/installation.md get_started/build.md - get_started/article.md .. toctree:: :maxdepth: 2 :caption: 深入ç†è§£ MMCV + understand_mmcv/config.md + understand_mmcv/registry.md + understand_mmcv/runner.md + understand_mmcv/io.md understand_mmcv/data_process.md - understand_mmcv/data_transform.md understand_mmcv/visualization.md understand_mmcv/cnn.md understand_mmcv/ops.md + understand_mmcv/utils.md .. 
toctree:: - :caption: è¯­è¨€åˆ‡æ¢ + :maxdepth: 2 + :caption: 部署 - switch_language.md + deployment/onnx.md + deployment/onnxruntime_op.md + deployment/onnxruntime_custom_ops.md + deployment/tensorrt_plugin.md + deployment/tensorrt_custom_ops.md .. toctree:: :maxdepth: 2 @@ -34,6 +42,8 @@ compatibility.md .. toctree:: + :maxdepth: 2 + :caption: 常è§é—®é¢˜ faq.md @@ -43,20 +53,12 @@ community/contributing.md community/pr.md - community/code_style.md .. toctree:: - :maxdepth: 1 + :maxdepth: 2 :caption: API 文档 - mmcv.image - mmcv.video - mmcv.visualization - mmcv.cnn - mmcv.ops - mmcv.transforms - mmcv.arraymisc - mmcv.utils + api.rst Indices and tables diff --git a/docs/zh_cn/make.bat b/docs_zh_CN/make.bat similarity index 100% rename from docs/zh_cn/make.bat rename to docs_zh_CN/make.bat diff --git a/docs/zh_cn/mmcv-logo.png b/docs_zh_CN/mmcv-logo.png similarity index 100% rename from docs/zh_cn/mmcv-logo.png rename to docs_zh_CN/mmcv-logo.png diff --git a/docs_zh_CN/understand_mmcv/cnn.md b/docs_zh_CN/understand_mmcv/cnn.md new file mode 100644 index 0000000..9027cf3 --- /dev/null +++ b/docs_zh_CN/understand_mmcv/cnn.md @@ -0,0 +1,525 @@ +## å·ç§¯ç¥žç»ç½‘络 + +我们为å·ç§¯ç¥žç»ç½‘络æä¾›äº†ä¸€äº›æž„建模å—ï¼ŒåŒ…æ‹¬å±‚æž„å»ºã€æ¨¡å—组件和æƒé‡åˆå§‹åŒ–。 + +### 网络层的构建 + +在è¿è¡Œå®žéªŒæ—¶ï¼Œæˆ‘们å¯èƒ½éœ€è¦å°è¯•åŒå±žä¸€ç§ç±»åž‹ä½†ä¸åŒé…置的层,但åˆä¸å¸Œæœ›æ¯æ¬¡éƒ½ä¿®æ”¹ä»£ç ã€‚于是我们æä¾›ä¸€äº›å±‚构建方法,å¯ä»¥ä»Žå­—典构建层,字典å¯ä»¥åœ¨é…置文件中é…置,也å¯ä»¥é€šè¿‡å‘½ä»¤è¡Œå‚数指定。 + +#### 用法 + +一个简å•的例å­ï¼š + +```python +cfg = dict(type='Conv3d') +layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3) +``` + +- `build_conv_layer`: 支æŒçš„类型包括 Conv1dã€Conv2dã€Conv3dã€Conv (Conv是Conv2d的别å) +- `build_norm_layer`: 支æŒçš„类型包括 BN1dã€BN2dã€BN3dã€BN (alias for BN2d)ã€SyncBNã€GNã€LNã€IN1dã€IN2dã€IN3dã€IN(IN是IN2d的别å) +- `build_activation_layer`:支æŒçš„类型包括 ReLUã€LeakyReLUã€PReLUã€RReLUã€ReLU6ã€ELUã€Sigmoidã€Tanhã€GELU +- `build_upsample_layer`: 支æŒçš„类型包括 nearestã€bilinearã€deconvã€pixel_shuffle +- 
`build_padding_layer`: 支æŒçš„类型包括 zeroã€reflectã€replicate + +#### 拓展 + +我们还å…è®¸è‡ªå®šä¹‰å±‚å’Œç®—å­æ¥æ‰©å±•构建方法。 + +1. 编写和注册自己的模å—: + + ```python + from mmcv.cnn import UPSAMPLE_LAYERS + + @UPSAMPLE_LAYERS.register_module() + class MyUpsample: + + def __init__(self, scale_factor): + pass + + def forward(self, x): + pass + ``` + +2. 在æŸå¤„导入 `MyUpsample` (例如 `__init__.py` )然åŽä½¿ç”¨å®ƒï¼š + + ```python + cfg = dict(type='MyUpsample', scale_factor=2) + layer = build_upsample_layer(cfg) + ``` + +### 模å—组件 + +我们还æä¾›äº†å¸¸ç”¨çš„æ¨¡å—组件,以方便网络构建。 +å·ç§¯ç»„ä»¶ `ConvModule` ç”± convolutionã€normalization以åŠactivation layers 组æˆï¼Œæ›´å¤šç»†èŠ‚è¯·å‚考 [ConvModule api](api.html#mmcv.cnn.ConvModule)。 + +```python +# conv + bn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN')) +# conv + gn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2)) +# conv + relu +conv = ConvModule(3, 8, 2) +# conv +conv = ConvModule(3, 8, 2, act_cfg=None) +# conv + leaky relu +conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU')) +# bn + conv + relu +conv = ConvModule( + 3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act')) +``` + +### Weight initialization + +> 实现细节å¯ä»¥åœ¨ [mmcv/cnn/utils/weight_init.py](../../mmcv/cnn/utils/weight_init.py)中找到 + +在训练过程中,适当的åˆå§‹åŒ–策略有利于加快训练速度或者获得更高的性能。 在MMCV中,我们æä¾›äº†ä¸€äº›å¸¸ç”¨çš„æ–¹æ³•æ¥åˆå§‹åŒ–模å—,比如 `nn.Conv2d` 模å—。当然,我们也æä¾›äº†ä¸€äº›é«˜çº§API,å¯ç”¨äºŽåˆå§‹åŒ–包å«ä¸€ä¸ªæˆ–多个模å—的模型。 + +#### Initialization functions + +以函数的方å¼åˆå§‹åŒ– `nn.Module` ,例如 `nn.Conv2d` 〠`nn.Linear` 等。 + +我们æä¾›ä»¥ä¸‹åˆå§‹åŒ–方法, + +- constant_init + + 使用给定常é‡å€¼åˆå§‹åŒ–æ¨¡åž‹å‚æ•° + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import constant_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # constant_init(module, val, bias=0) + >>> constant_init(conv1, 1, 0) + >>> conv1.weight + ``` + +- xavier_init + + 按照 [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. 
(2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) æè¿°çš„æ–¹æ³•åˆå§‹åŒ–æ¨¡åž‹å‚æ•° + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # xavier_init(module, gain=1, bias=0, distribution='normal') + >>> xavier_init(conv1, distribution='normal') + ``` + +- normal_init + + 使用正æ€åˆ†å¸ƒï¼ˆé«˜æ–¯åˆ†å¸ƒï¼‰åˆå§‹åŒ–æ¨¡åž‹å‚æ•° + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import normal_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # normal_init(module, mean=0, std=1, bias=0) + >>> normal_init(conv1, std=0.01, bias=0) + ``` + +- uniform_init + + 使用å‡åŒ€åˆ†å¸ƒåˆå§‹åŒ–æ¨¡åž‹å‚æ•° + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import uniform_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # uniform_init(module, a=0, b=1, bias=0) + >>> uniform_init(conv1, a=0, b=1) + ``` + +- kaiming_init + + 按照 [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification - He, K. et al. 
(2015)](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf) æè¿°çš„æ–¹æ³•æ¥åˆå§‹åŒ–æ¨¡åž‹å‚æ•°ã€‚ + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import kaiming_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal') + >>> kaiming_init(conv1) + ``` + +- caffe2_xavier_init + + caffe2中实现的 `xavier initialization`,对应于 PyTorch中的 `kaiming_uniform_` + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import caffe2_xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # caffe2_xavier_init(module, bias=0) + >>> caffe2_xavier_init(conv1) + ``` + +- bias_init_with_prob + + æ ¹æ®ç»™å®šçš„æ¦‚率åˆå§‹åŒ– `conv/fc`, 这在 [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002.pdf) æå‡ºã€‚ + + ```python + >>> from mmcv.cnn import bias_init_with_prob + >>> # bias_init_with_prob is proposed in Focal Loss + >>> bias = bias_init_with_prob(0.01) + >>> bias + -4.59511985013459 + ``` + +#### Initializers and configs + +在åˆå§‹åŒ–方法的基础上,我们定义了相应的åˆå§‹åŒ–类,并将它们注册到 `INITIALIZERS` 中,这样我们就å¯ä»¥ä½¿ç”¨ `config` é…ç½®æ¥åˆå§‹åŒ–模型了。 + +我们æä¾›ä»¥ä¸‹åˆå§‹åŒ–类: + +- ConstantInit +- XavierInit +- NormalInit +- UniformInit +- KaimingInit +- Caffe2XavierInit +- PretrainedInit + +接下æ¥è¯¦ç»†ä»‹ç» `initialize` 的使用方法 + +1. 
通过关键字 `layer` æ¥åˆå§‹åŒ–模型 + + 如果我们åªå®šä¹‰äº†å…³é”®å­— `layer` ,那么åªåˆå§‹åŒ– `layer` 中包å«çš„层。 + + 注æ„: 关键字 `layer` 支æŒçš„æ¨¡å—是带有 weights å’Œ bias 属性的 PyTorch 模å—ï¼Œæ‰€ä»¥ä¸æ”¯æŒ `MultiheadAttention layer` + +- 定义关键字 `layer` 列表并使用相åŒé…ç½®åˆå§‹åŒ–æ¨¡å— + + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1, 2) + + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) + # 使用相åŒçš„é…ç½®åˆå§‹åŒ–æ•´ä¸ªæ¨¡å— + initialize(model, init_cfg) + # model.feat.weight + # Parameter containing: + # tensor([[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]], requires_grad=True) + ``` + +- 定义关键字 `layer` 用于åˆå§‹åŒ–ä¸åŒé…置的层 + + ```python + import torch.nn as nn + from mmcv.cnn.utils import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1,2) + + model = FooNet() + init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Conv2d', val=2), + dict(type='Constant', layer='Linear', val=3)] + # nn.Conv1d 使用 dict(type='Constant', val=1) åˆå§‹åŒ– + # nn.Conv2d 使用 dict(type='Constant', val=2) åˆå§‹åŒ– + # nn.Linear 使用 dict(type='Constant', val=3) åˆå§‹åŒ– + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + ``` + +2. 
定义关键字`override`åˆå§‹åŒ–模型 + +- 当用属性ååˆå§‹åŒ–æŸä¸ªç‰¹å®šéƒ¨åˆ†æ—¶, 我们å¯ä»¥ä½¿ç”¨å…³é”®å­— `override`, 关键字 `override` 对应的Value会替代init_cfg中相应的值 + + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + + # 如果我们想将模型的æƒé‡åˆå§‹åŒ–为 1,将åå·®åˆå§‹åŒ–为 2 + # 但希望 `cls` 中的æƒé‡ä¸º 3,å差为 4,则我们å¯ä»¥ä½¿ç”¨å…³é”®å­—override + + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', name='reg', val=3, bias=4)) + # 使用 dict(type='Constant', val=1, bias=2)æ¥åˆå§‹åŒ– self.feat and self.cls + # 使用dict(type='Constant', val=3, bias=4)æ¥åˆå§‹åŒ–‘reg’模å—。 + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]], + # ..., + # [[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]]]], requires_grad=True) + ``` + +- 如果 init_cfg 中的关键字`layer`为None,则åªåˆå§‹åŒ–在关键字overrideä¸­çš„å­æ¨¡å—,并且çœç•¥override中的 type å’Œå…¶ä»–å‚æ•° + + ```python + model = FooNet() + init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) + # self.feat å’Œ self.cls 使用pyTorch默认的åˆå§‹åŒ– + # 将使用 dict(type='Constant', val=1, bias=2) åˆå§‹åŒ–å为 'reg' çš„æ¨¡å— + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]], + # ..., + # [[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]]], requires_grad=True) + ``` + +- 如果我们没有定义关键字`layer`或`override` , å°†ä¸ä¼šåˆå§‹åŒ–任何东西 + +- 关键字`override`的无效用法 + + ```python + # 没有é‡å†™ä»»ä½•å­æ¨¡å— + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(type='Constant', val=3, bias=4)) + + # 没有指定type,å³ä¾¿æœ‰å…¶ä»–傿•°ï¼Œä¹Ÿæ˜¯æ— æ•ˆçš„。 + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + 
val=1, bias=2, + override=dict(name='reg', val=3, bias=4)) + ``` + +3. 用预训练模型åˆå§‹åŒ– + + ```python + import torch.nn as nn + import torchvision.models as models + from mmcv.cnn import initialize + + # 使用预训练模型æ¥åˆå§‹åŒ– + model = models.resnet50() + # model.conv1.weight + # Parameter containing: + # tensor([[[[-6.7435e-03, -2.3531e-02, -9.0143e-03, ..., -2.1245e-03, + # -1.8077e-03, 3.0338e-03], + # [-1.2603e-02, -2.7831e-02, 2.3187e-02, ..., -1.5793e-02, + # 1.1655e-02, 4.5889e-03], + # [-3.7916e-02, 1.2014e-02, 1.3815e-02, ..., -4.2651e-03, + # 1.7314e-02, -9.9998e-03], + # ..., + + init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + initialize(model, init_cfg) + # model.conv1.weight + # Parameter containing: + # tensor([[[[ 1.3335e-02, 1.4664e-02, -1.5351e-02, ..., -4.0896e-02, + # -4.3034e-02, -7.0755e-02], + # [ 4.1205e-03, 5.8477e-03, 1.4948e-02, ..., 2.2060e-03, + # -2.0912e-02, -3.8517e-02], + # [ 2.2331e-02, 2.3595e-02, 1.6120e-02, ..., 1.0281e-01, + # 6.2641e-02, 5.1977e-02], + # ..., + + # 使用关键字'prefix'用预训练模型的特定部分æ¥åˆå§‹åŒ–å­æ¨¡å—æƒé‡ + model = models.resnet50() + url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + 'retinanet_r50_fpn_1x_coco/'\ + 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + initialize(model, init_cfg) + ``` + +4. 
åˆå§‹åŒ–继承自BaseModuleã€Sequentialã€ModuleList的模型 + + `BaseModule` 继承自 `torch.nn.Module`, 它们之间唯一的ä¸åŒæ˜¯ `BaseModule` 实现了 `init_weight` + + `Sequential` 继承自 `BaseModule` å’Œ `torch.nn.Sequential` + + `ModuleList` 继承自 `BaseModule` å’Œ `torch.nn.ModuleList` + + `````python + import torch.nn as nn + from mmcv.runner import BaseModule, Sequential, ModuleList + + class FooConv1d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv1d = nn.Conv1d(4, 1, 4) + + def forward(self, x): + return self.conv1d(x) + + class FooConv2d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv2d = nn.Conv2d(3, 1, 3) + + def forward(self, x): + return self.conv2d(x) + + # BaseModule + init_cfg = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + model = FooConv1d(init_cfg) + model.init_weights() + # model.conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + + # Sequential + init_cfg1 = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + init_cfg2 = dict(type='Constant', layer='Conv2d', val=2., bias=3.) + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + seq_model = Sequential(model1, model2) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) 
+ seq_model = Sequential(model1, model2, init_cfg=init_cfg) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # ModuleList + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + modellist = ModuleList([model1, model2]) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) 
+ modellist = ModuleList([model1, model2], init_cfg=init_cfg) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + ````` + +### Model Zoo + +除了`torchvision`的预训练模型,我们还æä¾›ä»¥ä¸‹ CNN 的预训练模型: + +- VGG Caffe +- ResNet Caffe +- ResNeXt +- ResNet with Group Normalization +- ResNet with Group Normalization and Weight Standardization +- HRNetV2 +- Res2Net +- RegNet + +#### Model URLs in JSON + +MMCV中的Model Zoo Link ç”± JSON 文件管ç†ã€‚ json 文件由模型åç§°åŠå…¶url或path的键值对组æˆ,一个json文件å¯èƒ½ç±»ä¼¼äºŽ: + +```json +{ + "model_a": "https://example.com/models/model_a_9e5bac.pth", + "model_b": "pretrain/model_b_ab3ef2c.pth" +} +``` + +å¯ä»¥åœ¨[此处](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json)找到托管在 OpenMMLab AWS 上的预训练模型的默认链接。 + +ä½ å¯ä»¥é€šè¿‡å°† `open-mmlab.json` 放在 `MMCV_HOME`下æ¥è¦†ç›–默认链接,如果在环境中找ä¸åˆ°`MMCV_HOME`,则默认使用 `~/.cache/mmcv`。当然你也å¯ä»¥ä½¿ç”¨å‘½ä»¤ `export MMCV_HOME=/your/path`æ¥è®¾ç½®è‡ªå·±çš„路径。 + +外部的json文件将被åˆå¹¶ä¸ºé»˜è®¤æ–‡ä»¶ï¼Œå¦‚果相åŒçš„键出现在外部`json`和默认`json`中,则将使用外部`json`。 + +#### Load Checkpoint + +`mmcv.load_checkpoint()`çš„å‚æ•°`filename`支æŒä»¥ä¸‹ç±»åž‹ï¼š + +- filepath: `checkpoint`路径 +- `http://xxx` and `https://xxx`: 下载checkpoint的链接,文件å中必需包å«`SHA256`åŽç¼€ +- `torchvision://xxx`: `torchvision.models`中的模型链接,更多细节å‚考 [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) +- `open-mmlab://xxx`: 默认和其他 json 文件中æä¾›çš„æ¨¡åž‹é“¾æŽ¥æˆ–文件路径 diff --git a/docs_zh_CN/understand_mmcv/config.md b/docs_zh_CN/understand_mmcv/config.md new file mode 100644 index 0000000..c6da308 --- /dev/null +++ b/docs_zh_CN/understand_mmcv/config.md @@ -0,0 +1,176 @@ +## é…ç½® + +`Config` 
类用于æ“作é…置文件,它支æŒä»Žå¤šç§æ–‡ä»¶æ ¼å¼ä¸­åŠ è½½é…置,包括 **python**, **json** å’Œ **yaml**。 +它æä¾›äº†ç±»ä¼¼å­—å…¸å¯¹è±¡çš„æŽ¥å£æ¥èŽ·å–和设置值。 + +以é…置文件 `test.py` 为例 + +```python +a = 1 +b = dict(b1=[0, 1, 2], b2=None) +c = (1, 2) +d = 'string' +``` + +加载与使用é…置文件 + +```python +>>> cfg = Config.fromfile('test.py') +>>> print(cfg) +>>> dict(a=1, +... b=dict(b1=[0, 1, 2], b2=None), +... c=(1, 2), +... d='string') +``` + +对于所有格å¼çš„é…置文件,都支æŒä¸€äº›é¢„定义å˜é‡ã€‚它会将 `{{ var }}` 替æ¢ä¸ºå®žé™…值。 + +ç›®å‰æ”¯æŒä»¥ä¸‹å››ä¸ªé¢„定义å˜é‡ï¼š + +`{{ fileDirname }}` - 当剿‰“开文件的目录å,例如 /home/your-username/your-project/folder + +`{{ fileBasename }}` - 当剿‰“开文件的文件å,例如 file.ext + +`{{ fileBasenameNoExtension }}` - 当剿‰“开文件ä¸åŒ…嫿‰©å±•å的文件å,例如 file + +`{{ fileExtname }}` - 当剿‰“开文件的扩展å,例如 .ext + +这些å˜é‡å引用自 [VS Code](https://code.visualstudio.com/docs/editor/variables-reference)。 + +这里是一个带有预定义å˜é‡çš„é…置文件的例å­ã€‚ + +`config_a.py` +```python +a = 1 +b = './work_dir/{{ fileBasenameNoExtension }}' +c = '{{ fileExtname }}' +``` + +```python +>>> cfg = Config.fromfile('./config_a.py') +>>> print(cfg) +>>> dict(a=1, +... b='./work_dir/config_a', +... c='.py') +``` + +对于所有格å¼çš„é…置文件, 都支æŒç»§æ‰¿ã€‚为了é‡ç”¨å…¶ä»–é…置文件的字段, +éœ€è¦æŒ‡å®š `_base_='./config_a.py'` 或者一个包å«é…置文件的列表 `_base_=['./config_a.py', './config_b.py']`。 + +这里有 4 个é…置继承关系的例å­ã€‚ + +`config_a.py` 作为基类é…置文件 + +```python +a = 1 +b = dict(b1=[0, 1, 2], b2=None) +``` +### ä¸å«é‡å¤é”®å€¼å¯¹ä»ŽåŸºç±»é…置文件继承 + +`config_b.py` + +```python +_base_ = './config_a.py' +c = (1, 2) +d = 'string' +``` + +```python +>>> cfg = Config.fromfile('./config_b.py') +>>> print(cfg) +>>> dict(a=1, +... b=dict(b1=[0, 1, 2], b2=None), +... c=(1, 2), +... d='string') +``` +在`config_b.py`里的新字段与在`config_a.py`里的旧字段拼接 + +### å«é‡å¤é”®å€¼å¯¹ä»ŽåŸºç±»é…置文件继承 + +`config_c.py` + +```python +_base_ = './config_a.py' +b = dict(b2=1) +c = (1, 2) +``` + +```python +>>> cfg = Config.fromfile('./config_c.py') +>>> print(cfg) +>>> dict(a=1, +... b=dict(b1=[0, 1, 2], b2=1), +... 
c=(1, 2)) +``` + +在基类é…置文件:`config_a` 里的 `b.b2=None`被é…置文件:`config_c.py`里的 `b.b2=1`替代。 + +### 从具有忽略字段的é…置文件继承 + +`config_d.py` + +```python +_base_ = './config_a.py' +b = dict(_delete_=True, b2=None, b3=0.1) +c = (1, 2) +``` + +```python +>>> cfg = Config.fromfile('./config_d.py') +>>> print(cfg) +>>> dict(a=1, +... b=dict(b2=None, b3=0.1), +... c=(1, 2)) +``` + +您还å¯ä»¥è®¾ç½® `_delete_=True`忽略基类é…置文件中的æŸäº›å­—段。所有在`b`中的旧键 `b1, b2, b3` 将会被新键 `b2, b3` 所å–代。 + +### 从多个基类é…置文件继承(基类é…置文件ä¸åº”包å«ç›¸åŒçš„键) + +`config_e.py` + +```python +c = (1, 2) +d = 'string' +``` + +`config_f.py` + +```python +_base_ = ['./config_a.py', './config_e.py'] +``` + +```python +>>> cfg = Config.fromfile('./config_f.py') +>>> print(cfg) +>>> dict(a=1, +... b=dict(b1=[0, 1, 2], b2=None), +... c=(1, 2), +... d='string') +``` + +### 从基类引用å˜é‡ + +您å¯ä»¥ä½¿ç”¨ä»¥ä¸‹è¯­æ³•引用在基类中定义的å˜é‡ã€‚ + +`base.py` + +```python +item1 = 'a' +item2 = dict(item3 = 'b') +``` + +`config_g.py` + +```python +_base_ = ['./base.py'] +item = dict(a = {{ _base_.item1 }}, b = {{ _base_.item2.item3 }}) +``` + +```python +>>> cfg = Config.fromfile('./config_g.py') +>>> print(cfg.pretty_text) +item1 = 'a' +item2 = dict(item3='b') +item = dict(a='a', b='b') +``` diff --git a/docs/zh_cn/understand_mmcv/data_process.md b/docs_zh_CN/understand_mmcv/data_process.md similarity index 93% rename from docs/zh_cn/understand_mmcv/data_process.md rename to docs_zh_CN/understand_mmcv/data_process.md index 7e0afd1..0885fe0 100644 --- a/docs/zh_cn/understand_mmcv/data_process.md +++ b/docs_zh_CN/understand_mmcv/data_process.md @@ -130,7 +130,7 @@ bboxes = np.array([[10, 10, 100, 120], [0, 0, 50, 50]]) patches = mmcv.imcrop(img, bboxes) # è£å‰ªä¸¤ä¸ªåŒºåŸŸå¹¶ä¸”缩放区域1.2å€ -patches = mmcv.imcrop(img, bboxes, scale=1.2) +patches = mmcv.imcrop(img, bboxes, scale_ratio=1.2) ``` #### å¡«å…… @@ -144,13 +144,13 @@ img = mmcv.imread('tests/data/color.jpg') img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=0) # 用给定值分别填充图åƒçš„3个通é“至 (1000, 1200) 
-img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=(100, 50, 200)) +img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=[100, 50, 200]) # 用给定值填充图åƒçš„å·¦ã€å³ã€ä¸Šã€ä¸‹å››æ¡è¾¹ img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=0) # 用3个值分别填充图åƒçš„å·¦ã€å³ã€ä¸Šã€ä¸‹å››æ¡è¾¹çš„3ä¸ªé€šé“ -img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=(100, 50, 200)) +img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=[100, 50, 200]) # 将图åƒçš„å››æ¡è¾¹å¡«å……至能够被给定值整除 img_ = mmcv.impad_to_multiple(img, 32) @@ -252,24 +252,24 @@ flow = mmcv.flowread('compressed.jpg', quantize=True, concat_axis=1) mmcv.flowshow(flow) ``` -![progress](../../en/_static/flow_visualization.png) +![progress](../../docs/_static/flow_visualization.png) -1. æµå˜æ¢ +3. æµå˜æ¢ ```python img1 = mmcv.imread('img1.jpg') flow = mmcv.flowread('flow.flo') -warped_img2 = mmcv.flow_warp(img1, flow) +warpped_img2 = mmcv.flow_warp(img1, flow) ``` img1 (å·¦) and img2 (å³) -![raw images](../../en/_static/flow_raw_images.png) +![raw images](../../docs/_static/flow_raw_images.png) å…‰æµ (img2 -> img1) -![optical flow](../../en/_static/flow_img2toimg1.png) +![optical flow](../../docs/_static/flow_img2toimg1.png) å˜æ¢åŽçš„图åƒå’ŒçœŸå®žå›¾åƒçš„差异 -![warped image](../../en/_static/flow_warp_diff.png) +![warpped image](../../docs/_static/flow_warp_diff.png) diff --git a/docs_zh_CN/understand_mmcv/io.md b/docs_zh_CN/understand_mmcv/io.md new file mode 100644 index 0000000..0e5002f --- /dev/null +++ b/docs_zh_CN/understand_mmcv/io.md @@ -0,0 +1,240 @@ +## 文件输入输出 + +æ–‡ä»¶è¾“å…¥è¾“å‡ºæ¨¡å—æä¾›äº†ä¸¤ä¸ªé€šç”¨çš„ API 接å£ç”¨äºŽè¯»å–å’Œä¿å­˜ä¸åŒæ ¼å¼çš„æ–‡ä»¶ã€‚ + +```{note} +在 v1.3.16 åŠä¹‹åŽçš„版本中,IO æ¨¡å—æ”¯æŒä»Žä¸åŒåŽç«¯è¯»å–æ•°æ®å¹¶æ”¯æŒå°†æ•°æ®è‡³ä¸åŒåŽç«¯ã€‚更多细节请访问 PR [#1330](https://github.com/open-mmlab/mmcv/pull/1330)。 +``` + +### 读å–å’Œä¿å­˜æ•°æ® + +`mmcv` æä¾›äº†ä¸€ä¸ªé€šç”¨çš„ api 用于读å–å’Œä¿å­˜æ•°æ®ï¼Œç›®å‰æ”¯æŒçš„æ ¼å¼æœ‰ jsonã€yaml å’Œ pickle。 + +#### ä»Žç¡¬ç›˜è¯»å–æ•°æ®æˆ–者将数æ®ä¿å­˜è‡³ç¡¬ç›˜ + +```python 
+import mmcv + +# ä»Žæ–‡ä»¶ä¸­è¯»å–æ•°æ® +data = mmcv.load('test.json') +data = mmcv.load('test.yaml') +data = mmcv.load('test.pkl') +# ä»Žæ–‡ä»¶å¯¹è±¡ä¸­è¯»å–æ•°æ® +with open('test.json', 'r') as f: + data = mmcv.load(f, file_format='json') + +# 将数æ®åºåˆ—化为字符串 +json_str = mmcv.dump(data, file_format='json') + +# 将数æ®ä¿å­˜è‡³æ–‡ä»¶ (æ ¹æ®æ–‡ä»¶ååŽç¼€å推文件类型) +mmcv.dump(data, 'out.pkl') + +# 将数æ®ä¿å­˜è‡³æ–‡ä»¶å¯¹è±¡ +with open('test.yaml', 'w') as f: + data = mmcv.dump(data, f, file_format='yaml') +``` + +#### 从其他åŽç«¯åŠ è½½æˆ–è€…ä¿å­˜è‡³å…¶ä»–åŽç«¯ + +```python +import mmcv + +# 从 s3 æ–‡ä»¶è¯»å–æ•°æ® +data = mmcv.load('s3://bucket-name/test.json') +data = mmcv.load('s3://bucket-name/test.yaml') +data = mmcv.load('s3://bucket-name/test.pkl') + +# 将数æ®ä¿å­˜è‡³ s3 文件 (æ ¹æ®æ–‡ä»¶ååŽç¼€å推文件类型) +mmcv.dump(data, 's3://bucket-name/out.pkl') +``` + +我们æä¾›äº†æ˜“于拓展的方å¼ä»¥æ”¯æŒæ›´å¤šçš„æ–‡ä»¶æ ¼å¼ã€‚我们åªéœ€è¦åˆ›å»ºä¸€ä¸ªç»§æ‰¿è‡ª `BaseFileHandler` çš„ +æ–‡ä»¶å¥æŸ„类并将其注册到 `mmcv` 中å³å¯ã€‚奿Ÿ„类至少需è¦é‡å†™ä¸‰ä¸ªæ–¹æ³•。 + +```python +import mmcv + +# 支æŒä¸ºæ–‡ä»¶å¥æŸ„ç±»æ³¨å†Œå¤šä¸ªæ–‡ä»¶æ ¼å¼ +# @mmcv.register_handler(['txt', 'log']) +@mmcv.register_handler('txt') +class TxtHandler1(mmcv.BaseFileHandler): + + def load_from_fileobj(self, file): + return file.read() + + def dump_to_fileobj(self, obj, file): + file.write(str(obj)) + + def dump_to_str(self, obj, **kwargs): + return str(obj) +``` + +以 `PickleHandler` 为例 + +```python +import pickle + +class PickleHandler(mmcv.BaseFileHandler): + + def load_from_fileobj(self, file, **kwargs): + return pickle.load(file, **kwargs) + + def load_from_path(self, filepath, **kwargs): + return super(PickleHandler, self).load_from_path( + filepath, mode='rb', **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('protocol', 2) + return pickle.dumps(obj, **kwargs) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('protocol', 2) + pickle.dump(obj, file, **kwargs) + + def dump_to_path(self, obj, filepath, 
**kwargs): + super(PickleHandler, self).dump_to_path( + obj, filepath, mode='wb', **kwargs) +``` + +### è¯»å–æ–‡ä»¶å¹¶è¿”回列表或字典 + +例如, `a.txt` 是文本文件,一共有5行内容。 + +``` +a +b +c +d +e +``` +#### ä»Žç¡¬ç›˜è¯»å– + +使用 `list_from_file` è¯»å– `a.txt` + +```python +>>> mmcv.list_from_file('a.txt') +['a', 'b', 'c', 'd', 'e'] +>>> mmcv.list_from_file('a.txt', offset=2) +['c', 'd', 'e'] +>>> mmcv.list_from_file('a.txt', max_num=2) +['a', 'b'] +>>> mmcv.list_from_file('a.txt', prefix='/mnt/') +['/mnt/a', '/mnt/b', '/mnt/c', '/mnt/d', '/mnt/e'] +``` + +åŒæ ·ï¼Œ `b.txt` 也是文本文件,一共有3行内容 + +``` +1 cat +2 dog cow +3 panda +``` + +使用 `dict_from_file` è¯»å– `b.txt` + +```python +>>> mmcv.dict_from_file('b.txt') +{'1': 'cat', '2': ['dog', 'cow'], '3': 'panda'} +>>> mmcv.dict_from_file('b.txt', key_type=int) +{1: 'cat', 2: ['dog', 'cow'], 3: 'panda'} +``` + +#### 从其他åŽç«¯è¯»å– + +使用 `list_from_file` è¯»å– `s3://bucket-name/a.txt` + +```python +>>> mmcv.list_from_file('s3://bucket-name/a.txt') +['a', 'b', 'c', 'd', 'e'] +>>> mmcv.list_from_file('s3://bucket-name/a.txt', offset=2) +['c', 'd', 'e'] +>>> mmcv.list_from_file('s3://bucket-name/a.txt', max_num=2) +['a', 'b'] +>>> mmcv.list_from_file('s3://bucket-name/a.txt', prefix='/mnt/') +['/mnt/a', '/mnt/b', '/mnt/c', '/mnt/d', '/mnt/e'] +``` + +使用 `dict_from_file` è¯»å– `b.txt` + +```python +>>> mmcv.dict_from_file('s3://bucket-name/b.txt') +{'1': 'cat', '2': ['dog', 'cow'], '3': 'panda'} +>>> mmcv.dict_from_file('s3://bucket-name/b.txt', key_type=int) +{1: 'cat', 2: ['dog', 'cow'], 3: 'panda'} +``` + +### 读å–å’Œä¿å­˜æƒé‡æ–‡ä»¶ + +#### ä»Žç¡¬ç›˜è¯»å–æƒé‡æ–‡ä»¶æˆ–者将æƒé‡æ–‡ä»¶ä¿å­˜è‡³ç¡¬ç›˜ + +我们å¯ä»¥é€šè¿‡ä¸‹é¢çš„æ–¹å¼ä»Žç£ç›˜è¯»å–æƒé‡æ–‡ä»¶æˆ–者将æƒé‡æ–‡ä»¶ä¿å­˜è‡³ç£ç›˜ + +```python +import torch + +filepath1 = '/path/of/your/checkpoint1.pth' +filepath2 = '/path/of/your/checkpoint2.pth' +# 从 filepath1 è¯»å–æƒé‡æ–‡ä»¶ +checkpoint = torch.load(filepath1) +# å°†æƒé‡æ–‡ä»¶ä¿å­˜è‡³ filepath2 +torch.save(checkpoint, filepath2) +``` + +MMCV 
æä¾›äº†å¾ˆå¤šåŽç«¯ï¼Œ`HardDiskBackend` 是其中一个,我们å¯ä»¥é€šè¿‡å®ƒæ¥è¯»å–或者ä¿å­˜æƒé‡æ–‡ä»¶ã€‚ + +```python +import io +from mmcv.fileio.file_client import HardDiskBackend + +disk_backend = HardDiskBackend() +with io.BytesIO(disk_backend.get(filepath1)) as buffer: + checkpoint = torch.load(buffer) +with io.BytesIO() as buffer: + torch.save(checkpoint, f) + disk_backend.put(f.getvalue(), filepath2) +``` + +如果我们想在接å£ä¸­å®žçŽ°æ ¹æ®æ–‡ä»¶è·¯å¾„自动选择对应的åŽç«¯ï¼Œæˆ‘们å¯ä»¥ä½¿ç”¨ `FileClient`。 +ä¾‹å¦‚ï¼Œæˆ‘ä»¬æƒ³å®žçŽ°ä¸¤ä¸ªæ–¹æ³•ï¼Œåˆ†åˆ«æ˜¯è¯»å–æƒé‡ä»¥åŠä¿å­˜æƒé‡ï¼Œå®ƒä»¬éœ€æ”¯æŒä¸åŒç±»åž‹çš„æ–‡ä»¶è·¯å¾„,å¯ä»¥æ˜¯ç£ç›˜è·¯å¾„,也å¯ä»¥æ˜¯ç½‘络路径或者其他路径。 + +```python +from mmcv.fileio.file_client import FileClient + +def load_checkpoint(path): + file_client = FileClient.infer(uri=path) + with io.BytesIO(file_client.get(path)) as buffer: + checkpoint = torch.load(buffer) + return checkpoint + +def save_checkpoint(checkpoint, path): + with io.BytesIO() as buffer: + torch.save(checkpoint, buffer) + file_client.put(buffer.getvalue(), path) + +file_client = FileClient.infer_client(uri=filepath1) +checkpoint = load_checkpoint(filepath1) +save_checkpoint(checkpoint, filepath2) +``` + +#### ä»Žç½‘ç»œè¿œç«¯è¯»å–æƒé‡æ–‡ä»¶ + +```{note} +ç›®å‰åªæ”¯æŒä»Žç½‘ç»œè¿œç«¯è¯»å–æƒé‡æ–‡ä»¶ï¼Œæš‚䏿”¯æŒå°†æƒé‡æ–‡ä»¶å†™å…¥ç½‘络远端 +``` + +```python +import io +import torch +from mmcv.fileio.file_client import HTTPBackend, FileClient + +filepath = 'http://path/of/your/checkpoint.pth' +checkpoint = torch.utils.model_zoo.load_url(filepath) + +http_backend = HTTPBackend() +with io.BytesIO(http_backend.get(filepath)) as buffer: + checkpoint = torch.load(buffer) + +file_client = FileClient.infer_client(uri=filepath) +with io.BytesIO(file_client.get(filepath)) as buffer: + checkpoint = torch.load(buffer) +``` diff --git a/docs_zh_CN/understand_mmcv/ops.md b/docs_zh_CN/understand_mmcv/ops.md new file mode 100644 index 0000000..a45bb14 --- /dev/null +++ b/docs_zh_CN/understand_mmcv/ops.md @@ -0,0 +1,36 @@ +## CUDA ç®—å­ + 
+MMCV æä¾›äº†æ£€æµ‹ã€åˆ†å‰²ç­‰ä»»åŠ¡ä¸­å¸¸ç”¨çš„ CUDA ç®—å­ + +- AssignScoreWithK +- BallQuery +- BBoxOverlaps +- CARAFE +- CrissCrossAttention +- ContextBlock +- CornerPool +- Deformable Convolution v1/v2 +- Deformable RoIPool +- DynamicScatter +- GatherPoints +- FurthestPointSample +- FurthestPointSampleWithDist +- GeneralizedAttention +- KNN +- MaskedConv +- NMS +- PSAMask +- RoIPointPool3d +- RoIPool +- RoIAlign +- RoIAwarePool3d +- SimpleRoIAlign +- SigmoidFocalLoss +- SoftmaxFocalLoss +- SoftNMS +- Synchronized BatchNorm +- Voxelization +- ThreeInterpolate +- ThreeNN +- Weight standardization +- Correlation diff --git a/docs_zh_CN/understand_mmcv/registry.md b/docs_zh_CN/understand_mmcv/registry.md new file mode 100644 index 0000000..3afd0ab --- /dev/null +++ b/docs_zh_CN/understand_mmcv/registry.md @@ -0,0 +1,149 @@ +## 注册器 +MMCV 使用 [注册器](https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/registry.py) æ¥ç®¡ç†å…·æœ‰ç›¸ä¼¼åŠŸèƒ½çš„ä¸åŒæ¨¡å—, 例如, 检测器中的主干网络ã€å¤´éƒ¨ã€å’Œæ¨¡åž‹é¢ˆéƒ¨ã€‚ +在 OpenMMLab å®¶æ—中的ç»å¤§éƒ¨åˆ†å¼€æºé¡¹ç›®ä½¿ç”¨æ³¨å†Œå™¨åŽ»ç®¡ç†æ•°æ®é›†å’Œæ¨¡åž‹çš„æ¨¡å—,例如 [MMDetection](https://github.com/open-mmlab/mmdetection), [MMDetection3D](https://github.com/open-mmlab/mmdetection3d), [MMClassification](https://github.com/open-mmlab/mmclassification), [MMEditing](https://github.com/open-mmlab/mmediting) 等。 + +### 什么是注册器 +在MMCV中,注册器å¯ä»¥çœ‹ä½œç±»åˆ°å­—符串的映射。 +一个注册器中的类通常有相似的接å£ï¼Œä½†æ˜¯å¯ä»¥å®žçްä¸åŒçš„算法或支æŒä¸åŒçš„æ•°æ®é›†ã€‚ +借助注册器,用户å¯ä»¥é€šè¿‡ä½¿ç”¨ç›¸åº”的字符串查找并实例化该类,并根æ®ä»–们的需è¦å®žä¾‹åŒ–对应模å—。 +一个典型的案例是,OpenMMLab 中的大部分开æºé¡¹ç›®çš„é…置系统,这些系统通过é…置文件æ¥ä½¿ç”¨æ³¨å†Œå™¨åˆ›å»ºé’©å­ã€æ‰§è¡Œå™¨ã€æ¨¡åž‹å’Œæ•°æ®é›†ã€‚ +å¯ä»¥åœ¨[这里](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.Registry)找到注册器接å£ä½¿ç”¨æ–‡æ¡£ã€‚ + +使用 `registry`(注册器)管ç†ä»£ç åº“中的模型,需è¦ä»¥ä¸‹ä¸‰ä¸ªæ­¥éª¤ã€‚ + +1. 创建一个构建方法(å¯é€‰ï¼Œåœ¨å¤§å¤šæ•°æƒ…况下您å¯ä»¥åªä½¿ç”¨é»˜è®¤æ–¹æ³•) +2. 创建注册器 +3. 
使用此注册器æ¥ç®¡ç†æ¨¡å— + +`Registry`ï¼ˆæ³¨å†Œå™¨ï¼‰çš„å‚æ•° `build_func`(构建函数) 用æ¥è‡ªå®šä»¥å¦‚何实例化类的实例,默认使用 [这里](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.build_from_cfg)实现的`build_from_cfg`。 + +### 一个简å•çš„ä¾‹å­ + +这里是一个使用注册器管ç†åŒ…中模å—的简å•示例。您å¯ä»¥åœ¨ OpenMMLab å¼€æºé¡¹ç›®ä¸­æ‰¾åˆ°æ›´å¤šå®žä¾‹ã€‚ + +å‡è®¾æˆ‘们è¦å®žçŽ°ä¸€ç³»åˆ—æ•°æ®é›†è½¬æ¢å™¨ï¼ˆDataset Converter),用于将ä¸åŒæ ¼å¼çš„æ•°æ®è½¬æ¢ä¸ºæ ‡å‡†æ•°æ®æ ¼å¼ã€‚我们先创建一个å为converters的目录作为包,在包中我们创建一个文件æ¥å®žçŽ°æž„å»ºå™¨ï¼ˆbuilder),命å为converters/builder.py,如下 + +```python +from mmcv.utils import Registry +# 创建转æ¢å™¨ï¼ˆconverter)的注册器(registry) +CONVERTERS = Registry('converter') +``` + +ç„¶åŽæˆ‘们在包中å¯ä»¥å®žçްä¸åŒçš„转æ¢å™¨ï¼ˆconverter)。例如,在 `converters/converter1.py` 中实现 `Converter1`。 + +```python +from .builder import CONVERTERS + +# ä½¿ç”¨æ³¨å†Œå™¨ç®¡ç†æ¨¡å— +@CONVERTERS.register_module() +class Converter1(object): + def __init__(self, a, b): + self.a = a + self.b = b +``` +ä½¿ç”¨æ³¨å†Œå™¨ç®¡ç†æ¨¡å—çš„å…³é”®æ­¥éª¤æ˜¯ï¼Œå°†å®žçŽ°çš„æ¨¡å—æ³¨å†Œåˆ°æ³¨å†Œè¡¨ `CONVERTERS` 中。通过 `@CONVERTERS.register_module()` 装饰所实现的模å—,字符串和类之间的映射就å¯ä»¥ç”± `CONVERTERS` 构建和维护,如下所示: + +é€šè¿‡è¿™ç§æ–¹å¼ï¼Œå°±å¯ä»¥é€šè¿‡ `CONVERTERS` 建立字符串与类之间的映射,如下所示: + +```python +'Converter1' -> +``` + +如果模å—被æˆåŠŸæ³¨å†Œäº†ï¼Œä½ å¯ä»¥é€šè¿‡é…置文件使用这个转æ¢å™¨ï¼ˆconverter),如下所示: + +```python +converter_cfg = dict(type='Converter1', a=a_value, b=b_value) +converter = CONVERTERS.build(converter_cfg) +``` + +### 自定义构建函数 + +å‡è®¾æˆ‘们想自定义 `converters` 的构建æµç¨‹ï¼Œæˆ‘们å¯ä»¥å®žçŽ°ä¸€ä¸ªè‡ªå®šä¹‰çš„ `build_func` (构建函数)并将其传递到注册器中。 + +```python +from mmcv.utils import Registry + +# 创建一个构建函数 +def build_converter(cfg, registry, *args, **kwargs): + cfg_ = cfg.copy() + converter_type = cfg_.pop('type') + if converter_type not in registry: + raise KeyError(f'Unrecognized converter type {converter_type}') + else: + converter_cls = registry.get(converter_type) + + converter = converter_cls(*args, **kwargs, **cfg_) + return converter + +# 
创建一个用于转æ¢å™¨ï¼ˆconverters)的注册器,并传递(registry)``build_converter`` 函数 +CONVERTERS = Registry('converter', build_func=build_converter) +``` + +```{note} +注:在这个例å­ä¸­ï¼Œæˆ‘ä»¬æ¼”ç¤ºäº†å¦‚ä½•ä½¿ç”¨å‚æ•°ï¼š`build_func` 自定义构建类的实例的方法。 +该功能类似于默认的`build_from_cfg`。在大多数情况下,默认就足够了。 +``` + +`build_model_from_cfg`也实现了在`nn.Sequentail`中构建PyTorch模å—,你å¯ä»¥ç›´æŽ¥ä½¿ç”¨å®ƒä»¬ã€‚ + +### 注册器层结构 + +你也å¯ä»¥ä»Žå¤šä¸ª OpenMMLab å¼€æºæ¡†æž¶ä¸­æž„建模å—,例如,你å¯ä»¥æŠŠæ‰€æœ‰ [MMClassification](https://github.com/open-mmlab/mmclassification) 中的主干网络(backbone)用到 [MMDetection](https://github.com/open-mmlab/mmdetection) 的目标检测中,你也å¯ä»¥èžåˆ [MMDetection](https://github.com/open-mmlab/mmdetection) 中的目标检测模型 å’Œ [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) 语义分割模型。 + +下游代ç åº“中所有 `MODELS` 注册器都是MMCV `MODELS` æ³¨å†Œå™¨çš„å­æ³¨å†Œå™¨ã€‚åŸºæœ¬ä¸Šï¼Œä½¿ç”¨ä»¥ä¸‹ä¸¤ç§æ–¹æ³•ä»Žå­æ³¨å†Œå™¨æˆ–相邻兄弟注册器构建模å—。 + +1. ä»Žå­æ³¨å†Œå™¨ä¸­æž„建 + + 例如: + + 我们在 MMDetection 中定义: + + ```python + from mmcv.utils import Registry + from mmcv.cnn import MODELS as MMCV_MODELS + MODELS = Registry('model', parent=MMCV_MODELS) + + @MODELS.register_module() + class NetA(nn.Module): + def forward(self, x): + return x + ``` + + 我们在 MMClassification 中定义: + + ```python + from mmcv.utils import Registry + from mmcv.cnn import MODELS as MMCV_MODELS + MODELS = Registry('model', parent=MMCV_MODELS) + + @MODELS.register_module() + class NetB(nn.Module): + def forward(self, x): + return x + 1 + ``` + + 我们å¯ä»¥é€šè¿‡ä»¥ä¸‹ä»£ç åœ¨ MMDetection 或 MMClassification 中构建两个网络: + + ```python + from mmdet.models import MODELS + net_a = MODELS.build(cfg=dict(type='NetA')) + net_b = MODELS.build(cfg=dict(type='mmcls.NetB')) + ``` + + 或 + + ```python + from mmcls.models import MODELS + net_a = MODELS.build(cfg=dict(type='mmdet.NetA')) + net_b = MODELS.build(cfg=dict(type='NetB')) + ``` + +2. 
从父注册器中构建 + + MMCV中的共享`MODELS`注册器是所有下游代ç åº“的父注册器(根注册器): + + ```python + from mmcv.cnn import MODELS as MMCV_MODELS + net_a = MMCV_MODELS.build(cfg=dict(type='mmdet.NetA')) + net_b = MMCV_MODELS.build(cfg=dict(type='mmcls.NetB')) + ``` diff --git a/docs_zh_CN/understand_mmcv/runner.md b/docs_zh_CN/understand_mmcv/runner.md new file mode 100644 index 0000000..203a5dc --- /dev/null +++ b/docs_zh_CN/understand_mmcv/runner.md @@ -0,0 +1,155 @@ +## 执行器 + +执行器模å—负责模型训练过程调度,主è¦ç›®çš„æ˜¯è®©ç”¨æˆ·ä½¿ç”¨æ›´å°‘的代ç ä»¥åŠçµæ´»å¯é…置方å¼å¼€å¯è®­ç»ƒã€‚其具备如下核心特性: + +- 支æŒä»¥ `EpochBasedRunner` å’Œ `IterBasedRunner` 为å•ä½çš„迭代模å¼ä»¥æ»¡è¶³ä¸åŒåœºæ™¯ +- 支æŒå®šåˆ¶å·¥ä½œæµä»¥æ»¡è¶³è®­ç»ƒè¿‡ç¨‹ä¸­å„状æ€è‡ªç”±åˆ‡æ¢ï¼Œç›®å‰æ”¯æŒè®­ç»ƒå’ŒéªŒè¯ä¸¤ä¸ªå·¥ä½œæµã€‚工作æµå¯ä»¥ç®€å•ç†è§£ä¸ºä¸€ä¸ªå®Œæˆçš„训练和验è¯è¿­ä»£è¿‡ç¨‹ã€‚ +- é…åˆå„类默认和自定义 Hook,对外æä¾›äº†çµæ´»æ‰©å±•能力 + +### EpochBasedRunner + +é¡¾åæ€ä¹‰ï¼Œ`EpochBasedRunner` 是指以 epoch 为周期的工作æµï¼Œä¾‹å¦‚设置 workflow = [('train', 2), ('val', 1)] 表示循环迭代地训练 2 个 epoch,然åŽéªŒè¯ 1 个 epoch。MMDetection 目标检测框架默认采用的是 `EpochBasedRunner`。 + +其抽象逻辑如下所示: + +```python +# 训练终止æ¡ä»¶ +while curr_epoch < max_epochs: + # é历用户设置的工作æµï¼Œä¾‹å¦‚ workflow = [('train', 2),('val', 1)] + for i, flow in enumerate(workflow): + # mode 是工作æµå‡½æ•°ï¼Œä¾‹å¦‚ train, epochs 是迭代次数 + mode, epochs = flow + # è¦ä¹ˆè°ƒç”¨ self.train(),è¦ä¹ˆè°ƒç”¨ self.val() + epoch_runner = getattr(self, mode) + # è¿è¡Œå¯¹åº”工作æµå‡½æ•° + for _ in range(epochs): + epoch_runner(data_loaders[i], **kwargs) +``` +ç›®å‰æ”¯æŒè®­ç»ƒå’ŒéªŒè¯ä¸¤ä¸ªå·¥ä½œæµï¼Œä»¥è®­ç»ƒå‡½æ•°ä¸ºä¾‹ï¼Œå…¶æŠ½è±¡é€»è¾‘是: + +```python +# epoch_runner ç›®å‰å¯ä»¥æ˜¯ train 或者 val +def train(self, data_loader, **kwargs): + # é历 dataset,共返回一个 epoch çš„ batch æ•°æ® + for i, data_batch in enumerate(data_loader): + self.call_hook('before_train_iter') + # éªŒè¯æ—¶å€™ train_mode=False + self.run_iter(data_batch, train_mode=True, **kwargs) + self.call_hook('after_train_iter') + self.call_hook('after_train_epoch') +``` + +### IterBasedRunner +ä¸åŒäºŽ 
`EpochBasedRunner`,`IterBasedRunner` 是指以 iter 为周期的工作æµï¼Œä¾‹å¦‚设置 workflow = [('train', 2), ('val', 1)] 表示循环迭代的训练 2 个 iter,然åŽéªŒè¯ 1 个 iter,MMSegmentation 语义分割框架默认采用的是 `EpochBasedRunner`。 + +其抽象逻辑如下所示: + +```python +# 虽然是 iter å•ä½ï¼Œä½†æ˜¯æŸäº›åœºåˆéœ€è¦ epoch ä¿¡æ¯ï¼Œç”± IterLoader æä¾› +iter_loaders = [IterLoader(x) for x in data_loaders] +# 训练终止æ¡ä»¶ +while curr_iter < max_iters: + # é历用户设置的工作æµï¼Œä¾‹å¦‚ workflow = [('train', 2), ('val', 1)] + for i, flow in enumerate(workflow): + # mode 是工作æµå‡½æ•°ï¼Œä¾‹å¦‚ train, iters 是迭代次数 + mode, iters = flow + # è¦ä¹ˆè°ƒç”¨ self.train(),è¦ä¹ˆè°ƒç”¨ self.val() + iter_runner = getattr(self, mode) + # è¿è¡Œå¯¹åº”工作æµå‡½æ•° + for _ in range(iters): + iter_runner(iter_loaders[i], **kwargs) +``` +ç›®å‰æ”¯æŒè®­ç»ƒå’ŒéªŒè¯ä¸¤ä¸ªå·¥ä½œæµï¼Œä»¥éªŒè¯å‡½æ•°ä¸ºä¾‹ï¼Œå…¶æŠ½è±¡é€»è¾‘是: + +```python +# iter_runner ç›®å‰å¯ä»¥æ˜¯ train 或者 val +def val(self, data_loader, **kwargs): + # èŽ·å– batch æ•°æ®ï¼Œç”¨äºŽä¸€æ¬¡è¿­ä»£ + data_batch = next(data_loader) + self.call_hook('before_val_iter') + outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) + self.outputs = outputs + self.call_hook('after_val_iter') +``` + +除了上述基础功能外,`EpochBasedRunner` å’Œ `IterBasedRunner` 还æä¾›äº† resume 〠save_checkpoint 和注册 hook 功能。 + +### 一个简å•ä¾‹å­ +以最常用的分类任务为例详细说明 `runner` 的使用方法。 å¼€å¯ä»»ä½•一个训练任务,都需è¦åŒ…括如下步骤: + +**(1) dataloaderã€model 和优化器等类åˆå§‹åŒ–** + +```python +# 模型类åˆå§‹åŒ– +model=... +# 优化器类åˆå§‹åŒ–,典型值 cfg.optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001) +optimizer = build_optimizer(model, cfg.optimizer) +# 工作æµå¯¹åº”çš„ dataloader åˆå§‹åŒ– +data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + ...) 
for ds in dataset + ] +``` + +**(2) runner ç±»åˆå§‹åŒ–** + +```python +runner = build_runner( + # cfg.runner 典型é…置为 + # runner = dict(type='EpochBasedRunner', max_epochs=200) + cfg.runner, + default_args=dict( + model=model, + batch_processor=None, + optimizer=optimizer, + logger=logger)) +``` + +**(3) 注册默认训练所必须的 hook,和用户自定义 hook** + +```python +# 注册定制必需的 hook +runner.register_training_hooks( + # lr相关é…置,典型为 + # lr_config = dict(policy='step', step=[100, 150]) + cfg.lr_config, + # 优化相关é…置,例如 grad_clip ç­‰ + optimizer_config, + # æƒé‡ä¿å­˜ç›¸å…³é…置,典型为 + # checkpoint_config = dict(interval=1),æ¯ä¸ªå•ä½éƒ½ä¿å­˜æƒé‡ + cfg.checkpoint_config, + # 日志相关é…ç½® + cfg.log_config, + ...) + +# 注册用户自定义 hook +# 例如想使用 ema 功能,则å¯ä»¥è®¾ç½® custom_hooks=[dict(type='EMAHook')] +if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + for hook_cfg in cfg.custom_hooks: + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) +``` + +ç„¶åŽå¯ä»¥è¿›è¡Œ resume 或者 load_checkpoint 对æƒé‡è¿›è¡ŒåŠ è½½ã€‚ + +**(4) å¼€å¯è®­ç»ƒæµ** + +```python +# workflow 典型为 workflow = [('train', 1)] +# 此时就真正开å¯äº†è®­ç»ƒ +runner.run(data_loaders, cfg.workflow) +``` + +关于 workflow 设置,以 `EpochBasedRunner` 为例,详情如下: + +- å‡è®¾åªæƒ³è¿è¡Œè®­ç»ƒå·¥ä½œæµï¼Œåˆ™å¯ä»¥è®¾ç½® workflow = [('train', 1)],表示åªè¿›è¡Œè¿­ä»£è®­ç»ƒ +- å‡è®¾æƒ³è¿è¡Œè®­ç»ƒå’ŒéªŒè¯å·¥ä½œæµï¼Œåˆ™å¯ä»¥è®¾ç½® workflow = [('train', 3), ('val', 1)],表示先训练 3 个 epoch ,然åŽåˆ‡æ¢åˆ° val 工作æµï¼Œè¿è¡Œ 1 个 epoch,然åŽå¾ªçŽ¯ï¼Œç›´åˆ°è®­ç»ƒ epoch 次数达到指定值 +- 工作æµè®¾ç½®è¿˜è‡ªç”±å®šåˆ¶ï¼Œä¾‹å¦‚ä½ å¯ä»¥å…ˆéªŒè¯å†è®­ç»ƒ workflow = [('val', 1), ('train', 1)] + +上述代ç éƒ½å·²ç»å°è£…到了å„个代ç åº“çš„ train.py 中,用户åªéœ€è¦è®¾ç½®ç›¸åº”çš„é…ç½®å³å¯ï¼Œä¸Šè¿°æµç¨‹ä¼šè‡ªåЍè¿è¡Œã€‚ diff --git a/docs_zh_CN/understand_mmcv/utils.md b/docs_zh_CN/understand_mmcv/utils.md new file mode 100644 index 0000000..746c560 --- /dev/null +++ b/docs_zh_CN/understand_mmcv/utils.md @@ 
-0,0 +1,69 @@ +## 辅助函数 + +### è¿›åº¦æ¡ + +如果你想跟踪函数批处ç†ä»»åŠ¡çš„è¿›åº¦ï¼Œå¯ä»¥ä½¿ç”¨ `track_progress` 。它能以进度æ¡çš„å½¢å¼å±•ç¤ºä»»åŠ¡çš„å®Œæˆæƒ…况以åŠå‰©ä½™ä»»åŠ¡æ‰€éœ€çš„æ—¶é—´ï¼ˆå†…éƒ¨å®žçŽ°ä¸ºfor循环)。 + +```python +import mmcv + +def func(item): + # 执行相关æ“作 + pass + +tasks = [item_1, item_2, ..., item_n] + +mmcv.track_progress(func, tasks) +``` + +效果如下 +![progress](../../docs/_static/progress.*) + +如果你想å¯è§†åŒ–多进程任务的进度,你å¯ä»¥ä½¿ç”¨ `track_parallel_progress` 。 + +```python +mmcv.track_parallel_progress(func, tasks, 8) # 8 workers +``` + +![progress](../../docs/_static/parallel_progress.*) + +如果你想è¦è¿­ä»£æˆ–枚举数æ®åˆ—表并å¯è§†åŒ–进度,ä½ å¯ä»¥ä½¿ç”¨ `track_iter_progress` 。 + +```python +import mmcv + +tasks = [item_1, item_2, ..., item_n] + +for task in mmcv.track_iter_progress(tasks): + # do something like print + print(task) + +for i, task in enumerate(mmcv.track_iter_progress(tasks)): + # do something like print + print(i) + print(task) +``` + +### 计时器 + +mmcvæä¾›çš„ `Timer` å¯ä»¥å¾ˆæ–¹ä¾¿åœ°è®¡ç®—代ç å—的执行时间。 + +```python +import time + +with mmcv.Timer(): + # simulate some code block + time.sleep(1) +``` + +你也å¯ä»¥ä½¿ç”¨ `since_start()` å’Œ `since_last_check()` 。å‰è€…返回计时器å¯åЍåŽçš„è¿è¡Œæ—¶é•¿ï¼ŒåŽè€…返回最近一次查看计时器åŽçš„è¿è¡Œæ—¶é•¿ã€‚ + + +```python +timer = mmcv.Timer() +# code block 1 here +print(timer.since_start()) +# code block 2 here +print(timer.since_last_check()) +print(timer.since_start()) +``` diff --git a/docs/zh_cn/understand_mmcv/visualization.md b/docs_zh_CN/understand_mmcv/visualization.md similarity index 100% rename from docs/zh_cn/understand_mmcv/visualization.md rename to docs_zh_CN/understand_mmcv/visualization.md diff --git a/examples/train.py b/examples/train.py new file mode 100644 index 0000000..2dbdfee --- /dev/null +++ b/examples/train.py @@ -0,0 +1,84 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torchvision.transforms as transforms +from torch.utils.data import DataLoader +from 
torchvision.datasets import CIFAR10 + +from mmcv.parallel import MMDataParallel +from mmcv.runner import EpochBasedRunner +from mmcv.utils import get_logger + + +class Model(nn.Module): + + def __init__(self): + super(Model, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + self.loss_fn = nn.CrossEntropyLoss() + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + def train_step(self, data, optimizer): + images, labels = data + predicts = self(images) # -> self.__call__() -> self.forward() + loss = self.loss_fn(predicts, labels) + return {'loss': loss} + + +if __name__ == '__main__': + model = Model() + if torch.cuda.is_available(): + # only use gpu:0 to train + # Solved issue https://github.com/open-mmlab/mmcv/issues/1470 + model = MMDataParallel(model.cuda(), device_ids=[0]) + + # dataset and dataloader + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + ]) + trainset = CIFAR10( + root='data', train=True, download=True, transform=transform) + trainloader = DataLoader( + trainset, batch_size=128, shuffle=True, num_workers=2) + + optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) + logger = get_logger('mmcv') + # runner is a scheduler to manage the training + runner = EpochBasedRunner( + model, + optimizer=optimizer, + work_dir='./work_dir', + logger=logger, + max_epochs=4) + + # learning rate scheduler config + lr_config = dict(policy='step', step=[2, 3]) + # configuration of optimizer + optimizer_config = dict(grad_clip=None) + # configuration of saving checkpoints periodically + checkpoint_config = dict(interval=1) + # save log periodically and 
multiple hooks can be used simultaneously + log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) + # register hooks to runner and those hooks will be invoked automatically + runner.register_training_hooks( + lr_config=lr_config, + optimizer_config=optimizer_config, + checkpoint_config=checkpoint_config, + log_config=log_config) + + runner.run([trainloader], [('train', 1)]) diff --git a/mmcv/__init__.py b/mmcv/__init__.py index 2410ea5..210a298 100644 --- a/mmcv/__init__.py +++ b/mmcv/__init__.py @@ -1,13 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. # flake8: noqa from .arraymisc import * +from .fileio import * from .image import * -from .transforms import * +from .utils import * from .version import * from .video import * from .visualization import * # The following modules are not imported to this level, so mmcv may be used # without PyTorch. +# - runner +# - parallel # - op -# - utils diff --git a/mmcv/arraymisc/quantization.py b/mmcv/arraymisc/quantization.py index 6182710..8e47a35 100644 --- a/mmcv/arraymisc/quantization.py +++ b/mmcv/arraymisc/quantization.py @@ -1,20 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Union - import numpy as np -def quantize(arr: np.ndarray, - min_val: Union[int, float], - max_val: Union[int, float], - levels: int, - dtype=np.int64) -> tuple: +def quantize(arr, min_val, max_val, levels, dtype=np.int64): """Quantize an array of (-inf, inf) to [0, levels-1]. Args: arr (ndarray): Input array. - min_val (int or float): Minimum value to be clipped. - max_val (int or float): Maximum value to be clipped. + min_val (scalar): Minimum value to be clipped. + max_val (scalar): Maximum value to be clipped. levels (int): Quantization levels. dtype (np.type): The type of the quantized array. 
@@ -35,17 +29,13 @@ def quantize(arr: np.ndarray, return quantized_arr -def dequantize(arr: np.ndarray, - min_val: Union[int, float], - max_val: Union[int, float], - levels: int, - dtype=np.float64) -> tuple: +def dequantize(arr, min_val, max_val, levels, dtype=np.float64): """Dequantize an array. Args: arr (ndarray): Input array. - min_val (int or float): Minimum value to be clipped. - max_val (int or float): Maximum value to be clipped. + min_val (scalar): Minimum value to be clipped. + max_val (scalar): Maximum value to be clipped. levels (int): Quantization levels. dtype (np.type): The type of the dequantized array. diff --git a/mmcv/cnn/__init__.py b/mmcv/cnn/__init__.py index 10e7e02..7246c89 100644 --- a/mmcv/cnn/__init__.py +++ b/mmcv/cnn/__init__.py @@ -1,7 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from .alexnet import AlexNet # yapf: disable -from .bricks import (ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule, +from .bricks import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS, + PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS, + ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule, ConvTranspose2d, ConvTranspose3d, ConvWS2d, DepthwiseSeparableConvModule, GeneralizedAttention, HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d, @@ -9,20 +11,31 @@ from .bricks import (ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule, build_activation_layer, build_conv_layer, build_norm_layer, build_padding_layer, build_plugin_layer, build_upsample_layer, conv_ws_2d, is_norm) +from .builder import MODELS, build_model_from_cfg # yapf: enable from .resnet import ResNet, make_res_layer -from .rfsearch import Conv2dRFSearchOp, RFSearchHook -from .utils import fuse_conv_bn, get_model_complexity_info +from .utils import (INITIALIZERS, Caffe2XavierInit, ConstantInit, KaimingInit, + NormalInit, PretrainedInit, TruncNormalInit, UniformInit, + XavierInit, bias_init_with_prob, caffe2_xavier_init, + constant_init, fuse_conv_bn, get_model_complexity_info, + initialize, 
kaiming_init, normal_init, trunc_normal_init, + uniform_init, xavier_init) from .vgg import VGG, make_vgg_layer __all__ = [ 'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer', - 'ConvModule', 'build_activation_layer', 'build_conv_layer', - 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', - 'build_plugin_layer', 'is_norm', 'NonLocal1d', 'NonLocal2d', 'NonLocal3d', - 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', 'GeneralizedAttention', - 'Scale', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d', - 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', 'ConvTranspose2d', - 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'fuse_conv_bn', - 'get_model_complexity_info', 'Conv2dRFSearchOp', 'RFSearchHook' + 'constant_init', 'xavier_init', 'normal_init', 'trunc_normal_init', + 'uniform_init', 'kaiming_init', 'caffe2_xavier_init', + 'bias_init_with_prob', 'ConvModule', 'build_activation_layer', + 'build_conv_layer', 'build_norm_layer', 'build_padding_layer', + 'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'NonLocal1d', + 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', + 'GeneralizedAttention', 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', + 'PADDING_LAYERS', 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', + 'get_model_complexity_info', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d', + 'fuse_conv_bn', 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', + 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', + 'initialize', 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', + 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', + 'Caffe2XavierInit', 'MODELS', 'build_model_from_cfg' ] diff --git a/mmcv/cnn/alexnet.py b/mmcv/cnn/alexnet.py index 309be24..89e36b8 100644 --- a/mmcv/cnn/alexnet.py +++ b/mmcv/cnn/alexnet.py @@ -1,10 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import logging -from typing import Optional -import torch import torch.nn as nn -from mmengine.runner import load_checkpoint class AlexNet(nn.Module): @@ -14,8 +11,8 @@ class AlexNet(nn.Module): num_classes (int): number of classes for classification. """ - def __init__(self, num_classes: int = -1): - super().__init__() + def __init__(self, num_classes=-1): + super(AlexNet, self).__init__() self.num_classes = num_classes self.features = nn.Sequential( nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), @@ -43,9 +40,10 @@ class AlexNet(nn.Module): nn.Linear(4096, num_classes), ) - def init_weights(self, pretrained: Optional[str] = None) -> None: + def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() + from ..runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: # use default initializer @@ -53,7 +51,7 @@ class AlexNet(nn.Module): else: raise TypeError('pretrained must be a str or None') - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): x = self.features(x) if self.num_classes > 0: diff --git a/mmcv/cnn/bricks/__init__.py b/mmcv/cnn/bricks/__init__.py index 6c74986..0f33124 100644 --- a/mmcv/cnn/bricks/__init__.py +++ b/mmcv/cnn/bricks/__init__.py @@ -14,7 +14,9 @@ from .non_local import NonLocal1d, NonLocal2d, NonLocal3d from .norm import build_norm_layer, is_norm from .padding import build_padding_layer from .plugin import build_plugin_layer -from .scale import LayerScale, Scale +from .registry import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS, + PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS) +from .scale import Scale from .swish import Swish from .upsample import build_upsample_layer from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d, @@ -25,8 +27,9 @@ __all__ = [ 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d', 
'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention', - 'Scale', 'ConvAWS2d', 'ConvWS2d', 'conv_ws_2d', - 'DepthwiseSeparableConvModule', 'Swish', 'Linear', 'Conv2dAdaptivePadding', - 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', - 'Conv3d', 'Dropout', 'DropPath', 'LayerScale' + 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', 'PADDING_LAYERS', + 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'ConvAWS2d', 'ConvWS2d', + 'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear', + 'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', + 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath' ] diff --git a/mmcv/cnn/bricks/activation.py b/mmcv/cnn/bricks/activation.py index ae99714..79f1988 100644 --- a/mmcv/cnn/bricks/activation.py +++ b/mmcv/cnn/bricks/activation.py @@ -1,41 +1,20 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict - import torch import torch.nn as nn import torch.nn.functional as F -from mmengine.registry import MODELS -from mmengine.utils import digit_version -from mmengine.utils.dl_utils import TORCH_VERSION + +from mmcv.utils import TORCH_VERSION, build_from_cfg, digit_version +from .registry import ACTIVATION_LAYERS for module in [ nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU, nn.Sigmoid, nn.Tanh ]: - MODELS.register_module(module=module) - -if digit_version(torch.__version__) >= digit_version('1.7.0'): - MODELS.register_module(module=nn.SiLU, name='SiLU') -else: - - class SiLU(nn.Module): - """Sigmoid Weighted Liner Unit.""" + ACTIVATION_LAYERS.register_module(module=module) - def __init__(self, inplace=False): - super().__init__() - self.inplace = inplace - def forward(self, inputs) -> torch.Tensor: - if self.inplace: - return inputs.mul_(torch.sigmoid(inputs)) - else: - return inputs * torch.sigmoid(inputs) - - MODELS.register_module(module=SiLU, name='SiLU') - - -@MODELS.register_module(name='Clip') -@MODELS.register_module() 
+@ACTIVATION_LAYERS.register_module(name='Clip') +@ACTIVATION_LAYERS.register_module() class Clamp(nn.Module): """Clamp activation layer. @@ -49,12 +28,12 @@ class Clamp(nn.Module): Default to 1. """ - def __init__(self, min: float = -1., max: float = 1.): - super().__init__() + def __init__(self, min=-1., max=1.): + super(Clamp, self).__init__() self.min = min self.max = max - def forward(self, x) -> torch.Tensor: + def forward(self, x): """Forward function. Args: @@ -88,27 +67,26 @@ class GELU(nn.Module): >>> output = m(input) """ - def forward(self, input: torch.Tensor) -> torch.Tensor: + def forward(self, input): return F.gelu(input) if (TORCH_VERSION == 'parrots' or digit_version(TORCH_VERSION) < digit_version('1.4')): - MODELS.register_module(module=GELU) + ACTIVATION_LAYERS.register_module(module=GELU) else: - MODELS.register_module(module=nn.GELU) + ACTIVATION_LAYERS.register_module(module=nn.GELU) -def build_activation_layer(cfg: Dict) -> nn.Module: +def build_activation_layer(cfg): """Build activation layer. Args: cfg (dict): The activation layer config, which should contain: - - type (str): Layer type. - layer args: Args needed to instantiate an activation layer. Returns: nn.Module: Created activation layer. """ - return MODELS.build(cfg) + return build_from_cfg(cfg, ACTIVATION_LAYERS) diff --git a/mmcv/cnn/bricks/context_block.py b/mmcv/cnn/bricks/context_block.py index 1e78df8..d60fdb9 100644 --- a/mmcv/cnn/bricks/context_block.py +++ b/mmcv/cnn/bricks/context_block.py @@ -1,20 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Union - import torch -from mmengine.model import constant_init, kaiming_init -from mmengine.registry import MODELS from torch import nn +from ..utils import constant_init, kaiming_init +from .registry import PLUGIN_LAYERS + -def last_zero_init(m: Union[nn.Module, nn.Sequential]) -> None: +def last_zero_init(m): if isinstance(m, nn.Sequential): constant_init(m[-1], val=0) else: constant_init(m, val=0) -@MODELS.register_module() +@PLUGIN_LAYERS.register_module() class ContextBlock(nn.Module): """ContextBlock module in GCNet. @@ -35,11 +34,11 @@ class ContextBlock(nn.Module): _abbr_ = 'context_block' def __init__(self, - in_channels: int, - ratio: float, - pooling_type: str = 'att', - fusion_types: tuple = ('channel_add', )): - super().__init__() + in_channels, + ratio, + pooling_type='att', + fusion_types=('channel_add', )): + super(ContextBlock, self).__init__() assert pooling_type in ['avg', 'att'] assert isinstance(fusion_types, (list, tuple)) valid_fusion_types = ['channel_add', 'channel_mul'] @@ -83,7 +82,7 @@ class ContextBlock(nn.Module): if self.channel_mul_conv is not None: last_zero_init(self.channel_mul_conv) - def spatial_pool(self, x: torch.Tensor) -> torch.Tensor: + def spatial_pool(self, x): batch, channel, height, width = x.size() if self.pooling_type == 'att': input_x = x @@ -109,7 +108,7 @@ class ContextBlock(nn.Module): return context - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): # [N, C, 1, 1] context = self.spatial_pool(x) diff --git a/mmcv/cnn/bricks/conv.py b/mmcv/cnn/bricks/conv.py index ace744e..cf54491 100644 --- a/mmcv/cnn/bricks/conv.py +++ b/mmcv/cnn/bricks/conv.py @@ -1,16 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Dict, Optional - -from mmengine.registry import MODELS from torch import nn -MODELS.register_module('Conv1d', module=nn.Conv1d) -MODELS.register_module('Conv2d', module=nn.Conv2d) -MODELS.register_module('Conv3d', module=nn.Conv3d) -MODELS.register_module('Conv', module=nn.Conv2d) +from .registry import CONV_LAYERS + +CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d) +CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d) +CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d) +CONV_LAYERS.register_module('Conv', module=nn.Conv2d) -def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module: +def build_conv_layer(cfg, *args, **kwargs): """Build convolution layer. Args: @@ -35,15 +34,11 @@ def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module: cfg_ = cfg.copy() layer_type = cfg_.pop('type') + if layer_type not in CONV_LAYERS: + raise KeyError(f'Unrecognized norm type {layer_type}') + else: + conv_layer = CONV_LAYERS.get(layer_type) - # Switch registry to the target scope. If `conv_layer` cannot be found - # in the registry, fallback to search `conv_layer` in the - # mmengine.MODELS. - with MODELS.switch_scope_and_registry(None) as registry: - conv_layer = registry.get(layer_type) - if conv_layer is None: - raise KeyError(f'Cannot find {conv_layer} in registry under scope ' - f'name {registry.scope}') layer = conv_layer(*args, **kwargs, **cfg_) return layer diff --git a/mmcv/cnn/bricks/conv2d_adaptive_padding.py b/mmcv/cnn/bricks/conv2d_adaptive_padding.py index 0ac9949..b45e758 100644 --- a/mmcv/cnn/bricks/conv2d_adaptive_padding.py +++ b/mmcv/cnn/bricks/conv2d_adaptive_padding.py @@ -1,14 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import math -from typing import Tuple, Union -import torch -from mmengine.registry import MODELS from torch import nn from torch.nn import functional as F +from .registry import CONV_LAYERS -@MODELS.register_module() + +@CONV_LAYERS.register_module() class Conv2dAdaptivePadding(nn.Conv2d): """Implementation of 2D convolution in tensorflow with `padding` as "same", which applies padding to input (if needed) so that input image gets fully @@ -32,18 +31,18 @@ class Conv2dAdaptivePadding(nn.Conv2d): """ def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - bias: bool = True): + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True): super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): img_h, img_w = x.size()[-2:] kernel_h, kernel_w = self.weight.size()[-2:] stride_h, stride_w = self.stride diff --git a/mmcv/cnn/bricks/conv_module.py b/mmcv/cnn/bricks/conv_module.py index 1f8e160..4f19f1d 100644 --- a/mmcv/cnn/bricks/conv_module.py +++ b/mmcv/cnn/bricks/conv_module.py @@ -1,20 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import warnings -from typing import Dict, Optional, Tuple, Union -import torch import torch.nn as nn -from mmengine.model import constant_init, kaiming_init -from mmengine.registry import MODELS -from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm +from mmcv.utils import _BatchNorm, _InstanceNorm +from ..utils import constant_init, kaiming_init from .activation import build_activation_layer from .conv import build_conv_layer from .norm import build_norm_layer from .padding import build_padding_layer +from .registry import PLUGIN_LAYERS -@MODELS.register_module() +@PLUGIN_LAYERS.register_module() class ConvModule(nn.Module): """A conv block that bundles conv/norm/activation layers. @@ -70,22 +68,22 @@ class ConvModule(nn.Module): _abbr_ = 'conv_block' def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - bias: Union[bool, str] = 'auto', - conv_cfg: Optional[Dict] = None, - norm_cfg: Optional[Dict] = None, - act_cfg: Optional[Dict] = dict(type='ReLU'), - inplace: bool = True, - with_spectral_norm: bool = False, - padding_mode: str = 'zeros', - order: tuple = ('conv', 'norm', 'act')): - super().__init__() + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias='auto', + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + inplace=True, + with_spectral_norm=False, + padding_mode='zeros', + order=('conv', 'norm', 'act')): + super(ConvModule, self).__init__() assert conv_cfg is None or isinstance(conv_cfg, dict) assert norm_cfg is None or isinstance(norm_cfg, dict) assert act_cfg is None or isinstance(act_cfg, dict) @@ -98,7 +96,7 @@ class ConvModule(nn.Module): self.with_explicit_padding = padding_mode not in official_padding_mode self.order = order assert isinstance(self.order, tuple) 
and len(self.order) == 3 - assert set(order) == {'conv', 'norm', 'act'} + assert set(order) == set(['conv', 'norm', 'act']) self.with_norm = norm_cfg is not None self.with_activation = act_cfg is not None @@ -145,22 +143,21 @@ class ConvModule(nn.Module): norm_channels = out_channels else: norm_channels = in_channels - self.norm_name, norm = build_norm_layer( - norm_cfg, norm_channels) # type: ignore + self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) self.add_module(self.norm_name, norm) if self.with_bias: if isinstance(norm, (_BatchNorm, _InstanceNorm)): warnings.warn( 'Unnecessary conv bias before batch/instance norm') else: - self.norm_name = None # type: ignore + self.norm_name = None # build activation layer if self.with_activation: - act_cfg_ = act_cfg.copy() # type: ignore + act_cfg_ = act_cfg.copy() # nn.Tanh has no 'inplace' argument if act_cfg_['type'] not in [ - 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU' + 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish' ]: act_cfg_.setdefault('inplace', inplace) self.activate = build_activation_layer(act_cfg_) @@ -196,10 +193,7 @@ class ConvModule(nn.Module): if self.with_norm: constant_init(self.norm, 1, bias=0) - def forward(self, - x: torch.Tensor, - activate: bool = True, - norm: bool = True) -> torch.Tensor: + def forward(self, x, activate=True, norm=True): for layer in self.order: if layer == 'conv': if self.with_explicit_padding: diff --git a/mmcv/cnn/bricks/conv_ws.py b/mmcv/cnn/bricks/conv_ws.py index 261f5c1..a3941e2 100644 --- a/mmcv/cnn/bricks/conv_ws.py +++ b/mmcv/cnn/bricks/conv_ws.py @@ -1,21 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from collections import OrderedDict -from typing import Dict, List, Optional, Tuple, Union - import torch import torch.nn as nn import torch.nn.functional as F -from mmengine.registry import MODELS + +from .registry import CONV_LAYERS -def conv_ws_2d(input: torch.Tensor, - weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - eps: float = 1e-5) -> torch.Tensor: +def conv_ws_2d(input, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + eps=1e-5): c_in = weight.size(0) weight_flat = weight.view(c_in, -1) mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) @@ -24,20 +22,20 @@ def conv_ws_2d(input: torch.Tensor, return F.conv2d(input, weight, bias, stride, padding, dilation, groups) -@MODELS.register_module('ConvWS') +@CONV_LAYERS.register_module('ConvWS') class ConvWS2d(nn.Conv2d): def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - bias: bool = True, - eps: float = 1e-5): - super().__init__( + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + eps=1e-5): + super(ConvWS2d, self).__init__( in_channels, out_channels, kernel_size, @@ -48,12 +46,12 @@ class ConvWS2d(nn.Conv2d): bias=bias) self.eps = eps - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.eps) -@MODELS.register_module(name='ConvAWS') +@CONV_LAYERS.register_module(name='ConvAWS') class ConvAWS2d(nn.Conv2d): """AWS (Adaptive Weight Standardization) @@ -78,14 +76,14 @@ class ConvAWS2d(nn.Conv2d): """ def __init__(self, - 
in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - groups: int = 1, - bias: bool = True): + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True): super().__init__( in_channels, out_channels, @@ -100,7 +98,7 @@ class ConvAWS2d(nn.Conv2d): self.register_buffer('weight_beta', torch.zeros(self.out_channels, 1, 1, 1)) - def _get_weight(self, weight: torch.Tensor) -> torch.Tensor: + def _get_weight(self, weight): weight_flat = weight.view(weight.size(0), -1) mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) @@ -108,16 +106,13 @@ class ConvAWS2d(nn.Conv2d): weight = self.weight_gamma * weight + self.weight_beta return weight - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): weight = self._get_weight(self.weight) return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) - def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str, - local_metadata: Dict, strict: bool, - missing_keys: List[str], - unexpected_keys: List[str], - error_msgs: List[str]) -> None: + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): """Override default load function. 
AWS overrides the function _load_from_state_dict to recover @@ -129,7 +124,7 @@ class ConvAWS2d(nn.Conv2d): """ self.weight_gamma.data.fill_(-1) - local_missing_keys: List = [] + local_missing_keys = [] super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, local_missing_keys, unexpected_keys, error_msgs) diff --git a/mmcv/cnn/bricks/depthwise_separable_conv_module.py b/mmcv/cnn/bricks/depthwise_separable_conv_module.py index cf1fe4c..722d5d8 100644 --- a/mmcv/cnn/bricks/depthwise_separable_conv_module.py +++ b/mmcv/cnn/bricks/depthwise_separable_conv_module.py @@ -1,7 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, Optional, Tuple, Union - -import torch import torch.nn as nn from .conv_module import ConvModule @@ -49,27 +46,27 @@ class DepthwiseSeparableConvModule(nn.Module): """ def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - dilation: Union[int, Tuple[int, int]] = 1, - norm_cfg: Optional[Dict] = None, - act_cfg: Dict = dict(type='ReLU'), - dw_norm_cfg: Union[Dict, str] = 'default', - dw_act_cfg: Union[Dict, str] = 'default', - pw_norm_cfg: Union[Dict, str] = 'default', - pw_act_cfg: Union[Dict, str] = 'default', + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + dw_norm_cfg='default', + dw_act_cfg='default', + pw_norm_cfg='default', + pw_act_cfg='default', **kwargs): - super().__init__() + super(DepthwiseSeparableConvModule, self).__init__() assert 'groups' not in kwargs, 'groups should not be specified' # if norm/activation config of depthwise/pointwise ConvModule is not # specified, use default config. 
- dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501 + dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg - pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501 + pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg # depthwise convolution @@ -81,19 +78,19 @@ class DepthwiseSeparableConvModule(nn.Module): padding=padding, dilation=dilation, groups=in_channels, - norm_cfg=dw_norm_cfg, # type: ignore - act_cfg=dw_act_cfg, # type: ignore + norm_cfg=dw_norm_cfg, + act_cfg=dw_act_cfg, **kwargs) self.pointwise_conv = ConvModule( in_channels, out_channels, 1, - norm_cfg=pw_norm_cfg, # type: ignore - act_cfg=pw_act_cfg, # type: ignore + norm_cfg=pw_norm_cfg, + act_cfg=pw_act_cfg, **kwargs) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): x = self.depthwise_conv(x) x = self.pointwise_conv(x) return x diff --git a/mmcv/cnn/bricks/drop.py b/mmcv/cnn/bricks/drop.py index fe82a25..b0a0266 100644 --- a/mmcv/cnn/bricks/drop.py +++ b/mmcv/cnn/bricks/drop.py @@ -1,14 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Optional - import torch import torch.nn as nn -from mmengine.registry import MODELS + +from mmcv import build_from_cfg +from .registry import DROPOUT_LAYERS -def drop_path(x: torch.Tensor, - drop_prob: float = 0., - training: bool = False) -> torch.Tensor: +def drop_path(x, drop_prob=0., training=False): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). @@ -26,7 +24,7 @@ def drop_path(x: torch.Tensor, return output -@MODELS.register_module() +@DROPOUT_LAYERS.register_module() class DropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
@@ -38,15 +36,15 @@ class DropPath(nn.Module): drop_prob (float): Probability of the path to be zeroed. Default: 0.1 """ - def __init__(self, drop_prob: float = 0.1): - super().__init__() + def __init__(self, drop_prob=0.1): + super(DropPath, self).__init__() self.drop_prob = drop_prob - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): return drop_path(x, self.drop_prob, self.training) -@MODELS.register_module() +@DROPOUT_LAYERS.register_module() class Dropout(nn.Dropout): """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with @@ -58,10 +56,10 @@ class Dropout(nn.Dropout): inplace (bool): Do the operation inplace or not. Default: False. """ - def __init__(self, drop_prob: float = 0.5, inplace: bool = False): + def __init__(self, drop_prob=0.5, inplace=False): super().__init__(p=drop_prob, inplace=inplace) -def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any: +def build_dropout(cfg, default_args=None): """Builder for drop out layers.""" - return MODELS.build(cfg, default_args=default_args) + return build_from_cfg(cfg, DROPOUT_LAYERS, default_args) diff --git a/mmcv/cnn/bricks/generalized_attention.py b/mmcv/cnn/bricks/generalized_attention.py index ab20467..988d9ad 100644 --- a/mmcv/cnn/bricks/generalized_attention.py +++ b/mmcv/cnn/bricks/generalized_attention.py @@ -5,16 +5,17 @@ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from mmengine.model import kaiming_init -from mmengine.registry import MODELS +from ..utils import kaiming_init +from .registry import PLUGIN_LAYERS -@MODELS.register_module() + +@PLUGIN_LAYERS.register_module() class GeneralizedAttention(nn.Module): """GeneralizedAttention module. See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks' - (https://arxiv.org/abs/1904.05873) for details. + (https://arxiv.org/abs/1711.07971) for details. 
Args: in_channels (int): Channels of the input feature map. @@ -44,16 +45,16 @@ class GeneralizedAttention(nn.Module): _abbr_ = 'gen_attention_block' def __init__(self, - in_channels: int, - spatial_range: int = -1, - num_heads: int = 9, - position_embedding_dim: int = -1, - position_magnitude: int = 1, - kv_stride: int = 2, - q_stride: int = 1, - attention_type: str = '1111'): + in_channels, + spatial_range=-1, + num_heads=9, + position_embedding_dim=-1, + position_magnitude=1, + kv_stride=2, + q_stride=1, + attention_type='1111'): - super().__init__() + super(GeneralizedAttention, self).__init__() # hard range means local range for non-local operation self.position_embedding_dim = ( @@ -130,7 +131,7 @@ class GeneralizedAttention(nn.Module): max_len_kv = int((max_len - 1.0) / self.kv_stride + 1) local_constraint_map = np.ones( - (max_len, max_len, max_len_kv, max_len_kv), dtype=int) + (max_len, max_len, max_len_kv, max_len_kv), dtype=np.int) for iy in range(max_len): for ix in range(max_len): local_constraint_map[ @@ -212,7 +213,7 @@ class GeneralizedAttention(nn.Module): return embedding_x, embedding_y - def forward(self, x_input: torch.Tensor) -> torch.Tensor: + def forward(self, x_input): num_heads = self.num_heads # use empirical_attention @@ -350,7 +351,7 @@ class GeneralizedAttention(nn.Module): repeat(n, 1, 1, 1) position_feat_x_reshape = position_feat_x.\ - view(n, num_heads, w * w_kv, self.qk_embed_dim) + view(n, num_heads, w*w_kv, self.qk_embed_dim) position_feat_y_reshape = position_feat_y.\ view(n, num_heads, h * h_kv, self.qk_embed_dim) diff --git a/mmcv/cnn/bricks/hsigmoid.py b/mmcv/cnn/bricks/hsigmoid.py index 423e0aa..30b1a3d 100644 --- a/mmcv/cnn/bricks/hsigmoid.py +++ b/mmcv/cnn/bricks/hsigmoid.py @@ -1,24 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import warnings - -import torch import torch.nn as nn -from mmengine.registry import MODELS + +from .registry import ACTIVATION_LAYERS -@MODELS.register_module() +@ACTIVATION_LAYERS.register_module() class HSigmoid(nn.Module): """Hard Sigmoid Module. Apply the hard sigmoid function: Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value) - Default: Hsigmoid(x) = min(max((x + 3) / 6, 0), 1) - - Note: - In MMCV v1.4.4, we modified the default value of args to align with - PyTorch official. + Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1) Args: - bias (float): Bias of the input feature map. Default: 3.0. - divisor (float): Divisor of the input feature map. Default: 6.0. + bias (float): Bias of the input feature map. Default: 1.0. + divisor (float): Divisor of the input feature map. Default: 2.0. min_value (float): Lower bound value. Default: 0.0. max_value (float): Upper bound value. Default: 1.0. @@ -26,25 +20,15 @@ class HSigmoid(nn.Module): Tensor: The output tensor. """ - def __init__(self, - bias: float = 3.0, - divisor: float = 6.0, - min_value: float = 0.0, - max_value: float = 1.0): - super().__init__() - warnings.warn( - 'In MMCV v1.4.4, we modified the default value of args to align ' - 'with PyTorch official. Previous Implementation: ' - 'Hsigmoid(x) = min(max((x + 1) / 2, 0), 1). 
' - 'Current Implementation: ' - 'Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).') + def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0): + super(HSigmoid, self).__init__() self.bias = bias self.divisor = divisor assert self.divisor != 0 self.min_value = min_value self.max_value = max_value - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): x = (x + self.bias) / self.divisor return x.clamp_(self.min_value, self.max_value) diff --git a/mmcv/cnn/bricks/hswish.py b/mmcv/cnn/bricks/hswish.py index 6b6dd00..7e0c090 100644 --- a/mmcv/cnn/bricks/hswish.py +++ b/mmcv/cnn/bricks/hswish.py @@ -1,11 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -import torch import torch.nn as nn -from mmengine.registry import MODELS -from mmengine.utils import digit_version -from mmengine.utils.dl_utils import TORCH_VERSION +from .registry import ACTIVATION_LAYERS + +@ACTIVATION_LAYERS.register_module() class HSwish(nn.Module): """Hard Swish Module. @@ -22,18 +21,9 @@ class HSwish(nn.Module): Tensor: The output tensor. """ - def __init__(self, inplace: bool = False): - super().__init__() + def __init__(self, inplace=False): + super(HSwish, self).__init__() self.act = nn.ReLU6(inplace) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): return x * self.act(x + 3) / 6 - - -if (TORCH_VERSION == 'parrots' - or digit_version(TORCH_VERSION) < digit_version('1.7')): - # Hardswish is not supported when PyTorch version < 1.6. - # And Hardswish in PyTorch 1.6 does not support inplace. - MODELS.register_module(module=HSwish) -else: - MODELS.register_module(module=nn.Hardswish, name='HSwish') diff --git a/mmcv/cnn/bricks/non_local.py b/mmcv/cnn/bricks/non_local.py index 8dd4465..92d0015 100644 --- a/mmcv/cnn/bricks/non_local.py +++ b/mmcv/cnn/bricks/non_local.py @@ -1,13 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from abc import ABCMeta -from typing import Dict, Optional import torch import torch.nn as nn -from mmengine.model import constant_init, normal_init -from mmengine.registry import MODELS +from ..utils import constant_init, normal_init from .conv_module import ConvModule +from .registry import PLUGIN_LAYERS class _NonLocalNd(nn.Module, metaclass=ABCMeta): @@ -34,14 +33,14 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): """ def __init__(self, - in_channels: int, - reduction: int = 2, - use_scale: bool = True, - conv_cfg: Optional[Dict] = None, - norm_cfg: Optional[Dict] = None, - mode: str = 'embedded_gaussian', + in_channels, + reduction=2, + use_scale=True, + conv_cfg=None, + norm_cfg=None, + mode='embedded_gaussian', **kwargs): - super().__init__() + super(_NonLocalNd, self).__init__() self.in_channels = in_channels self.reduction = reduction self.use_scale = use_scale @@ -62,7 +61,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): self.inter_channels, kernel_size=1, conv_cfg=conv_cfg, - act_cfg=None) # type: ignore + act_cfg=None) self.conv_out = ConvModule( self.inter_channels, self.in_channels, @@ -97,7 +96,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): self.init_weights(**kwargs) - def init_weights(self, std: float = 0.01, zeros_init: bool = True) -> None: + def init_weights(self, std=0.01, zeros_init=True): if self.mode != 'gaussian': for m in [self.g, self.theta, self.phi]: normal_init(m.conv, std=std) @@ -114,8 +113,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): else: normal_init(self.conv_out.norm, std=std) - def gaussian(self, theta_x: torch.Tensor, - phi_x: torch.Tensor) -> torch.Tensor: + def gaussian(self, theta_x, phi_x): # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -123,8 +121,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): pairwise_weight = pairwise_weight.softmax(dim=-1) return pairwise_weight - def embedded_gaussian(self, theta_x: 
torch.Tensor, - phi_x: torch.Tensor) -> torch.Tensor: + def embedded_gaussian(self, theta_x, phi_x): # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -135,8 +132,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): pairwise_weight = pairwise_weight.softmax(dim=-1) return pairwise_weight - def dot_product(self, theta_x: torch.Tensor, - phi_x: torch.Tensor) -> torch.Tensor: + def dot_product(self, theta_x, phi_x): # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -144,8 +140,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): pairwise_weight /= pairwise_weight.shape[-1] return pairwise_weight - def concatenation(self, theta_x: torch.Tensor, - phi_x: torch.Tensor) -> torch.Tensor: + def concatenation(self, theta_x, phi_x): # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -162,7 +157,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): return pairwise_weight - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): # Assume `reduction = 1`, then `inter_channels = C` # or `inter_channels = C` when `mode="gaussian"` @@ -229,11 +224,12 @@ class NonLocal1d(_NonLocalNd): """ def __init__(self, - in_channels: int, - sub_sample: bool = False, - conv_cfg: Dict = dict(type='Conv1d'), + in_channels, + sub_sample=False, + conv_cfg=dict(type='Conv1d'), **kwargs): - super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) + super(NonLocal1d, self).__init__( + in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample @@ -246,7 +242,7 @@ class NonLocal1d(_NonLocalNd): self.phi = max_pool_layer -@MODELS.register_module() +@PLUGIN_LAYERS.register_module() class NonLocal2d(_NonLocalNd): """2D Non-local module. 
@@ -262,11 +258,12 @@ class NonLocal2d(_NonLocalNd): _abbr_ = 'nonlocal_block' def __init__(self, - in_channels: int, - sub_sample: bool = False, - conv_cfg: Dict = dict(type='Conv2d'), + in_channels, + sub_sample=False, + conv_cfg=dict(type='Conv2d'), **kwargs): - super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) + super(NonLocal2d, self).__init__( + in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample @@ -292,11 +289,12 @@ class NonLocal3d(_NonLocalNd): """ def __init__(self, - in_channels: int, - sub_sample: bool = False, - conv_cfg: Dict = dict(type='Conv3d'), + in_channels, + sub_sample=False, + conv_cfg=dict(type='Conv3d'), **kwargs): - super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) + super(NonLocal3d, self).__init__( + in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample if sub_sample: diff --git a/mmcv/cnn/bricks/norm.py b/mmcv/cnn/bricks/norm.py index 2fff684..cfb326b 100644 --- a/mmcv/cnn/bricks/norm.py +++ b/mmcv/cnn/bricks/norm.py @@ -1,24 +1,23 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import inspect -from typing import Dict, Tuple, Union import torch.nn as nn -from mmengine.registry import MODELS -from mmengine.utils import is_tuple_of -from mmengine.utils.dl_utils.parrots_wrapper import (SyncBatchNorm, _BatchNorm, - _InstanceNorm) - -MODELS.register_module('BN', module=nn.BatchNorm2d) -MODELS.register_module('BN1d', module=nn.BatchNorm1d) -MODELS.register_module('BN2d', module=nn.BatchNorm2d) -MODELS.register_module('BN3d', module=nn.BatchNorm3d) -MODELS.register_module('SyncBN', module=SyncBatchNorm) -MODELS.register_module('GN', module=nn.GroupNorm) -MODELS.register_module('LN', module=nn.LayerNorm) -MODELS.register_module('IN', module=nn.InstanceNorm2d) -MODELS.register_module('IN1d', module=nn.InstanceNorm1d) -MODELS.register_module('IN2d', module=nn.InstanceNorm2d) -MODELS.register_module('IN3d', module=nn.InstanceNorm3d) + +from mmcv.utils import is_tuple_of +from mmcv.utils.parrots_wrapper import SyncBatchNorm, _BatchNorm, _InstanceNorm +from .registry import NORM_LAYERS + +NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d) +NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d) +NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d) +NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d) +NORM_LAYERS.register_module('SyncBN', module=SyncBatchNorm) +NORM_LAYERS.register_module('GN', module=nn.GroupNorm) +NORM_LAYERS.register_module('LN', module=nn.LayerNorm) +NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d) +NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d) +NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d) +NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d) def infer_abbr(class_type): @@ -70,9 +69,7 @@ def infer_abbr(class_type): return 'norm_layer' -def build_norm_layer(cfg: Dict, - num_features: int, - postfix: Union[int, str] = '') -> Tuple[str, nn.Module]: +def build_norm_layer(cfg, num_features, postfix=''): """Build normalization layer. 
Args: @@ -86,9 +83,9 @@ def build_norm_layer(cfg: Dict, to create named layer. Returns: - tuple[str, nn.Module]: The first element is the layer name consisting - of abbreviation and postfix, e.g., bn1, gn. The second element is the - created norm layer. + (str, nn.Module): The first element is the layer name consisting of + abbreviation and postfix, e.g., bn1, gn. The second element is the + created norm layer. """ if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') @@ -97,15 +94,10 @@ def build_norm_layer(cfg: Dict, cfg_ = cfg.copy() layer_type = cfg_.pop('type') + if layer_type not in NORM_LAYERS: + raise KeyError(f'Unrecognized norm type {layer_type}') - # Switch registry to the target scope. If `norm_layer` cannot be found - # in the registry, fallback to search `norm_layer` in the - # mmengine.MODELS. - with MODELS.switch_scope_and_registry(None) as registry: - norm_layer = registry.get(layer_type) - if norm_layer is None: - raise KeyError(f'Cannot find {norm_layer} in registry under scope ' - f'name {registry.scope}') + norm_layer = NORM_LAYERS.get(layer_type) abbr = infer_abbr(norm_layer) assert isinstance(postfix, (int, str)) @@ -127,8 +119,7 @@ def build_norm_layer(cfg: Dict, return name, layer -def is_norm(layer: nn.Module, - exclude: Union[type, tuple, None] = None) -> bool: +def is_norm(layer, exclude=None): """Check if a layer is a normalization layer. Args: diff --git a/mmcv/cnn/bricks/padding.py b/mmcv/cnn/bricks/padding.py index 4135a19..e4ac6b2 100644 --- a/mmcv/cnn/bricks/padding.py +++ b/mmcv/cnn/bricks/padding.py @@ -1,19 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Dict - import torch.nn as nn -from mmengine.registry import MODELS -MODELS.register_module('zero', module=nn.ZeroPad2d) -MODELS.register_module('reflect', module=nn.ReflectionPad2d) -MODELS.register_module('replicate', module=nn.ReplicationPad2d) +from .registry import PADDING_LAYERS + +PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d) +PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d) +PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d) -def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module: +def build_padding_layer(cfg, *args, **kwargs): """Build padding layer. Args: - cfg (dict): The padding layer config, which should contain: + cfg (None or dict): The padding layer config, which should contain: - type (str): Layer type. - layer args: Args needed to instantiate a padding layer. @@ -27,15 +26,11 @@ def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module: cfg_ = cfg.copy() padding_type = cfg_.pop('type') + if padding_type not in PADDING_LAYERS: + raise KeyError(f'Unrecognized padding type {padding_type}.') + else: + padding_layer = PADDING_LAYERS.get(padding_type) - # Switch registry to the target scope. If `padding_layer` cannot be found - # in the registry, fallback to search `padding_layer` in the - # mmengine.MODELS. - with MODELS.switch_scope_and_registry(None) as registry: - padding_layer = registry.get(padding_type) - if padding_layer is None: - raise KeyError(f'Cannot find {padding_layer} in registry under scope ' - f'name {registry.scope}') layer = padding_layer(*args, **kwargs, **cfg_) return layer diff --git a/mmcv/cnn/bricks/plugin.py b/mmcv/cnn/bricks/plugin.py index 83ba373..07c010d 100644 --- a/mmcv/cnn/bricks/plugin.py +++ b/mmcv/cnn/bricks/plugin.py @@ -1,18 +1,15 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
import inspect import platform -from typing import Dict, Tuple, Union -import torch.nn as nn -from mmengine.registry import MODELS +from .registry import PLUGIN_LAYERS if platform.system() == 'Windows': - import regex as re # type: ignore + import regex as re else: - import re # type: ignore + import re -def infer_abbr(class_type: type) -> str: +def infer_abbr(class_type): """Infer abbreviation from the class name. This method will infer the abbreviation to map class types to @@ -50,27 +47,25 @@ def infer_abbr(class_type: type) -> str: raise TypeError( f'class_type must be a type, but got {type(class_type)}') if hasattr(class_type, '_abbr_'): - return class_type._abbr_ # type: ignore + return class_type._abbr_ else: return camel2snack(class_type.__name__) -def build_plugin_layer(cfg: Dict, - postfix: Union[int, str] = '', - **kwargs) -> Tuple[str, nn.Module]: +def build_plugin_layer(cfg, postfix='', **kwargs): """Build plugin layer. Args: - cfg (dict): cfg should contain: - - - type (str): identify plugin layer type. - - layer args: args needed to instantiate a plugin layer. + cfg (None or dict): cfg should contain: + type (str): identify plugin layer type. + layer args: args needed to instantiate a plugin layer. postfix (int, str): appended into norm abbreviation to create named layer. Default: ''. Returns: - tuple[str, nn.Module]: The first one is the concatenation of - abbreviation and postfix. The second is the created plugin layer. + tuple[str, nn.Module]: + name (str): abbreviation + postfix + layer (nn.Module): created plugin layer """ if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') @@ -79,15 +74,10 @@ def build_plugin_layer(cfg: Dict, cfg_ = cfg.copy() layer_type = cfg_.pop('type') + if layer_type not in PLUGIN_LAYERS: + raise KeyError(f'Unrecognized plugin type {layer_type}') - # Switch registry to the target scope. If `plugin_layer` cannot be found - # in the registry, fallback to search `plugin_layer` in the - # mmengine.MODELS. 
- with MODELS.switch_scope_and_registry(None) as registry: - plugin_layer = registry.get(layer_type) - if plugin_layer is None: - raise KeyError(f'Cannot find {plugin_layer} in registry under scope ' - f'name {registry.scope}') + plugin_layer = PLUGIN_LAYERS.get(layer_type) abbr = infer_abbr(plugin_layer) assert isinstance(postfix, (int, str)) diff --git a/mmcv/cnn/bricks/registry.py b/mmcv/cnn/bricks/registry.py new file mode 100644 index 0000000..c292797 --- /dev/null +++ b/mmcv/cnn/bricks/registry.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import Registry + +CONV_LAYERS = Registry('conv layer') +NORM_LAYERS = Registry('norm layer') +ACTIVATION_LAYERS = Registry('activation layer') +PADDING_LAYERS = Registry('padding layer') +UPSAMPLE_LAYERS = Registry('upsample layer') +PLUGIN_LAYERS = Registry('plugin layer') + +DROPOUT_LAYERS = Registry('drop out layers') +POSITIONAL_ENCODING = Registry('position encoding') +ATTENTION = Registry('attention') +FEEDFORWARD_NETWORK = Registry('feed-forward Network') +TRANSFORMER_LAYER = Registry('transformerLayer') +TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence') diff --git a/mmcv/cnn/bricks/scale.py b/mmcv/cnn/bricks/scale.py index a473798..c905fff 100644 --- a/mmcv/cnn/bricks/scale.py +++ b/mmcv/cnn/bricks/scale.py @@ -13,45 +13,9 @@ class Scale(nn.Module): scale (float): Initial value of scale factor. Default: 1.0 """ - def __init__(self, scale: float = 1.0): - super().__init__() + def __init__(self, scale=1.0): + super(Scale, self).__init__() self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): return x * self.scale - - -class LayerScale(nn.Module): - """LayerScale layer. - - Args: - dim (int): Dimension of input features. - inplace (bool): Whether performs operation in-place. - Default: `False`. 
- data_format (str): The input data format, could be 'channels_last' - or 'channels_first', representing (B, C, H, W) and - (B, N, C) format data respectively. Default: 'channels_last'. - scale (float): Initial value of scale factor. Default: 1.0 - """ - - def __init__(self, - dim: int, - inplace: bool = False, - data_format: str = 'channels_last', - scale: float = 1e-5): - super().__init__() - assert data_format in ('channels_last', 'channels_first'), \ - "'data_format' could only be channels_last or channels_first." - self.inplace = inplace - self.data_format = data_format - self.weight = nn.Parameter(torch.ones(dim) * scale) - - def forward(self, x) -> torch.Tensor: - if self.data_format == 'channels_first': - shape = tuple((1, -1, *(1 for _ in range(x.dim() - 2)))) - else: - shape = tuple((*(1 for _ in range(x.dim() - 1)), -1)) - if self.inplace: - return x.mul_(self.weight.view(*shape)) - else: - return x * self.weight.view(*shape) diff --git a/mmcv/cnn/bricks/swish.py b/mmcv/cnn/bricks/swish.py index 75ad75b..e2ca8ed 100644 --- a/mmcv/cnn/bricks/swish.py +++ b/mmcv/cnn/bricks/swish.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn -from mmengine.registry import MODELS +from .registry import ACTIVATION_LAYERS -@MODELS.register_module() + +@ACTIVATION_LAYERS.register_module() class Swish(nn.Module): """Swish Module. @@ -18,7 +19,7 @@ class Swish(nn.Module): """ def __init__(self): - super().__init__() + super(Swish, self).__init__() - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): return x * torch.sigmoid(x) diff --git a/mmcv/cnn/bricks/transformer.py b/mmcv/cnn/bricks/transformer.py index f83b9a6..ed32688 100644 --- a/mmcv/cnn/bricks/transformer.py +++ b/mmcv/cnn/bricks/transformer.py @@ -1,26 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import copy -import math import warnings -from typing import Sequence import torch import torch.nn as nn -import torch.nn.functional as F -from mmengine.config import ConfigDict -from mmengine.model import BaseModule, ModuleList, Sequential -from mmengine.registry import MODELS -from mmengine.utils import deprecated_api_warning, to_2tuple - -from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, - build_norm_layer) + +from mmcv import ConfigDict, deprecated_api_warning +from mmcv.cnn import Linear, build_activation_layer, build_norm_layer +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import build_from_cfg from .drop import build_dropout -from .scale import LayerScale +from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, + TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) # Avoid BC-breaking of importing MultiScaleDeformableAttention from this file try: - from mmcv.ops.multi_scale_deform_attn import \ - MultiScaleDeformableAttention # noqa F401 + from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 warnings.warn( ImportWarning( '``MultiScaleDeformableAttention`` has been moved to ' @@ -32,379 +27,35 @@ try: except ImportError: warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' '``mmcv.ops.multi_scale_deform_attn``, ' - 'You should install ``mmcv`` rather than ``mmcv-lite`` ' - 'if you need this module. ') + 'You should install ``mmcv-full`` if you need this module. 
') def build_positional_encoding(cfg, default_args=None): """Builder for Position Encoding.""" - return MODELS.build(cfg, default_args=default_args) + return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args) def build_attention(cfg, default_args=None): """Builder for attention.""" - return MODELS.build(cfg, default_args=default_args) + return build_from_cfg(cfg, ATTENTION, default_args) def build_feedforward_network(cfg, default_args=None): """Builder for feed-forward network (FFN).""" - return MODELS.build(cfg, default_args=default_args) + return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args) def build_transformer_layer(cfg, default_args=None): """Builder for transformer layer.""" - return MODELS.build(cfg, default_args=default_args) + return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args) def build_transformer_layer_sequence(cfg, default_args=None): """Builder for transformer encoder and transformer decoder.""" - return MODELS.build(cfg, default_args=default_args) - - -class AdaptivePadding(nn.Module): - """Applies padding adaptively to the input. - - This module can make input get fully covered by filter - you specified. It support two modes "same" and "corner". The - "same" mode is same with "SAME" padding mode in TensorFlow, pad - zero around input. The "corner" mode would pad zero - to bottom right. - - Args: - kernel_size (int | tuple): Size of the kernel. Default: 1. - stride (int | tuple): Stride of the filter. Default: 1. - dilation (int | tuple): Spacing between kernel elements. - Default: 1. - padding (str): Support "same" and "corner", "corner" mode - would pad zero to bottom right, and "same" mode would - pad zero around input. Default: "corner". 
- - Example: - >>> kernel_size = 16 - >>> stride = 16 - >>> dilation = 1 - >>> input = torch.rand(1, 1, 15, 17) - >>> adap_pad = AdaptivePadding( - >>> kernel_size=kernel_size, - >>> stride=stride, - >>> dilation=dilation, - >>> padding="corner") - >>> out = adap_pad(input) - >>> assert (out.shape[2], out.shape[3]) == (16, 32) - >>> input = torch.rand(1, 1, 16, 17) - >>> out = adap_pad(input) - >>> assert (out.shape[2], out.shape[3]) == (16, 32) - """ - - def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): - super().__init__() - assert padding in ('same', 'corner') - - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) - - self.padding = padding - self.kernel_size = kernel_size - self.stride = stride - self.dilation = dilation - - def get_pad_shape(self, input_shape): - """Calculate the padding size of input. - - Args: - input_shape (:obj:`torch.Size`): arrange as (H, W). - - Returns: - Tuple[int]: The padding size along the - original H and W directions - """ - input_h, input_w = input_shape - kernel_h, kernel_w = self.kernel_size - stride_h, stride_w = self.stride - output_h = math.ceil(input_h / stride_h) - output_w = math.ceil(input_w / stride_w) - pad_h = max((output_h - 1) * stride_h + - (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) - pad_w = max((output_w - 1) * stride_w + - (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) - return pad_h, pad_w - - def forward(self, x): - """Add padding to `x` - - Args: - x (Tensor): Input tensor has shape (B, C, H, W). 
- - Returns: - Tensor: The tensor with adaptive padding - """ - pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) - if pad_h > 0 or pad_w > 0: - if self.padding == 'corner': - x = F.pad(x, [0, pad_w, 0, pad_h]) - elif self.padding == 'same': - x = F.pad(x, [ - pad_w // 2, pad_w - pad_w // 2, pad_h // 2, - pad_h - pad_h // 2 - ]) - return x + return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args) -class PatchEmbed(BaseModule): - """Image to Patch Embedding. - - We use a conv layer to implement PatchEmbed. - - Args: - in_channels (int): The num of input channels. Default: 3 - embed_dims (int): The dimensions of embedding. Default: 768 - conv_type (str): The type of convolution - to generate patch embedding. Default: "Conv2d". - kernel_size (int): The kernel_size of embedding conv. Default: 16. - stride (int): The slide stride of embedding conv. - Default: 16. - padding (int | tuple | string): The padding length of - embedding conv. When it is a string, it means the mode - of adaptive padding, support "same" and "corner" now. - Default: "corner". - dilation (int): The dilation rate of embedding conv. Default: 1. - bias (bool): Bias of embed conv. Default: True. - norm_cfg (dict, optional): Config dict for normalization layer. - Default: None. - input_size (int | tuple | None): The size of input, which will be - used to calculate the out size. Only works when `dynamic_size` - is False. Default: None. - init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. - Default: None. 
- """ - - def __init__(self, - in_channels=3, - embed_dims=768, - conv_type='Conv2d', - kernel_size=16, - stride=16, - padding='corner', - dilation=1, - bias=True, - norm_cfg=None, - input_size=None, - init_cfg=None): - super().__init__(init_cfg=init_cfg) - - self.embed_dims = embed_dims - if stride is None: - stride = kernel_size - - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) - - if isinstance(padding, str): - self.adaptive_padding = AdaptivePadding( - kernel_size=kernel_size, - stride=stride, - dilation=dilation, - padding=padding) - # disable the padding of conv - padding = 0 - else: - self.adaptive_padding = None - padding = to_2tuple(padding) - - self.projection = build_conv_layer( - dict(type=conv_type), - in_channels=in_channels, - out_channels=embed_dims, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - bias=bias) - - if norm_cfg is not None: - self.norm = build_norm_layer(norm_cfg, embed_dims)[1] - else: - self.norm = None - - if input_size: - input_size = to_2tuple(input_size) - # `init_out_size` would be used outside to - # calculate the num_patches - # e.g. when `use_abs_pos_embed` outside - self.init_input_size = input_size - if self.adaptive_padding: - pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size) - input_h, input_w = input_size - input_h = input_h + pad_h - input_w = input_w + pad_w - input_size = (input_h, input_w) - - # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html - h_out = (input_size[0] + 2 * padding[0] - dilation[0] * - (kernel_size[0] - 1) - 1) // stride[0] + 1 - w_out = (input_size[1] + 2 * padding[1] - dilation[1] * - (kernel_size[1] - 1) - 1) // stride[1] + 1 - self.init_out_size = (h_out, w_out) - else: - self.init_input_size = None - self.init_out_size = None - - def forward(self, x): - """ - Args: - x (Tensor): Has shape (B, C, H, W). In most case, C is 3. 
- - Returns: - tuple: Contains merged results and its spatial shape. - - - x (Tensor): Has shape (B, out_h * out_w, embed_dims) - - out_size (tuple[int]): Spatial shape of x, arrange as - (out_h, out_w). - """ - - if self.adaptive_padding: - x = self.adaptive_padding(x) - - x = self.projection(x) - out_size = (x.shape[2], x.shape[3]) - x = x.flatten(2).transpose(1, 2) - if self.norm is not None: - x = self.norm(x) - return x, out_size - - -class PatchMerging(BaseModule): - """Merge patch feature map. - - This layer groups feature map by kernel_size, and applies norm and linear - layers to the grouped feature map ((used in Swin Transformer)). - Our implementation uses `nn.Unfold` to - merge patches, which is about 25% faster than the original - implementation. However, we need to modify pretrained - models for compatibility. - - Args: - in_channels (int): The num of input channels. - to gets fully covered by filter and stride you specified. - out_channels (int): The num of output channels. - kernel_size (int | tuple, optional): the kernel size in the unfold - layer. Defaults to 2. - stride (int | tuple, optional): the stride of the sliding blocks in the - unfold layer. Default: None. (Would be set as `kernel_size`) - padding (int | tuple | string ): The padding length of - embedding conv. When it is a string, it means the mode - of adaptive padding, support "same" and "corner" now. - Default: "corner". - dilation (int | tuple, optional): dilation parameter in the unfold - layer. Default: 1. - bias (bool, optional): Whether to add bias in linear layer or not. - Defaults: False. - norm_cfg (dict, optional): Config dict for normalization layer. - Default: dict(type='LN'). - init_cfg (dict, optional): The extra config for initialization. - Default: None. 
- """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=2, - stride=None, - padding='corner', - dilation=1, - bias=False, - norm_cfg=dict(type='LN'), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.in_channels = in_channels - self.out_channels = out_channels - if stride: - stride = stride - else: - stride = kernel_size - - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) - - if isinstance(padding, str): - self.adaptive_padding = AdaptivePadding( - kernel_size=kernel_size, - stride=stride, - dilation=dilation, - padding=padding) - # disable the padding of unfold - padding = 0 - else: - self.adaptive_padding = None - - padding = to_2tuple(padding) - self.sampler = nn.Unfold( - kernel_size=kernel_size, - dilation=dilation, - padding=padding, - stride=stride) - - sample_dim = kernel_size[0] * kernel_size[1] * in_channels - - if norm_cfg is not None: - self.norm = build_norm_layer(norm_cfg, sample_dim)[1] - else: - self.norm = None - - self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) - - def forward(self, x, input_size): - """ - Args: - x (Tensor): Has shape (B, H*W, C_in). - input_size (tuple[int]): The spatial shape of x, arrange as (H, W). - Default: None. - - Returns: - tuple: Contains merged results and its spatial shape. - - - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) - - out_size (tuple[int]): Spatial shape of x, arrange as - (Merged_H, Merged_W). - """ - B, L, C = x.shape - assert isinstance(input_size, Sequence), f'Expect ' \ - f'input_size is ' \ - f'`Sequence` ' \ - f'but get {input_size}' - - H, W = input_size - assert L == H * W, 'input feature has wrong size' - - x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W - - if self.adaptive_padding: - x = self.adaptive_padding(x) - H, W = x.shape[-2:] - - # Use nn.Unfold to merge patch. 
About 25% faster than original method, - # but need to modify pretrained model for compatibility - # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) - x = self.sampler(x) - - out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * - (self.sampler.kernel_size[0] - 1) - - 1) // self.sampler.stride[0] + 1 - out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * - (self.sampler.kernel_size[1] - 1) - - 1) // self.sampler.stride[1] + 1 - - output_size = (out_h, out_w) - x = x.transpose(1, 2) # B, H/2*W/2, 4*C - x = self.norm(x) if self.norm else x - x = self.reduction(x) - return x, output_size - - -@MODELS.register_module() +@ATTENTION.register_module() class MultiheadAttention(BaseModule): """A wrapper for ``torch.nn.MultiheadAttention``. @@ -436,13 +87,12 @@ class MultiheadAttention(BaseModule): init_cfg=None, batch_first=False, **kwargs): - super().__init__(init_cfg) + super(MultiheadAttention, self).__init__(init_cfg) if 'dropout' in kwargs: - warnings.warn( - 'The arguments `dropout` in MultiheadAttention ' - 'has been deprecated, now you can separately ' - 'set `attn_drop`(float), proj_drop(float), ' - 'and `dropout_layer`(dict) ', DeprecationWarning) + warnings.warn('The arguments `dropout` in MultiheadAttention ' + 'has been deprecated, now you can separately ' + 'set `attn_drop`(float), proj_drop(float), ' + 'and `dropout_layer`(dict) ') attn_drop = kwargs['dropout'] dropout_layer['drop_prob'] = kwargs.pop('dropout') @@ -504,9 +154,9 @@ class MultiheadAttention(BaseModule): Returns: Tensor: forwarded results with shape - [num_queries, bs, embed_dims] - if self.batch_first is False, else - [bs, num_queries embed_dims]. + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. 
""" if key is None: @@ -552,7 +202,7 @@ class MultiheadAttention(BaseModule): return identity + self.dropout_layer(self.proj_drop(out)) -@MODELS.register_module() +@FEEDFORWARD_NETWORK.register_module() class FFN(BaseModule): """Implements feed-forward networks (FFNs) with identity connection. @@ -573,8 +223,6 @@ class FFN(BaseModule): when adding the shortcut. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. - layer_scale_init_value (float): Initial value of scale factor in - LayerScale. Default: 1.0 """ @deprecated_api_warning( @@ -592,21 +240,23 @@ class FFN(BaseModule): dropout_layer=None, add_identity=True, init_cfg=None, - layer_scale_init_value=0.): - super().__init__(init_cfg) + **kwargs): + super(FFN, self).__init__(init_cfg) assert num_fcs >= 2, 'num_fcs should be no less ' \ f'than 2. got {num_fcs}.' self.embed_dims = embed_dims self.feedforward_channels = feedforward_channels self.num_fcs = num_fcs + self.act_cfg = act_cfg + self.activate = build_activation_layer(act_cfg) layers = [] in_channels = embed_dims for _ in range(num_fcs - 1): layers.append( Sequential( - Linear(in_channels, feedforward_channels), - build_activation_layer(act_cfg), nn.Dropout(ffn_drop))) + Linear(in_channels, feedforward_channels), self.activate, + nn.Dropout(ffn_drop))) in_channels = feedforward_channels layers.append(Linear(feedforward_channels, embed_dims)) layers.append(nn.Dropout(ffn_drop)) @@ -615,11 +265,6 @@ class FFN(BaseModule): dropout_layer) if dropout_layer else torch.nn.Identity() self.add_identity = add_identity - if layer_scale_init_value > 0: - self.gamma2 = LayerScale(embed_dims, scale=layer_scale_init_value) - else: - self.gamma2 = nn.Identity() - @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN') def forward(self, x, identity=None): """Forward function for `FFN`. @@ -627,7 +272,6 @@ class FFN(BaseModule): The function would add x to the output tensor if residue is None. 
""" out = self.layers(x) - out = self.gamma2(out) if not self.add_identity: return self.dropout_layer(out) if identity is None: @@ -635,7 +279,7 @@ class FFN(BaseModule): return identity + self.dropout_layer(out) -@MODELS.register_module() +@TRANSFORMER_LAYER.register_module() class BaseTransformerLayer(BaseModule): """Base `TransformerLayer` for vision transformer. @@ -698,15 +342,15 @@ class BaseTransformerLayer(BaseModule): f'The arguments `{ori_name}` in BaseTransformerLayer ' f'has been deprecated, now you should set `{new_name}` ' f'and other FFN related arguments ' - f'to a dict named `ffn_cfgs`. ', DeprecationWarning) + f'to a dict named `ffn_cfgs`. ') ffn_cfgs[new_name] = kwargs[ori_name] - super().__init__(init_cfg) + super(BaseTransformerLayer, self).__init__(init_cfg) self.batch_first = batch_first - assert set(operation_order) & { - 'self_attn', 'norm', 'ffn', 'cross_attn'} == \ + assert set(operation_order) & set( + ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ set(operation_order), f'The operation_order of' \ f' {self.__class__.__name__} should ' \ f'contains all four operation type ' \ @@ -753,7 +397,7 @@ class BaseTransformerLayer(BaseModule): assert len(ffn_cfgs) == num_ffns for ffn_index in range(num_ffns): if 'embed_dims' not in ffn_cfgs[ffn_index]: - ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims + ffn_cfgs['embed_dims'] = self.embed_dims else: assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims self.ffns.append( @@ -866,7 +510,7 @@ class BaseTransformerLayer(BaseModule): return query -@MODELS.register_module() +@TRANSFORMER_LAYER_SEQUENCE.register_module() class TransformerLayerSequence(BaseModule): """Base class for TransformerEncoder and TransformerDecoder in vision transformer. 
@@ -887,7 +531,7 @@ class TransformerLayerSequence(BaseModule): """ def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): - super().__init__(init_cfg) + super(TransformerLayerSequence, self).__init__(init_cfg) if isinstance(transformerlayers, dict): transformerlayers = [ copy.deepcopy(transformerlayers) for _ in range(num_layers) diff --git a/mmcv/cnn/bricks/upsample.py b/mmcv/cnn/bricks/upsample.py index d91689a..a1a3537 100644 --- a/mmcv/cnn/bricks/upsample.py +++ b/mmcv/cnn/bricks/upsample.py @@ -1,17 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict - -import torch import torch.nn as nn import torch.nn.functional as F -from mmengine.model import xavier_init -from mmengine.registry import MODELS -MODELS.register_module('nearest', module=nn.Upsample) -MODELS.register_module('bilinear', module=nn.Upsample) +from ..utils import xavier_init +from .registry import UPSAMPLE_LAYERS + +UPSAMPLE_LAYERS.register_module('nearest', module=nn.Upsample) +UPSAMPLE_LAYERS.register_module('bilinear', module=nn.Upsample) -@MODELS.register_module(name='pixel_shuffle') +@UPSAMPLE_LAYERS.register_module(name='pixel_shuffle') class PixelShufflePack(nn.Module): """Pixel Shuffle upsample layer. @@ -26,9 +24,9 @@ class PixelShufflePack(nn.Module): channels. 
""" - def __init__(self, in_channels: int, out_channels: int, scale_factor: int, - upsample_kernel: int): - super().__init__() + def __init__(self, in_channels, out_channels, scale_factor, + upsample_kernel): + super(PixelShufflePack, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.scale_factor = scale_factor @@ -43,13 +41,13 @@ class PixelShufflePack(nn.Module): def init_weights(self): xavier_init(self.upsample_conv, distribution='uniform') - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): x = self.upsample_conv(x) x = F.pixel_shuffle(x, self.scale_factor) return x -def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module: +def build_upsample_layer(cfg, *args, **kwargs): """Build upsample layer. Args: @@ -57,7 +55,7 @@ def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module: - type (str): Layer type. - scale_factor (int): Upsample ratio, which is not applicable to - deconv. + deconv. - layer args: Args needed to instantiate a upsample layer. args (argument list): Arguments passed to the ``__init__`` method of the corresponding conv layer. @@ -75,15 +73,11 @@ def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module: cfg_ = cfg.copy() layer_type = cfg_.pop('type') + if layer_type not in UPSAMPLE_LAYERS: + raise KeyError(f'Unrecognized upsample type {layer_type}') + else: + upsample = UPSAMPLE_LAYERS.get(layer_type) - # Switch registry to the target scope. If `upsample` cannot be found - # in the registry, fallback to search `upsample` in the - # mmengine.MODELS. 
- with MODELS.switch_scope_and_registry(None) as registry: - upsample = registry.get(layer_type) - if upsample is None: - raise KeyError(f'Cannot find {upsample} in registry under scope ' - f'name {registry.scope}') if upsample is nn.Upsample: cfg_['mode'] = layer_type layer = upsample(*args, **kwargs, **cfg_) diff --git a/mmcv/cnn/bricks/wrappers.py b/mmcv/cnn/bricks/wrappers.py index 07eb04e..8aebf67 100644 --- a/mmcv/cnn/bricks/wrappers.py +++ b/mmcv/cnn/bricks/wrappers.py @@ -9,9 +9,10 @@ import math import torch import torch.nn as nn -from mmengine.registry import MODELS from torch.nn.modules.utils import _pair, _triple +from .registry import CONV_LAYERS, UPSAMPLE_LAYERS + if torch.__version__ == 'parrots': TORCH_VERSION = torch.__version__ else: @@ -20,27 +21,27 @@ else: TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) -def obsolete_torch_version(torch_version, version_threshold) -> bool: +def obsolete_torch_version(torch_version, version_threshold): return torch_version == 'parrots' or torch_version <= version_threshold class NewEmptyTensorOp(torch.autograd.Function): @staticmethod - def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor: + def forward(ctx, x, new_shape): ctx.shape = x.shape return x.new_empty(new_shape) @staticmethod - def backward(ctx, grad: torch.Tensor) -> tuple: + def backward(ctx, grad): shape = ctx.shape return NewEmptyTensorOp.apply(grad, shape), None -@MODELS.register_module('Conv', force=True) +@CONV_LAYERS.register_module('Conv', force=True) class Conv2d(nn.Conv2d): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, @@ -58,10 +59,10 @@ class Conv2d(nn.Conv2d): return super().forward(x) -@MODELS.register_module('Conv3d', force=True) +@CONV_LAYERS.register_module('Conv3d', force=True) class 
Conv3d(nn.Conv3d): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size, @@ -79,11 +80,12 @@ class Conv3d(nn.Conv3d): return super().forward(x) -@MODELS.register_module() -@MODELS.register_module('deconv') +@CONV_LAYERS.register_module() +@CONV_LAYERS.register_module('deconv') +@UPSAMPLE_LAYERS.register_module('deconv', force=True) class ConvTranspose2d(nn.ConvTranspose2d): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, @@ -101,11 +103,12 @@ class ConvTranspose2d(nn.ConvTranspose2d): return super().forward(x) -@MODELS.register_module() -@MODELS.register_module('deconv3d') +@CONV_LAYERS.register_module() +@CONV_LAYERS.register_module('deconv3d') +@UPSAMPLE_LAYERS.register_module('deconv3d', force=True) class ConvTranspose3d(nn.ConvTranspose3d): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size, @@ -125,7 +128,7 @@ class ConvTranspose3d(nn.ConvTranspose3d): class MaxPool2d(nn.MaxPool2d): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): # PyTorch 1.9 does not support empty tensor inference yet if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): out_shape = list(x.shape[:2]) @@ -143,7 +146,7 @@ class MaxPool2d(nn.MaxPool2d): class MaxPool3d(nn.MaxPool3d): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): # PyTorch 1.9 does not support empty tensor inference yet if x.numel() == 0 and 
obsolete_torch_version(TORCH_VERSION, (1, 9)): out_shape = list(x.shape[:2]) @@ -162,7 +165,7 @@ class MaxPool3d(nn.MaxPool3d): class Linear(torch.nn.Linear): - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): # empty tensor forward of Linear layer is supported in Pytorch 1.6 if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)): out_shape = [x.shape[0], self.out_features] diff --git a/mmcv/cnn/builder.py b/mmcv/cnn/builder.py new file mode 100644 index 0000000..7567316 --- /dev/null +++ b/mmcv/cnn/builder.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ..runner import Sequential +from ..utils import Registry, build_from_cfg + + +def build_model_from_cfg(cfg, registry, default_args=None): + """Build a PyTorch model from config dict(s). Different from + ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built. + + Args: + cfg (dict, list[dict]): The config of modules, is is either a config + dict or a list of config dicts. If cfg is a list, a + the built modules will be wrapped with ``nn.Sequential``. + registry (:obj:`Registry`): A registry the module belongs to. + default_args (dict, optional): Default arguments to build the module. + Defaults to None. + + Returns: + nn.Module: A built nn module. + """ + if isinstance(cfg, list): + modules = [ + build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg + ] + return Sequential(*modules) + else: + return build_from_cfg(cfg, registry, default_args) + + +MODELS = Registry('model', build_func=build_model_from_cfg) diff --git a/mmcv/cnn/resnet.py b/mmcv/cnn/resnet.py index 8fc6abf..1cb3ac0 100644 --- a/mmcv/cnn/resnet.py +++ b/mmcv/cnn/resnet.py @@ -1,18 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import logging -from typing import Optional, Sequence, Tuple, Union import torch.nn as nn import torch.utils.checkpoint as cp -from mmengine.model import constant_init, kaiming_init -from mmengine.runner import load_checkpoint -from torch import Tensor +from .utils import constant_init, kaiming_init -def conv3x3(in_planes: int, - out_planes: int, - stride: int = 1, - dilation: int = 1): + +def conv3x3(in_planes, out_planes, stride=1, dilation=1): """3x3 convolution with padding.""" return nn.Conv2d( in_planes, @@ -28,14 +23,14 @@ class BasicBlock(nn.Module): expansion = 1 def __init__(self, - inplanes: int, - planes: int, - stride: int = 1, - dilation: int = 1, - downsample: Optional[nn.Module] = None, - style: str = 'pytorch', - with_cp: bool = False): - super().__init__() + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False): + super(BasicBlock, self).__init__() assert style in ['pytorch', 'caffe'] self.conv1 = conv3x3(inplanes, planes, stride, dilation) self.bn1 = nn.BatchNorm2d(planes) @@ -47,7 +42,7 @@ class BasicBlock(nn.Module): self.dilation = dilation assert not with_cp - def forward(self, x: Tensor) -> Tensor: + def forward(self, x): residual = x out = self.conv1(x) @@ -70,19 +65,19 @@ class Bottleneck(nn.Module): expansion = 4 def __init__(self, - inplanes: int, - planes: int, - stride: int = 1, - dilation: int = 1, - downsample: Optional[nn.Module] = None, - style: str = 'pytorch', - with_cp: bool = False): + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False): """Bottleneck block. If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is "caffe", the stride-two layer is the first 1x1 conv layer. 
""" - super().__init__() + super(Bottleneck, self).__init__() assert style in ['pytorch', 'caffe'] if style == 'pytorch': conv1_stride = 1 @@ -112,7 +107,7 @@ class Bottleneck(nn.Module): self.dilation = dilation self.with_cp = with_cp - def forward(self, x: Tensor) -> Tensor: + def forward(self, x): def _inner_forward(x): residual = x @@ -145,14 +140,14 @@ class Bottleneck(nn.Module): return out -def make_res_layer(block: nn.Module, - inplanes: int, - planes: int, - blocks: int, - stride: int = 1, - dilation: int = 1, - style: str = 'pytorch', - with_cp: bool = False) -> nn.Module: +def make_res_layer(block, + inplanes, + planes, + blocks, + stride=1, + dilation=1, + style='pytorch', + with_cp=False): downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = nn.Sequential( @@ -213,22 +208,22 @@ class ResNet(nn.Module): } def __init__(self, - depth: int, - num_stages: int = 4, - strides: Sequence[int] = (1, 2, 2, 2), - dilations: Sequence[int] = (1, 1, 1, 1), - out_indices: Sequence[int] = (0, 1, 2, 3), - style: str = 'pytorch', - frozen_stages: int = -1, - bn_eval: bool = True, - bn_frozen: bool = False, - with_cp: bool = False): - super().__init__() + depth, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + frozen_stages=-1, + bn_eval=True, + bn_frozen=False, + with_cp=False): + super(ResNet, self).__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') assert num_stages >= 1 and num_stages <= 4 block, stage_blocks = self.arch_settings[depth] - stage_blocks = stage_blocks[:num_stages] # type: ignore + stage_blocks = stage_blocks[:num_stages] assert len(strides) == len(dilations) == num_stages assert max(out_indices) < num_stages @@ -239,7 +234,7 @@ class ResNet(nn.Module): self.bn_frozen = bn_frozen self.with_cp = with_cp - self.inplanes: int = 64 + self.inplanes = 64 self.conv1 = nn.Conv2d( 3, 64, kernel_size=7, stride=2, 
padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) @@ -260,17 +255,17 @@ class ResNet(nn.Module): dilation=dilation, style=self.style, with_cp=with_cp) - self.inplanes = planes * block.expansion # type: ignore + self.inplanes = planes * block.expansion layer_name = f'layer{i + 1}' self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) - self.feat_dim = block.expansion * 64 * 2**( # type: ignore - len(stage_blocks) - 1) + self.feat_dim = block.expansion * 64 * 2**(len(stage_blocks) - 1) - def init_weights(self, pretrained: Optional[str] = None) -> None: + def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() + from ..runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): @@ -281,7 +276,7 @@ class ResNet(nn.Module): else: raise TypeError('pretrained must be a str or None') - def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]: + def forward(self, x): x = self.conv1(x) x = self.bn1(x) x = self.relu(x) @@ -297,8 +292,8 @@ class ResNet(nn.Module): else: return tuple(outs) - def train(self, mode: bool = True) -> None: - super().train(mode) + def train(self, mode=True): + super(ResNet, self).train(mode) if self.bn_eval: for m in self.modules(): if isinstance(m, nn.BatchNorm2d): diff --git a/mmcv/cnn/rfsearch/__init__.py b/mmcv/cnn/rfsearch/__init__.py deleted file mode 100644 index 04d4572..0000000 --- a/mmcv/cnn/rfsearch/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .operator import BaseConvRFSearchOp, Conv2dRFSearchOp -from .search import RFSearchHook - -__all__ = ['BaseConvRFSearchOp', 'Conv2dRFSearchOp', 'RFSearchHook'] diff --git a/mmcv/cnn/rfsearch/operator.py b/mmcv/cnn/rfsearch/operator.py deleted file mode 100644 index 2fa45ab..0000000 --- a/mmcv/cnn/rfsearch/operator.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import copy - -import numpy as np -import torch -import torch.nn as nn -from mmengine.logging import print_log -from mmengine.model import BaseModule -from torch import Tensor - -from .utils import expand_rates, get_single_padding - - -class BaseConvRFSearchOp(BaseModule): - """Based class of ConvRFSearchOp. - - Args: - op_layer (nn.Module): pytorch module, e,g, Conv2d - global_config (dict): config dict. - """ - - def __init__(self, op_layer: nn.Module, global_config: dict): - super().__init__() - self.op_layer = op_layer - self.global_config = global_config - - def normlize(self, weights: nn.Parameter) -> nn.Parameter: - """Normalize weights. - - Args: - weights (nn.Parameter): Weights to be normalized. - - Returns: - nn.Parameters: Normalized weights. - """ - abs_weights = torch.abs(weights) - normalized_weights = abs_weights / torch.sum(abs_weights) - return normalized_weights - - -class Conv2dRFSearchOp(BaseConvRFSearchOp): - """Enable Conv2d with receptive field searching ability. - - Args: - op_layer (nn.Module): pytorch module, e,g, Conv2d - global_config (dict): config dict. Defaults to None. - By default this must include: - - - "init_alphas": The value for initializing weights of each branch. - - "num_branches": The controller of the size of - search space (the number of branches). - - "exp_rate": The controller of the sparsity of search space. - - "mmin": The minimum dilation rate. - - "mmax": The maximum dilation rate. - - Extra keys may exist, but are used by RFSearchHook, e.g., "step", - "max_step", "search_interval", and "skip_layer". - verbose (bool): Determines whether to print rf-next - related logging messages. - Defaults to True. 
- """ - - def __init__(self, - op_layer: nn.Module, - global_config: dict, - verbose: bool = True): - super().__init__(op_layer, global_config) - assert global_config is not None, 'global_config is None' - self.num_branches = global_config['num_branches'] - assert self.num_branches in [2, 3] - self.verbose = verbose - init_dilation = op_layer.dilation - self.dilation_rates = expand_rates(init_dilation, global_config) - if self.op_layer.kernel_size[ - 0] == 1 or self.op_layer.kernel_size[0] % 2 == 0: - self.dilation_rates = [(op_layer.dilation[0], r[1]) - for r in self.dilation_rates] - if self.op_layer.kernel_size[ - 1] == 1 or self.op_layer.kernel_size[1] % 2 == 0: - self.dilation_rates = [(r[0], op_layer.dilation[1]) - for r in self.dilation_rates] - - self.branch_weights = nn.Parameter(torch.Tensor(self.num_branches)) - if self.verbose: - print_log(f'Expand as {self.dilation_rates}', 'current') - nn.init.constant_(self.branch_weights, global_config['init_alphas']) - - def forward(self, input: Tensor) -> Tensor: - norm_w = self.normlize(self.branch_weights[:len(self.dilation_rates)]) - if len(self.dilation_rates) == 1: - outputs = [ - nn.functional.conv2d( - input, - weight=self.op_layer.weight, - bias=self.op_layer.bias, - stride=self.op_layer.stride, - padding=self.get_padding(self.dilation_rates[0]), - dilation=self.dilation_rates[0], - groups=self.op_layer.groups, - ) - ] - else: - outputs = [ - nn.functional.conv2d( - input, - weight=self.op_layer.weight, - bias=self.op_layer.bias, - stride=self.op_layer.stride, - padding=self.get_padding(r), - dilation=r, - groups=self.op_layer.groups, - ) * norm_w[i] for i, r in enumerate(self.dilation_rates) - ] - output = outputs[0] - for i in range(1, len(self.dilation_rates)): - output += outputs[i] - return output - - def estimate_rates(self) -> None: - """Estimate new dilation rate based on trained branch_weights.""" - norm_w = self.normlize(self.branch_weights[:len(self.dilation_rates)]) - if self.verbose: - 
print_log( - 'Estimate dilation {} with weight {}.'.format( - self.dilation_rates, - norm_w.detach().cpu().numpy().tolist()), 'current') - - sum0, sum1, w_sum = 0, 0, 0 - for i in range(len(self.dilation_rates)): - sum0 += norm_w[i].item() * self.dilation_rates[i][0] - sum1 += norm_w[i].item() * self.dilation_rates[i][1] - w_sum += norm_w[i].item() - estimated = [ - np.clip( - int(round(sum0 / w_sum)), self.global_config['mmin'], - self.global_config['mmax']).item(), - np.clip( - int(round(sum1 / w_sum)), self.global_config['mmin'], - self.global_config['mmax']).item() - ] - self.op_layer.dilation = tuple(estimated) - self.op_layer.padding = self.get_padding(self.op_layer.dilation) - self.dilation_rates = [tuple(estimated)] - if self.verbose: - print_log(f'Estimate as {tuple(estimated)}', 'current') - - def expand_rates(self) -> None: - """Expand dilation rate.""" - dilation = self.op_layer.dilation - dilation_rates = expand_rates(dilation, self.global_config) - if self.op_layer.kernel_size[ - 0] == 1 or self.op_layer.kernel_size[0] % 2 == 0: - dilation_rates = [(dilation[0], r[1]) for r in dilation_rates] - if self.op_layer.kernel_size[ - 1] == 1 or self.op_layer.kernel_size[1] % 2 == 0: - dilation_rates = [(r[0], dilation[1]) for r in dilation_rates] - - self.dilation_rates = copy.deepcopy(dilation_rates) - if self.verbose: - print_log(f'Expand as {self.dilation_rates}', 'current') - nn.init.constant_(self.branch_weights, - self.global_config['init_alphas']) - - def get_padding(self, dilation) -> tuple: - padding = (get_single_padding(self.op_layer.kernel_size[0], - self.op_layer.stride[0], dilation[0]), - get_single_padding(self.op_layer.kernel_size[1], - self.op_layer.stride[1], dilation[1])) - return padding diff --git a/mmcv/cnn/rfsearch/search.py b/mmcv/cnn/rfsearch/search.py deleted file mode 100644 index f4add4b..0000000 --- a/mmcv/cnn/rfsearch/search.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import os -from typing import Dict, Optional - -import mmengine -import torch # noqa -import torch.nn as nn -from mmengine.hooks import Hook -from mmengine.logging import print_log -from mmengine.registry import HOOKS - -from .operator import BaseConvRFSearchOp, Conv2dRFSearchOp # noqa -from .utils import get_single_padding, write_to_json - - -@HOOKS.register_module() -class RFSearchHook(Hook): - """Rcecptive field search via dilation rates. - - Please refer to `RF-Next: Efficient Receptive Field - Search for Convolutional Neural Networks - `_ for more details. - - - Args: - mode (str, optional): It can be set to the following types: - 'search', 'fixed_single_branch', or 'fixed_multi_branch'. - Defaults to 'search'. - config (Dict, optional): config dict of search. - By default this config contains "search", - and config["search"] must include: - - - "step": recording the current searching step. - - "max_step": The maximum number of searching steps - to update the structures. - - "search_interval": The interval (epoch/iteration) - between two updates. - - "exp_rate": The controller of the sparsity of search space. - - "init_alphas": The value for initializing weights of each branch. - - "mmin": The minimum dilation rate. - - "mmax": The maximum dilation rate. - - "num_branches": The controller of the size of - search space (the number of branches). - - "skip_layer": The modules in skip_layer will be ignored - during the receptive field search. - rfstructure_file (str, optional): Path to load searched receptive - fields of the model. Defaults to None. - by_epoch (bool, optional): Determine to perform step by epoch or - by iteration. If set to True, it will step by epoch. Otherwise, by - iteration. Defaults to True. - verbose (bool): Determines whether to print rf-next related logging - messages. Defaults to True. 
- """ - - def __init__(self, - mode: str = 'search', - config: Dict = {}, - rfstructure_file: Optional[str] = None, - by_epoch: bool = True, - verbose: bool = True): - assert mode in ['search', 'fixed_single_branch', 'fixed_multi_branch'] - assert config is not None - self.config = config - self.config['structure'] = {} - self.verbose = verbose - if rfstructure_file is not None: - rfstructure = mmengine.load(rfstructure_file)['structure'] - self.config['structure'] = rfstructure - self.mode = mode - self.num_branches = self.config['search']['num_branches'] - self.by_epoch = by_epoch - - def init_model(self, model: nn.Module): - """init model with search ability. - - Args: - model (nn.Module): pytorch model - - Raises: - NotImplementedError: only support three modes: - search/fixed_single_branch/fixed_multi_branch - """ - if self.verbose: - print_log('RFSearch init begin.', 'current') - if self.mode == 'search': - if self.config['structure']: - self.set_model(model, search_op='Conv2d') - self.wrap_model(model, search_op='Conv2d') - elif self.mode == 'fixed_single_branch': - self.set_model(model, search_op='Conv2d') - elif self.mode == 'fixed_multi_branch': - self.set_model(model, search_op='Conv2d') - self.wrap_model(model, search_op='Conv2d') - else: - raise NotImplementedError - if self.verbose: - print_log('RFSearch init end.', 'current') - - def after_train_epoch(self, runner): - """Performs a dilation searching step after one training epoch.""" - if self.by_epoch and self.mode == 'search': - self.step(runner.model, runner.work_dir) - - def after_train_iter(self, runner, batch_idx, data_batch, outputs): - """Performs a dilation searching step after one training iteration.""" - if not self.by_epoch and self.mode == 'search': - self.step(runner.model, runner.work_dir) - - def step(self, model: nn.Module, work_dir: str) -> None: - """Performs a dilation searching step. 
- - Args: - model (nn.Module): pytorch model - work_dir (str): Directory to save the searching results. - """ - self.config['search']['step'] += 1 - if (self.config['search']['step'] - ) % self.config['search']['search_interval'] == 0 and (self.config[ - 'search']['step']) < self.config['search']['max_step']: - self.estimate_and_expand(model) - for name, module in model.named_modules(): - if isinstance(module, BaseConvRFSearchOp): - self.config['structure'][name] = module.op_layer.dilation - - write_to_json( - self.config, - os.path.join( - work_dir, - 'local_search_config_step%d.json' % - self.config['search']['step'], - ), - ) - - def estimate_and_expand(self, model: nn.Module) -> None: - """estimate and search for RFConvOp. - - Args: - model (nn.Module): pytorch model - """ - for module in model.modules(): - if isinstance(module, BaseConvRFSearchOp): - module.estimate_rates() - module.expand_rates() - - def wrap_model(self, - model: nn.Module, - search_op: str = 'Conv2d', - prefix: str = '') -> None: - """wrap model to support searchable conv op. - - Args: - model (nn.Module): pytorch model - search_op (str): The module that uses RF search. - Defaults to 'Conv2d'. - init_rates (int, optional): Set to other initial dilation rates. - Defaults to None. - prefix (str): Prefix for function recursion. Defaults to ''. - """ - op = 'torch.nn.' + search_op - for name, module in model.named_children(): - if prefix == '': - fullname = 'module.' + name - else: - fullname = prefix + '.' 
+ name - if self.config['search']['skip_layer'] is not None: - if any(layer in fullname - for layer in self.config['search']['skip_layer']): - continue - if isinstance(module, eval(op)): - if 1 < module.kernel_size[0] and \ - 0 != module.kernel_size[0] % 2 or \ - 1 < module.kernel_size[1] and \ - 0 != module.kernel_size[1] % 2: - moduleWrap = eval(search_op + 'RFSearchOp')( - module, self.config['search'], self.verbose) - moduleWrap = moduleWrap.to(module.weight.device) - if self.verbose: - print_log( - 'Wrap model %s to %s.' % - (str(module), str(moduleWrap)), 'current') - setattr(model, name, moduleWrap) - elif not isinstance(module, BaseConvRFSearchOp): - self.wrap_model(module, search_op, fullname) - - def set_model(self, - model: nn.Module, - search_op: str = 'Conv2d', - init_rates: Optional[int] = None, - prefix: str = '') -> None: - """set model based on config. - - Args: - model (nn.Module): pytorch model - config (Dict): config file - search_op (str): The module that uses RF search. - Defaults to 'Conv2d'. - init_rates (int, optional): Set to other initial dilation rates. - Defaults to None. - prefix (str): Prefix for function recursion. Defaults to ''. - """ - op = 'torch.nn.' + search_op - for name, module in model.named_children(): - if prefix == '': - fullname = 'module.' + name - else: - fullname = prefix + '.' 
+ name - if self.config['search']['skip_layer'] is not None: - if any(layer in fullname - for layer in self.config['search']['skip_layer']): - continue - if isinstance(module, eval(op)): - if 1 < module.kernel_size[0] and \ - 0 != module.kernel_size[0] % 2 or \ - 1 < module.kernel_size[1] and \ - 0 != module.kernel_size[1] % 2: - if isinstance(self.config['structure'][fullname], int): - self.config['structure'][fullname] = [ - self.config['structure'][fullname], - self.config['structure'][fullname] - ] - module.dilation = ( - self.config['structure'][fullname][0], - self.config['structure'][fullname][1], - ) - module.padding = ( - get_single_padding( - module.kernel_size[0], module.stride[0], - self.config['structure'][fullname][0]), - get_single_padding( - module.kernel_size[1], module.stride[1], - self.config['structure'][fullname][1])) - setattr(model, name, module) - if self.verbose: - print_log( - 'Set module %s dilation as: [%d %d]' % - (fullname, module.dilation[0], module.dilation[1]), - 'current') - elif not isinstance(module, BaseConvRFSearchOp): - self.set_model(module, search_op, init_rates, fullname) diff --git a/mmcv/cnn/rfsearch/utils.py b/mmcv/cnn/rfsearch/utils.py deleted file mode 100644 index 4c8168e..0000000 --- a/mmcv/cnn/rfsearch/utils.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import mmengine -import numpy as np - - -def write_to_json(config: dict, filename: str): - """save config to json file. - - Args: - config (dict): Config to be saved. - filename (str): Path to save config. - """ - - with open(filename, 'w', encoding='utf-8') as f: - mmengine.dump(config, f, file_format='json') - - -def expand_rates(dilation: tuple, config: dict) -> list: - """expand dilation rate according to config. 
- - Args: - dilation (int): _description_ - config (dict): config dict - - Returns: - list: list of expanded dilation rates - """ - exp_rate = config['exp_rate'] - - large_rates = [] - small_rates = [] - for _ in range(config['num_branches'] // 2): - large_rates.append( - tuple([ - np.clip( - int(round((1 + exp_rate) * dilation[0])), config['mmin'], - config['mmax']).item(), - np.clip( - int(round((1 + exp_rate) * dilation[1])), config['mmin'], - config['mmax']).item() - ])) - small_rates.append( - tuple([ - np.clip( - int(round((1 - exp_rate) * dilation[0])), config['mmin'], - config['mmax']).item(), - np.clip( - int(round((1 - exp_rate) * dilation[1])), config['mmin'], - config['mmax']).item() - ])) - - small_rates.reverse() - - if config['num_branches'] % 2 == 0: - rate_list = small_rates + large_rates - else: - rate_list = small_rates + [dilation] + large_rates - - unique_rate_list = list(set(rate_list)) - unique_rate_list.sort(key=rate_list.index) - return unique_rate_list - - -def get_single_padding(kernel_size: int, - stride: int = 1, - dilation: int = 1) -> int: - padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 - return padding diff --git a/mmcv/cnn/utils/__init__.py b/mmcv/cnn/utils/__init__.py index cdec939..a263e31 100644 --- a/mmcv/cnn/utils/__init__.py +++ b/mmcv/cnn/utils/__init__.py @@ -1,5 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .flops_counter import get_model_complexity_info from .fuse_conv_bn import fuse_conv_bn +from .sync_bn import revert_sync_batchnorm +from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit, + KaimingInit, NormalInit, PretrainedInit, + TruncNormalInit, UniformInit, XavierInit, + bias_init_with_prob, caffe2_xavier_init, + constant_init, initialize, kaiming_init, normal_init, + trunc_normal_init, uniform_init, xavier_init) -__all__ = ['get_model_complexity_info', 'fuse_conv_bn'] +__all__ = [ + 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init', + 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init', + 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize', + 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', + 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', + 'Caffe2XavierInit', 'revert_sync_batchnorm' +] diff --git a/mmcv/cnn/utils/flops_counter.py b/mmcv/cnn/utils/flops_counter.py index b09edbc..dceeb39 100644 --- a/mmcv/cnn/utils/flops_counter.py +++ b/mmcv/cnn/utils/flops_counter.py @@ -24,25 +24,22 @@ # SOFTWARE. import sys -import warnings from functools import partial -from typing import Any, Callable, Dict, Optional, TextIO, Tuple import numpy as np import torch import torch.nn as nn -from mmcv.cnn.bricks import (Conv2d, Conv3d, ConvTranspose2d, Linear, - MaxPool2d, MaxPool3d) +import mmcv -def get_model_complexity_info(model: nn.Module, - input_shape: tuple, - print_per_layer_stat: bool = True, - as_strings: bool = True, - input_constructor: Optional[Callable] = None, - flush: bool = False, - ost: TextIO = sys.stdout) -> tuple: +def get_model_complexity_info(model, + input_shape, + print_per_layer_stat=True, + as_strings=True, + input_constructor=None, + flush=False, + ost=sys.stdout): """Get complexity information of a model. 
This method can calculate FLOPs and parameter counts of a model with @@ -51,16 +48,16 @@ def get_model_complexity_info(model: nn.Module, Supported layers are listed as below: - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. - - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, - ``nn.LeakyReLU``, ``nn.ReLU6``. + - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, + ``nn.ReLU6``. - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, - ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, - ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, - ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, - ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. + ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, + ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, + ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, + ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, - ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, - ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. + ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, + ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. - Linear: ``nn.Linear``. - Deconvolution: ``nn.ConvTranspose2d``. - Upsample: ``nn.Upsample``. @@ -81,8 +78,8 @@ def get_model_complexity_info(model: nn.Module, Returns: tuple[float | str]: If ``as_strings`` is set to True, it will return - FLOPs and parameter counts in a string format. otherwise, it will - return those in a float number format. + FLOPs and parameter counts in a string format. otherwise, it will + return those in a float number format. 
""" assert type(input_shape) is tuple assert len(input_shape) >= 1 @@ -118,9 +115,7 @@ def get_model_complexity_info(model: nn.Module, return flops_count, params_count -def flops_to_string(flops: float, - units: Optional[str] = 'GFLOPs', - precision: int = 2) -> str: +def flops_to_string(flops, units='GFLOPs', precision=2): """Convert FLOPs number into a string. Note that Here we take a multiply-add counts as one FLOP. @@ -163,9 +158,7 @@ def flops_to_string(flops: float, return str(flops) + ' FLOPs' -def params_to_string(num_params: float, - units: Optional[str] = None, - precision: int = 2) -> str: +def params_to_string(num_params, units=None, precision=2): """Convert parameter number into a string. Args: @@ -202,13 +195,13 @@ def params_to_string(num_params: float, return str(num_params) -def print_model_with_flops(model: nn.Module, - total_flops: float, - total_params: float, - units: Optional[str] = 'GFLOPs', - precision: int = 3, - ost: TextIO = sys.stdout, - flush: bool = False) -> None: +def print_model_with_flops(model, + total_flops, + total_params, + units='GFLOPs', + precision=3, + ost=sys.stdout, + flush=False): """Print a model with FLOPs for each layer. Args: @@ -283,10 +276,10 @@ def print_model_with_flops(model: nn.Module, return ', '.join([ params_to_string( accumulated_num_params, units='M', precision=precision), - f'{accumulated_num_params / total_params:.3%} Params', + '{:.3%} Params'.format(accumulated_num_params / total_params), flops_to_string( accumulated_flops_cost, units=units, precision=precision), - f'{accumulated_flops_cost / total_flops:.3%} FLOPs', + '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops), self.original_extra_repr() ]) @@ -311,7 +304,7 @@ def print_model_with_flops(model: nn.Module, model.apply(del_extra_repr) -def get_model_parameters_number(model: nn.Module) -> float: +def get_model_parameters_number(model): """Calculate parameter number of a model. 
Args: @@ -324,16 +317,16 @@ def get_model_parameters_number(model: nn.Module) -> float: return num_params -def add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module: +def add_flops_counting_methods(net_main_module): # adding additional methods to the existing module object, # this is done this way so that each function has access to self object - net_main_module.start_flops_count = start_flops_count.__get__( # type: ignore # noqa E501 + net_main_module.start_flops_count = start_flops_count.__get__( net_main_module) - net_main_module.stop_flops_count = stop_flops_count.__get__( # type: ignore # noqa E501 + net_main_module.stop_flops_count = stop_flops_count.__get__( net_main_module) - net_main_module.reset_flops_count = reset_flops_count.__get__( # type: ignore # noqa E501 + net_main_module.reset_flops_count = reset_flops_count.__get__( net_main_module) - net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # type: ignore # noqa E501 + net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501 net_main_module) net_main_module.reset_flops_count() @@ -341,7 +334,7 @@ def add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module: return net_main_module -def compute_average_flops_cost(self) -> Tuple[float, float]: +def compute_average_flops_cost(self): """Compute average FLOPs cost. A method to compute average FLOPs cost, which will be available after @@ -359,7 +352,7 @@ def compute_average_flops_cost(self) -> Tuple[float, float]: return flops_sum / batches_count, params_sum -def start_flops_count(self) -> None: +def start_flops_count(self): """Activate the computation of mean flops consumption per image. A method to activate the computation of mean flops consumption per image. 
@@ -368,7 +361,7 @@ def start_flops_count(self) -> None: """ add_batch_counter_hook_function(self) - def add_flops_counter_hook_function(module: nn.Module) -> None: + def add_flops_counter_hook_function(module): if is_supported_instance(module): if hasattr(module, '__flops_handle__'): return @@ -382,7 +375,7 @@ def start_flops_count(self) -> None: self.apply(partial(add_flops_counter_hook_function)) -def stop_flops_count(self) -> None: +def stop_flops_count(self): """Stop computing the mean flops consumption per image. A method to stop computing the mean flops consumption per image, which will @@ -393,7 +386,7 @@ def stop_flops_count(self) -> None: self.apply(remove_flops_counter_hook_function) -def reset_flops_count(self) -> None: +def reset_flops_count(self): """Reset statistics computed so far. A method to Reset computed statistics, which will be available after @@ -404,13 +397,11 @@ def reset_flops_count(self) -> None: # ---- Internal functions -def empty_flops_counter_hook(module: nn.Module, input: tuple, - output: Any) -> None: +def empty_flops_counter_hook(module, input, output): module.__flops__ += 0 -def upsample_flops_counter_hook(module: nn.Module, input: tuple, - output: torch.Tensor) -> None: +def upsample_flops_counter_hook(module, input, output): output_size = output[0] batch_size = output_size.shape[0] output_elements_count = batch_size @@ -419,38 +410,39 @@ def upsample_flops_counter_hook(module: nn.Module, input: tuple, module.__flops__ += int(output_elements_count) -def relu_flops_counter_hook(module: nn.Module, input: tuple, - output: torch.Tensor) -> None: +def relu_flops_counter_hook(module, input, output): active_elements_count = output.numel() module.__flops__ += int(active_elements_count) -def linear_flops_counter_hook(module: nn.Module, input: tuple, - output: torch.Tensor) -> None: +def linear_flops_counter_hook(module, input, output): + input = input[0] output_last_dim = output.shape[ -1] # pytorch checks dimensions, so here we don't 
care much - module.__flops__ += int(np.prod(input[0].shape) * output_last_dim) + module.__flops__ += int(np.prod(input.shape) * output_last_dim) -def pool_flops_counter_hook(module: nn.Module, input: tuple, - output: torch.Tensor) -> None: - module.__flops__ += int(np.prod(input[0].shape)) +def pool_flops_counter_hook(module, input, output): + input = input[0] + module.__flops__ += int(np.prod(input.shape)) -def norm_flops_counter_hook(module: nn.Module, input: tuple, - output: torch.Tensor) -> None: - batch_flops = np.prod(input[0].shape) +def norm_flops_counter_hook(module, input, output): + input = input[0] + + batch_flops = np.prod(input.shape) if (getattr(module, 'affine', False) or getattr(module, 'elementwise_affine', False)): batch_flops *= 2 module.__flops__ += int(batch_flops) -def deconv_flops_counter_hook(conv_module: nn.Module, input: tuple, - output: torch.Tensor) -> None: +def deconv_flops_counter_hook(conv_module, input, output): # Can have multiple inputs, getting the first one - batch_size = input[0].shape[0] - input_height, input_width = input[0].shape[2:] + input = input[0] + + batch_size = input.shape[0] + input_height, input_width = input.shape[2:] kernel_height, kernel_width = conv_module.kernel_size in_channels = conv_module.in_channels @@ -466,16 +458,17 @@ def deconv_flops_counter_hook(conv_module: nn.Module, input: tuple, bias_flops = 0 if conv_module.bias is not None: output_height, output_width = output.shape[2:] - bias_flops = out_channels * batch_size * output_height * output_width + bias_flops = out_channels * batch_size * output_height * output_height overall_flops = overall_conv_flops + bias_flops conv_module.__flops__ += int(overall_flops) -def conv_flops_counter_hook(conv_module: nn.Module, input: tuple, - output: torch.Tensor) -> None: +def conv_flops_counter_hook(conv_module, input, output): # Can have multiple inputs, getting the first one - batch_size = input[0].shape[0] + input = input[0] + + batch_size = input.shape[0] 
output_dims = list(output.shape[2:]) kernel_dims = list(conv_module.kernel_size) @@ -502,23 +495,25 @@ def conv_flops_counter_hook(conv_module: nn.Module, input: tuple, conv_module.__flops__ += int(overall_flops) -def batch_counter_hook(module: nn.Module, input: tuple, output: Any) -> None: +def batch_counter_hook(module, input, output): batch_size = 1 if len(input) > 0: # Can have multiple inputs, getting the first one - batch_size = len(input[0]) + input = input[0] + batch_size = len(input) else: - warnings.warn('No positional inputs found for a module, ' - 'assuming batch size is 1.') + pass + print('Warning! No positional inputs found for a module, ' + 'assuming batch size is 1.') module.__batch_counter__ += batch_size -def add_batch_counter_variables_or_reset(module: nn.Module) -> None: +def add_batch_counter_variables_or_reset(module): module.__batch_counter__ = 0 -def add_batch_counter_hook_function(module: nn.Module) -> None: +def add_batch_counter_hook_function(module): if hasattr(module, '__batch_counter_handle__'): return @@ -526,43 +521,43 @@ def add_batch_counter_hook_function(module: nn.Module) -> None: module.__batch_counter_handle__ = handle -def remove_batch_counter_hook_function(module: nn.Module) -> None: +def remove_batch_counter_hook_function(module): if hasattr(module, '__batch_counter_handle__'): module.__batch_counter_handle__.remove() del module.__batch_counter_handle__ -def add_flops_counter_variable_or_reset(module: nn.Module) -> None: +def add_flops_counter_variable_or_reset(module): if is_supported_instance(module): if hasattr(module, '__flops__') or hasattr(module, '__params__'): - warnings.warn('variables __flops__ or __params__ are already ' - 'defined for the module' + type(module).__name__ + - ' ptflops can affect your code!') + print('Warning: variables __flops__ or __params__ are already ' + 'defined for the module' + type(module).__name__ + + ' ptflops can affect your code!') module.__flops__ = 0 module.__params__ = 
get_model_parameters_number(module) -def is_supported_instance(module: nn.Module) -> bool: +def is_supported_instance(module): if type(module) in get_modules_mapping(): return True return False -def remove_flops_counter_hook_function(module: nn.Module) -> None: +def remove_flops_counter_hook_function(module): if is_supported_instance(module): if hasattr(module, '__flops_handle__'): module.__flops_handle__.remove() del module.__flops_handle__ -def get_modules_mapping() -> Dict: +def get_modules_mapping(): return { # convolutions nn.Conv1d: conv_flops_counter_hook, nn.Conv2d: conv_flops_counter_hook, - Conv2d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook, nn.Conv3d: conv_flops_counter_hook, - Conv3d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook, # activations nn.ReLU: relu_flops_counter_hook, nn.PReLU: relu_flops_counter_hook, @@ -574,9 +569,9 @@ def get_modules_mapping() -> Dict: nn.AvgPool1d: pool_flops_counter_hook, nn.AvgPool2d: pool_flops_counter_hook, nn.MaxPool2d: pool_flops_counter_hook, - MaxPool2d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool2d: pool_flops_counter_hook, nn.MaxPool3d: pool_flops_counter_hook, - MaxPool3d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook, nn.AvgPool3d: pool_flops_counter_hook, nn.AdaptiveMaxPool1d: pool_flops_counter_hook, nn.AdaptiveAvgPool1d: pool_flops_counter_hook, @@ -595,10 +590,10 @@ def get_modules_mapping() -> Dict: nn.LayerNorm: norm_flops_counter_hook, # FC nn.Linear: linear_flops_counter_hook, - Linear: linear_flops_counter_hook, + mmcv.cnn.bricks.Linear: linear_flops_counter_hook, # Upscale nn.Upsample: upsample_flops_counter_hook, # Deconvolution nn.ConvTranspose2d: deconv_flops_counter_hook, - ConvTranspose2d: deconv_flops_counter_hook, + mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook, } diff --git a/mmcv/cnn/utils/fuse_conv_bn.py b/mmcv/cnn/utils/fuse_conv_bn.py index 6ccaab3..cb7076f 100644 --- 
a/mmcv/cnn/utils/fuse_conv_bn.py +++ b/mmcv/cnn/utils/fuse_conv_bn.py @@ -3,7 +3,7 @@ import torch import torch.nn as nn -def _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module: +def _fuse_conv_bn(conv, bn): """Fuse conv and bn into one module. Args: @@ -24,7 +24,7 @@ def _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module: return conv -def fuse_conv_bn(module: nn.Module) -> nn.Module: +def fuse_conv_bn(module): """Recursively fuse conv and bn in a module. During inference, the functionary of batch norm layers is turned off diff --git a/mmcv/cnn/utils/sync_bn.py b/mmcv/cnn/utils/sync_bn.py new file mode 100644 index 0000000..8a79ff4 --- /dev/null +++ b/mmcv/cnn/utils/sync_bn.py @@ -0,0 +1,59 @@ +import torch + +import mmcv + + +class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): + """A general BatchNorm layer without input dimension check. + + Reproduced from @kapily's work: + (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) + The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc + is `_check_input_dim` that is designed for tensor sanity checks. + The check has been bypassed in this class for the convenience of converting + SyncBatchNorm. + """ + + def _check_input_dim(self, input): + return + + +def revert_sync_batchnorm(module): + """Helper function to convert all `SyncBatchNorm` (SyncBN) and + `mmcv.ops.sync_bn.SyncBatchNorm`(MMSyncBN) layers in the model to + `BatchNormXd` layers. + + Adapted from @kapily's work: + (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) + + Args: + module (nn.Module): The module containing `SyncBatchNorm` layers. + + Returns: + module_output: The converted module with `BatchNormXd` layers. 
+ """ + module_output = module + module_checklist = [torch.nn.modules.batchnorm.SyncBatchNorm] + if hasattr(mmcv, 'ops'): + module_checklist.append(mmcv.ops.SyncBatchNorm) + if isinstance(module, tuple(module_checklist)): + module_output = _BatchNormXd(module.num_features, module.eps, + module.momentum, module.affine, + module.track_running_stats) + if module.affine: + # no_grad() may not be needed here but + # just to be consistent with `convert_sync_batchnorm()` + with torch.no_grad(): + module_output.weight = module.weight + module_output.bias = module.bias + module_output.running_mean = module.running_mean + module_output.running_var = module.running_var + module_output.num_batches_tracked = module.num_batches_tracked + module_output.training = module.training + # qconfig exists in quantized models + if hasattr(module, 'qconfig'): + module_output.qconfig = module.qconfig + for name, child in module.named_children(): + module_output.add_module(name, revert_sync_batchnorm(child)) + del module + return module_output diff --git a/mmcv/cnn/utils/weight_init.py b/mmcv/cnn/utils/weight_init.py new file mode 100644 index 0000000..e1ac999 --- /dev/null +++ b/mmcv/cnn/utils/weight_init.py @@ -0,0 +1,684 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +import warnings + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor + +from mmcv.utils import Registry, build_from_cfg, get_logger, print_log + +INITIALIZERS = Registry('initializer') + + +def update_init_info(module, init_info): + """Update the `_params_init_info` in the module if the value of parameters + are changed. + + Args: + module (obj:`nn.Module`): The module of PyTorch with a user-defined + attribute `_params_init_info` which records the initialization + information. + init_info (str): The string that describes the initialization. 
+ """ + assert hasattr( + module, + '_params_init_info'), f'Can not find `_params_init_info` in {module}' + for name, param in module.named_parameters(): + + assert param in module._params_init_info, ( + f'Find a new :obj:`Parameter` ' + f'named `{name}` during executing the ' + f'`init_weights` of ' + f'`{module.__class__.__name__}`. ' + f'Please do not add or ' + f'replace parameters during executing ' + f'the `init_weights`. ') + + # The parameter has been changed during executing the + # `init_weights` of module + mean_value = param.data.mean() + if module._params_init_info[param]['tmp_mean_value'] != mean_value: + module._params_init_info[param]['init_info'] = init_info + module._params_init_info[param]['tmp_mean_value'] = mean_value + + +def constant_init(module, val, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def xavier_init(module, gain=1, bias=0, distribution='normal'): + assert distribution in ['uniform', 'normal'] + if hasattr(module, 'weight') and module.weight is not None: + if distribution == 'uniform': + nn.init.xavier_uniform_(module.weight, gain=gain) + else: + nn.init.xavier_normal_(module.weight, gain=gain) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def normal_init(module, mean=0, std=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def trunc_normal_init(module: nn.Module, + mean: float = 0, + std: float = 1, + a: float = -2, + b: float = 2, + bias: float = 0) -> None: + if hasattr(module, 'weight') and module.weight is not None: + trunc_normal_(module.weight, mean, std, a, b) # type: ignore + if hasattr(module, 'bias') and module.bias is not None: + 
nn.init.constant_(module.bias, bias) # type: ignore + + +def uniform_init(module, a=0, b=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.uniform_(module.weight, a, b) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def kaiming_init(module, + a=0, + mode='fan_out', + nonlinearity='relu', + bias=0, + distribution='normal'): + assert distribution in ['uniform', 'normal'] + if hasattr(module, 'weight') and module.weight is not None: + if distribution == 'uniform': + nn.init.kaiming_uniform_( + module.weight, a=a, mode=mode, nonlinearity=nonlinearity) + else: + nn.init.kaiming_normal_( + module.weight, a=a, mode=mode, nonlinearity=nonlinearity) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def caffe2_xavier_init(module, bias=0): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + kaiming_init( + module, + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + bias=bias, + distribution='uniform') + + +def bias_init_with_prob(prior_prob): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +def _get_bases_name(m): + return [b.__name__ for b in m.__class__.__bases__] + + +class BaseInit(object): + + def __init__(self, *, bias=0, bias_prob=None, layer=None): + self.wholemodule = False + if not isinstance(bias, (int, float)): + raise TypeError(f'bias must be a number, but got a {type(bias)}') + + if bias_prob is not None: + if not isinstance(bias_prob, float): + raise TypeError(f'bias_prob type must be float, \ + but got {type(bias_prob)}') + + if layer is not None: + if not isinstance(layer, (str, list)): + raise TypeError(f'layer must be a str or a list of str, \ + but got a {type(layer)}') + else: + layer = [] + + if bias_prob is not None: + self.bias = 
bias_init_with_prob(bias_prob) + else: + self.bias = bias + self.layer = [layer] if isinstance(layer, str) else layer + + def _get_init_info(self): + info = f'{self.__class__.__name__}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Constant') +class ConstantInit(BaseInit): + """Initialize module parameters with constant values. + + Args: + val (int | float): the value to fill the weights in the module with + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, val, **kwargs): + super().__init__(**kwargs) + self.val = val + + def __call__(self, module): + + def init(m): + if self.wholemodule: + constant_init(m, self.val, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + constant_init(m, self.val, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Xavier') +class XavierInit(BaseInit): + r"""Initialize module parameters with values according to the method + described in `Understanding the difficulty of training deep feedforward + neural networks - Glorot, X. & Bengio, Y. (2010). + `_ + + Args: + gain (int | float): an optional scaling factor. Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + distribution (str): distribution either be ``'normal'`` + or ``'uniform'``. Defaults to ``'normal'``. + layer (str | list[str], optional): the layer will be initialized. 
+ Defaults to None. + """ + + def __init__(self, gain=1, distribution='normal', **kwargs): + super().__init__(**kwargs) + self.gain = gain + self.distribution = distribution + + def __call__(self, module): + + def init(m): + if self.wholemodule: + xavier_init(m, self.gain, self.bias, self.distribution) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + xavier_init(m, self.gain, self.bias, self.distribution) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: gain={self.gain}, ' \ + f'distribution={self.distribution}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Normal') +class NormalInit(BaseInit): + r"""Initialize module parameters with the values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. + + Args: + mean (int | float):the mean of the normal distribution. Defaults to 0. + std (int | float): the standard deviation of the normal distribution. + Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ + """ + + def __init__(self, mean=0, std=1, **kwargs): + super().__init__(**kwargs) + self.mean = mean + self.std = std + + def __call__(self, module): + + def init(m): + if self.wholemodule: + normal_init(m, self.mean, self.std, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + normal_init(m, self.mean, self.std, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: mean={self.mean},' \ + f' std={self.std}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='TruncNormal') +class TruncNormalInit(BaseInit): + r"""Initialize module parameters with the values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values + outside :math:`[a, b]`. + + Args: + mean (float): the mean of the normal distribution. Defaults to 0. + std (float): the standard deviation of the normal distribution. + Defaults to 1. + a (float): The minimum cutoff value. + b ( float): The maximum cutoff value. + bias (float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ + """ + + def __init__(self, + mean: float = 0, + std: float = 1, + a: float = -2, + b: float = 2, + **kwargs) -> None: + super().__init__(**kwargs) + self.mean = mean + self.std = std + self.a = a + self.b = b + + def __call__(self, module: nn.Module) -> None: + + def init(m): + if self.wholemodule: + trunc_normal_init(m, self.mean, self.std, self.a, self.b, + self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + trunc_normal_init(m, self.mean, self.std, self.a, self.b, + self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a}, b={self.b},' \ + f' mean={self.mean}, std={self.std}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Uniform') +class UniformInit(BaseInit): + r"""Initialize module parameters with values drawn from the uniform + distribution :math:`\mathcal{U}(a, b)`. + + Args: + a (int | float): the lower bound of the uniform distribution. + Defaults to 0. + b (int | float): the upper bound of the uniform distribution. + Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ """ + + def __init__(self, a=0, b=1, **kwargs): + super().__init__(**kwargs) + self.a = a + self.b = b + + def __call__(self, module): + + def init(m): + if self.wholemodule: + uniform_init(m, self.a, self.b, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + uniform_init(m, self.a, self.b, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a},' \ + f' b={self.b}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Kaiming') +class KaimingInit(BaseInit): + r"""Initialize module parameters with the values according to the method + described in `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification - He, K. et al. (2015). + `_ + + Args: + a (int | float): the negative slope of the rectifier used after this + layer (only used with ``'leaky_relu'``). Defaults to 0. + mode (str): either ``'fan_in'`` or ``'fan_out'``. Choosing + ``'fan_in'`` preserves the magnitude of the variance of the weights + in the forward pass. Choosing ``'fan_out'`` preserves the + magnitudes in the backwards pass. Defaults to ``'fan_out'``. + nonlinearity (str): the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` . + Defaults to 'relu'. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + distribution (str): distribution either be ``'normal'`` or + ``'uniform'``. Defaults to ``'normal'``. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ """ + + def __init__(self, + a=0, + mode='fan_out', + nonlinearity='relu', + distribution='normal', + **kwargs): + super().__init__(**kwargs) + self.a = a + self.mode = mode + self.nonlinearity = nonlinearity + self.distribution = distribution + + def __call__(self, module): + + def init(m): + if self.wholemodule: + kaiming_init(m, self.a, self.mode, self.nonlinearity, + self.bias, self.distribution) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + kaiming_init(m, self.a, self.mode, self.nonlinearity, + self.bias, self.distribution) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \ + f'nonlinearity={self.nonlinearity}, ' \ + f'distribution ={self.distribution}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Caffe2Xavier') +class Caffe2XavierInit(KaimingInit): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + def __init__(self, **kwargs): + super().__init__( + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + distribution='uniform', + **kwargs) + + def __call__(self, module): + super().__call__(module) + + +@INITIALIZERS.register_module(name='Pretrained') +class PretrainedInit(object): + """Initialize module by loading a pretrained model. + + Args: + checkpoint (str): the checkpoint file of the pretrained model should + be load. + prefix (str, optional): the prefix of a sub-module in the pretrained + model. it is for loading a part of the pretrained model to + initialize. For example, if we would like to only load the + backbone of a detector model, we can set ``prefix='backbone.'``. + Defaults to None. + map_location (str): map tensors into proper locations. 
+ """ + + def __init__(self, checkpoint, prefix=None, map_location=None): + self.checkpoint = checkpoint + self.prefix = prefix + self.map_location = map_location + + def __call__(self, module): + from mmcv.runner import (_load_checkpoint_with_prefix, load_checkpoint, + load_state_dict) + logger = get_logger('mmcv') + if self.prefix is None: + print_log(f'load model from: {self.checkpoint}', logger=logger) + load_checkpoint( + module, + self.checkpoint, + map_location=self.map_location, + strict=False, + logger=logger) + else: + print_log( + f'load {self.prefix} in model from: {self.checkpoint}', + logger=logger) + state_dict = _load_checkpoint_with_prefix( + self.prefix, self.checkpoint, map_location=self.map_location) + load_state_dict(module, state_dict, strict=False, logger=logger) + + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: load from {self.checkpoint}' + return info + + +def _initialize(module, cfg, wholemodule=False): + func = build_from_cfg(cfg, INITIALIZERS) + # wholemodule flag is for override mode, there is no layer key in override + # and initializer will give init values for the whole module with the name + # in override. 
+ func.wholemodule = wholemodule + func(module) + + +def _initialize_override(module, override, cfg): + if not isinstance(override, (dict, list)): + raise TypeError(f'override must be a dict or a list of dict, \ + but got {type(override)}') + + override = [override] if isinstance(override, dict) else override + + for override_ in override: + + cp_override = copy.deepcopy(override_) + name = cp_override.pop('name', None) + if name is None: + raise ValueError('`override` must contain the key "name",' + f'but got {cp_override}') + # if override only has name key, it means use args in init_cfg + if not cp_override: + cp_override.update(cfg) + # if override has name key and other args except type key, it will + # raise error + elif 'type' not in cp_override.keys(): + raise ValueError( + f'`override` need "type" key, but got {cp_override}') + + if hasattr(module, name): + _initialize(getattr(module, name), cp_override, wholemodule=True) + else: + raise RuntimeError(f'module did not have attribute {name}, ' + f'but init_cfg is {cp_override}.') + + +def initialize(module, init_cfg): + """Initialize a module. + + Args: + module (``torch.nn.Module``): the module will be initialized. + init_cfg (dict | list[dict]): initialization configuration dict to + define initializer. OpenMMLab has implemented 6 initializers + including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``, + ``Kaiming``, and ``Pretrained``. 
+ Example: + >>> module = nn.Linear(2, 3, bias=True) + >>> init_cfg = dict(type='Constant', layer='Linear', val =1 , bias =2) + >>> initialize(module, init_cfg) + + >>> module = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + >>> # define key ``'layer'`` for initializing layer with different + >>> # configuration + >>> init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Linear', val=2)] + >>> initialize(module, init_cfg) + + >>> # define key``'override'`` to initialize some specific part in + >>> # module + >>> class FooNet(nn.Module): + >>> def __init__(self): + >>> super().__init__() + >>> self.feat = nn.Conv2d(3, 16, 3) + >>> self.reg = nn.Conv2d(16, 10, 3) + >>> self.cls = nn.Conv2d(16, 5, 3) + >>> model = FooNet() + >>> init_cfg = dict(type='Constant', val=1, bias=2, layer='Conv2d', + >>> override=dict(type='Constant', name='reg', val=3, bias=4)) + >>> initialize(model, init_cfg) + + >>> model = ResNet(depth=50) + >>> # Initialize weights with the pretrained model. + >>> init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + >>> initialize(model, init_cfg) + + >>> # Initialize weights of a sub-module with the specific part of + >>> # a pretrained model by using "prefix". 
+ >>> url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + >>> 'retinanet_r50_fpn_1x_coco/'\ + >>> 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + >>> init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + """ + if not isinstance(init_cfg, (dict, list)): + raise TypeError(f'init_cfg must be a dict or a list of dict, \ + but got {type(init_cfg)}') + + if isinstance(init_cfg, dict): + init_cfg = [init_cfg] + + for cfg in init_cfg: + # should deeply copy the original config because cfg may be used by + # other modules, e.g., one init_cfg shared by multiple bottleneck + # blocks, the expected cfg will be changed after pop and will change + # the initialization behavior of other modules + cp_cfg = copy.deepcopy(cfg) + override = cp_cfg.pop('override', None) + _initialize(module, cp_cfg) + + if override is not None: + cp_cfg.pop('layer', None) + _initialize_override(module, override, cp_cfg) + else: + # All attributes in module have same initialization. + pass + + +def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float, + b: float) -> Tensor: + # Method based on + # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + # Modified from + # https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' + 'The distribution of values may be incorrect.', + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. 
+ # Get upper and lower cdf values + lower = norm_cdf((a - mean) / std) + upper = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [lower, upper], then translate + # to [2lower-1, 2upper-1]. + tensor.uniform_(2 * lower - 1, 2 * upper - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor: Tensor, + mean: float = 0., + std: float = 1., + a: float = -2., + b: float = 2.) -> Tensor: + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + Modified from + https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py + + Args: + tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`. + mean (float): the mean of the normal distribution. + std (float): the standard deviation of the normal distribution. + a (float): the minimum cutoff value. + b (float): the maximum cutoff value. + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) diff --git a/mmcv/cnn/vgg.py b/mmcv/cnn/vgg.py index a7f3116..8778b64 100644 --- a/mmcv/cnn/vgg.py +++ b/mmcv/cnn/vgg.py @@ -1,14 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import logging -from typing import List, Optional, Sequence, Tuple, Union import torch.nn as nn -from mmengine.model import constant_init, kaiming_init, normal_init -from mmengine.runner import load_checkpoint -from torch import Tensor +from .utils import constant_init, kaiming_init, normal_init -def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module: + +def conv3x3(in_planes, out_planes, dilation=1): """3x3 convolution with padding.""" return nn.Conv2d( in_planes, @@ -18,12 +16,12 @@ def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module: dilation=dilation) -def make_vgg_layer(inplanes: int, - planes: int, - num_blocks: int, - dilation: int = 1, - with_bn: bool = False, - ceil_mode: bool = False) -> List[nn.Module]: +def make_vgg_layer(inplanes, + planes, + num_blocks, + dilation=1, + with_bn=False, + ceil_mode=False): layers = [] for _ in range(num_blocks): layers.append(conv3x3(inplanes, planes, dilation)) @@ -61,18 +59,18 @@ class VGG(nn.Module): } def __init__(self, - depth: int, - with_bn: bool = False, - num_classes: int = -1, - num_stages: int = 5, - dilations: Sequence[int] = (1, 1, 1, 1, 1), - out_indices: Sequence[int] = (0, 1, 2, 3, 4), - frozen_stages: int = -1, - bn_eval: bool = True, - bn_frozen: bool = False, - ceil_mode: bool = False, - with_last_pool: bool = True): - super().__init__() + depth, + with_bn=False, + num_classes=-1, + num_stages=5, + dilations=(1, 1, 1, 1, 1), + out_indices=(0, 1, 2, 3, 4), + frozen_stages=-1, + bn_eval=True, + bn_frozen=False, + ceil_mode=False, + with_last_pool=True): + super(VGG, self).__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for vgg') assert num_stages >= 1 and num_stages <= 5 @@ -124,9 +122,10 @@ class VGG(nn.Module): nn.Linear(4096, num_classes), ) - def init_weights(self, pretrained: Optional[str] = None) -> None: + def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() + from 
..runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): @@ -139,7 +138,7 @@ class VGG(nn.Module): else: raise TypeError('pretrained must be a str or None') - def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, ...]]: + def forward(self, x): outs = [] vgg_layers = getattr(self, self.module_name) for i in range(len(self.stage_blocks)): @@ -157,8 +156,8 @@ class VGG(nn.Module): else: return tuple(outs) - def train(self, mode: bool = True) -> None: - super().train(mode) + def train(self, mode=True): + super(VGG, self).train(mode) if self.bn_eval: for m in self.modules(): if isinstance(m, nn.BatchNorm2d): diff --git a/mmcv/engine/__init__.py b/mmcv/engine/__init__.py new file mode 100644 index 0000000..3193b7f --- /dev/null +++ b/mmcv/engine/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test, + single_gpu_test) + +__all__ = [ + 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test', + 'single_gpu_test' +] diff --git a/mmcv/engine/test.py b/mmcv/engine/test.py new file mode 100644 index 0000000..f236b1c --- /dev/null +++ b/mmcv/engine/test.py @@ -0,0 +1,202 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import torch +import torch.distributed as dist + +import mmcv +from mmcv.runner import get_dist_info + + +def single_gpu_test(model, data_loader): + """Test model with a single gpu. + + This method tests model with a single gpu and displays test progress bar. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + + Returns: + list: The prediction results. 
+ """ + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for data in data_loader: + with torch.no_grad(): + result = model(return_loss=False, **data) + results.extend(result) + + # Assume result has the same length of batch_size + # refer to https://github.com/open-mmlab/mmcv/issues/985 + batch_size = len(result) + for _ in range(batch_size): + prog_bar.update() + return results + + +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting + ``gpu_collect=True``, it encodes results to gpu tensors and use gpu + communication for results collection. On cpu mode it saves the results on + different gpus to ``tmpdir`` and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. 
+ for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, **data) + results.extend(result) + + if rank == 0: + batch_size = len(result) + batch_size_all = batch_size * world_size + if batch_size_all + prog_bar.completed > len(dataset): + batch_size_all = len(dataset) - prog_bar.completed + for _ in range(batch_size_all): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + return results + + +def collect_results_cpu(result_part, size, tmpdir=None): + """Collect results under cpu mode. + + On cpu mode, this function will save the results on different gpus to + ``tmpdir`` and collect them by the rank 0 worker. + + Args: + result_part (list): Result list containing result parts + to be collected. + size (int): Size of the results, commonly equal to length of + the results. + tmpdir (str | None): temporal directory for collected results to + store. If set to None, it will create a random temporal directory + for it. + + Returns: + list: The collected results. 
+ """ + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_result = mmcv.load(part_file) + # When data is severely insufficient, an empty part_result + # on a certain gpu could makes the overall outputs empty. + if part_result: + part_list.append(part_result) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + """Collect results under gpu mode. + + On gpu mode, this function will encode results to gpu tensors and use gpu + communication for results collection. + + Args: + result_part (list): Result list containing result parts + to be collected. + size (int): Size of the results, commonly equal to length of + the results. + + Returns: + list: The collected results. 
+ """ + rank, world_size = get_dist_info() + # dump result part to tensor with pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device='cuda') + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') + part_send[:shape_tensor[0]] = part_tensor + part_recv_list = [ + part_tensor.new_zeros(shape_max) for _ in range(world_size) + ] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()) + # When data is severely insufficient, an empty part_result + # on a certain gpu could makes the overall outputs empty. + if part_result: + part_list.append(part_result) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results diff --git a/mmcv/fileio/__init__.py b/mmcv/fileio/__init__.py new file mode 100644 index 0000000..2051b85 --- /dev/null +++ b/mmcv/fileio/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .file_client import BaseStorageBackend, FileClient +from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler +from .io import dump, load, register_handler +from .parse import dict_from_file, list_from_file + +__all__ = [ + 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler', + 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler', + 'list_from_file', 'dict_from_file' +] diff --git a/mmcv/fileio/file_client.py b/mmcv/fileio/file_client.py new file mode 100644 index 0000000..b2d6228 --- /dev/null +++ b/mmcv/fileio/file_client.py @@ -0,0 +1,1148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import os +import os.path as osp +import re +import tempfile +import warnings +from abc import ABCMeta, abstractmethod +from contextlib import contextmanager +from pathlib import Path +from typing import Iterable, Iterator, Optional, Tuple, Union +from urllib.request import urlopen + +import mmcv +from mmcv.utils.misc import has_method +from mmcv.utils.path import is_filepath + + +class BaseStorageBackend(metaclass=ABCMeta): + """Abstract class of storage backends. + + All backends need to implement two apis: ``get()`` and ``get_text()``. + ``get()`` reads the file as a byte stream and ``get_text()`` reads the file + as texts. + """ + + # a flag to indicate whether the backend can create a symlink for a file + _allow_symlink = False + + @property + def name(self): + return self.__class__.__name__ + + @property + def allow_symlink(self): + return self._allow_symlink + + @abstractmethod + def get(self, filepath): + pass + + @abstractmethod + def get_text(self, filepath): + pass + + +class CephBackend(BaseStorageBackend): + """Ceph storage backend (for internal use). + + Args: + path_mapping (dict|None): path mapping dict from local path to Petrel + path. When ``path_mapping={'src': 'dst'}``, ``src`` in ``filepath`` + will be replaced by ``dst``. Default: None. + + .. 
warning:: + :class:`mmcv.fileio.file_client.CephBackend` will be deprecated, + please use :class:`mmcv.fileio.file_client.PetrelBackend` instead. + """ + + def __init__(self, path_mapping=None): + try: + import ceph + except ImportError: + raise ImportError('Please install ceph to enable CephBackend.') + + warnings.warn( + 'CephBackend will be deprecated, please use PetrelBackend instead') + self._client = ceph.S3Client() + assert isinstance(path_mapping, dict) or path_mapping is None + self.path_mapping = path_mapping + + def get(self, filepath): + filepath = str(filepath) + if self.path_mapping is not None: + for k, v in self.path_mapping.items(): + filepath = filepath.replace(k, v) + value = self._client.Get(filepath) + value_buf = memoryview(value) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class PetrelBackend(BaseStorageBackend): + """Petrel storage backend (for internal use). + + PetrelBackend supports reading and writing data to multiple clusters. + If the file path contains the cluster name, PetrelBackend will read data + from specified cluster or write data to it. Otherwise, PetrelBackend will + access the default cluster. + + Args: + path_mapping (dict, optional): Path mapping dict from local path to + Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in + ``filepath`` will be replaced by ``dst``. Default: None. + enable_mc (bool, optional): Whether to enable memcached support. + Default: True. 
+ + Examples: + >>> filepath1 = 's3://path/of/file' + >>> filepath2 = 'cluster-name:s3://path/of/file' + >>> client = PetrelBackend() + >>> client.get(filepath1) # get data from default cluster + >>> client.get(filepath2) # get data from 'cluster-name' cluster + """ + + def __init__(self, + path_mapping: Optional[dict] = None, + enable_mc: bool = True): + try: + from petrel_client import client + except ImportError: + raise ImportError('Please install petrel_client to enable ' + 'PetrelBackend.') + + self._client = client.Client(enable_mc=enable_mc) + assert isinstance(path_mapping, dict) or path_mapping is None + self.path_mapping = path_mapping + + def _map_path(self, filepath: Union[str, Path]) -> str: + """Map ``filepath`` to a string path whose prefix will be replaced by + :attr:`self.path_mapping`. + + Args: + filepath (str): Path to be mapped. + """ + filepath = str(filepath) + if self.path_mapping is not None: + for k, v in self.path_mapping.items(): + filepath = filepath.replace(k, v) + return filepath + + def _format_path(self, filepath: str) -> str: + """Convert a ``filepath`` to standard format of petrel oss. + + If the ``filepath`` is concatenated by ``os.path.join``, in a Windows + environment, the ``filepath`` will be the format of + 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the + above ``filepath`` will be converted to 's3://bucket_name/image.jpg'. + + Args: + filepath (str): Path to be formatted. + """ + return re.sub(r'\\+', '/', filepath) + + def get(self, filepath: Union[str, Path]) -> memoryview: + """Read data from a given ``filepath`` with 'rb' mode. + + Args: + filepath (str or Path): Path to read data. + + Returns: + memoryview: A memory view of expected bytes object to avoid + copying. The memoryview object can be converted to bytes by + ``value_buf.tobytes()``. 
+ """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + value = self._client.Get(filepath) + value_buf = memoryview(value) + return value_buf + + def get_text(self, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + return str(self.get(filepath), encoding=encoding) + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Save data to a given ``filepath``. + + Args: + obj (bytes): Data to be saved. + filepath (str or Path): Path to write data. + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + self._client.put(filepath, obj) + + def put_text(self, + obj: str, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> None: + """Save data to a given ``filepath``. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str): The encoding format used to encode the ``obj``. + Default: 'utf-8'. + """ + self.put(bytes(obj, encoding=encoding), filepath) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str or Path): Path to be removed. + """ + if not has_method(self._client, 'delete'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `delete` method, please use a higher version or dev' + ' branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + self._client.delete(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. 
+ """ + if not (has_method(self._client, 'contains') + and has_method(self._client, 'isdir')): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `contains` and `isdir` methods, please use a higher' + 'version or dev branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.contains(filepath) or self._client.isdir(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + if not has_method(self._client, 'isdir'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `isdir` method, please use a higher version or dev' + ' branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + if not has_method(self._client, 'contains'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `contains` method, please use a higher version or ' + 'dev branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.contains(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result after concatenation. 
+ """ + filepath = self._format_path(self._map_path(filepath)) + if filepath.endswith('/'): + filepath = filepath[:-1] + formatted_paths = [filepath] + for path in filepaths: + formatted_paths.append(self._format_path(self._map_path(path))) + return '/'.join(formatted_paths) + + @contextmanager + def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + """Download a file from ``filepath`` and return a temporary path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Args: + filepath (str | Path): Download a file from ``filepath``. + + Examples: + >>> client = PetrelBackend() + >>> # After existing from the ``with`` clause, + >>> # the path will be removed + >>> with client.get_local_path('s3://path/of/your/file') as path: + ... # do something here + + Yields: + Iterable[str]: Only yield one temporary path. + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + assert self.isfile(filepath) + try: + f = tempfile.NamedTemporaryFile(delete=False) + f.write(self.get(filepath)) + f.close() + yield f.name + finally: + os.remove(f.name) + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + Petrel has no concept of directories but it simulates the directory + hierarchy in the filesystem through public prefixes. In addition, + if the returned path ends with '/', it means the path is a public + prefix which is a logical directory. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. 
+ In addition, the returned path of directory will not contains the + suffix '/' which is consistent with other backends. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + """ + if not has_method(self._client, 'list'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `list` method, please use a higher version or dev' + ' branch instead.')) + + dir_path = self._map_path(dir_path) + dir_path = self._format_path(dir_path) + if list_dir and suffix is not None: + raise TypeError( + '`list_dir` should be False when `suffix` is not None') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('`suffix` must be a string or tuple of strings') + + # Petrel's simulated directory hierarchy assumes that directory paths + # should end with `/` + if not dir_path.endswith('/'): + dir_path += '/' + + root = dir_path + + def _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive): + for path in self._client.list(dir_path): + # the `self.isdir` is not used here to determine whether path + # is a directory, because `self.isdir` relies on + # `self._client.list` + if path.endswith('/'): # a directory path + next_dir_path = self.join_path(dir_path, path) + if list_dir: + # get the relative path and exclude the last + # character '/' + rel_dir = next_dir_path[len(root):-1] + yield rel_dir + if recursive: + yield from _list_dir_or_file(next_dir_path, list_dir, + list_file, suffix, + recursive) + else: # a file path + absolute_path = self.join_path(dir_path, path) + rel_path = absolute_path[len(root):] + if (suffix is None + or 
rel_path.endswith(suffix)) and list_file: + yield rel_path + + return _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive) + + +class MemcachedBackend(BaseStorageBackend): + """Memcached storage backend. + + Attributes: + server_list_cfg (str): Config file for memcached server list. + client_cfg (str): Config file for memcached client. + sys_path (str | None): Additional path to be appended to `sys.path`. + Default: None. + """ + + def __init__(self, server_list_cfg, client_cfg, sys_path=None): + if sys_path is not None: + import sys + sys.path.append(sys_path) + try: + import mc + except ImportError: + raise ImportError( + 'Please install memcached to enable MemcachedBackend.') + + self.server_list_cfg = server_list_cfg + self.client_cfg = client_cfg + self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg, + self.client_cfg) + # mc.pyvector servers as a point which points to a memory cache + self._mc_buffer = mc.pyvector() + + def get(self, filepath): + filepath = str(filepath) + import mc + self._client.Get(filepath, self._mc_buffer) + value_buf = mc.ConvertBuffer(self._mc_buffer) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class LmdbBackend(BaseStorageBackend): + """Lmdb storage backend. + + Args: + db_path (str): Lmdb database path. + readonly (bool, optional): Lmdb environment parameter. If True, + disallow any write operations. Default: True. + lock (bool, optional): Lmdb environment parameter. If False, when + concurrent access occurs, do not lock the database. Default: False. + readahead (bool, optional): Lmdb environment parameter. If False, + disable the OS filesystem readahead mechanism, which may improve + random read performance when a database is larger than RAM. + Default: False. + + Attributes: + db_path (str): Lmdb database path. 
+ """ + + def __init__(self, + db_path, + readonly=True, + lock=False, + readahead=False, + **kwargs): + try: + import lmdb + except ImportError: + raise ImportError('Please install lmdb to enable LmdbBackend.') + + self.db_path = str(db_path) + self._client = lmdb.open( + self.db_path, + readonly=readonly, + lock=lock, + readahead=readahead, + **kwargs) + + def get(self, filepath): + """Get values according to the filepath. + + Args: + filepath (str | obj:`Path`): Here, filepath is the lmdb key. + """ + filepath = str(filepath) + with self._client.begin(write=False) as txn: + value_buf = txn.get(filepath.encode('ascii')) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class HardDiskBackend(BaseStorageBackend): + """Raw hard disks storage backend.""" + + _allow_symlink = True + + def get(self, filepath: Union[str, Path]) -> bytes: + """Read data from a given ``filepath`` with 'rb' mode. + + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes: Expected bytes object. + """ + with open(filepath, 'rb') as f: + value_buf = f.read() + return value_buf + + def get_text(self, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + with open(filepath, 'r', encoding=encoding) as f: + value_buf = f.read() + return value_buf + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'wb' mode. + + Note: + ``put`` will create a directory if the directory of ``filepath`` + does not exist. + + Args: + obj (bytes): Data to be written. + filepath (str or Path): Path to write data. 
+ """ + mmcv.mkdir_or_exist(osp.dirname(filepath)) + with open(filepath, 'wb') as f: + f.write(obj) + + def put_text(self, + obj: str, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> None: + """Write data to a given ``filepath`` with 'w' mode. + + Note: + ``put_text`` will create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + """ + mmcv.mkdir_or_exist(osp.dirname(filepath)) + with open(filepath, 'w', encoding=encoding) as f: + f.write(obj) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str or Path): Path to be removed. + """ + os.remove(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + """ + return osp.exists(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + return osp.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + return osp.isfile(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of *filepaths. 
+ + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result of concatenation. + """ + return osp.join(filepath, *filepaths) + + @contextmanager + def get_local_path( + self, filepath: Union[str, Path]) -> Iterable[Union[str, Path]]: + """Only for unified API and do nothing.""" + yield filepath + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. 
+ """ + if list_dir and suffix is not None: + raise TypeError('`suffix` should be None when `list_dir` is True') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('`suffix` must be a string or tuple of strings') + + root = dir_path + + def _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + rel_path = osp.relpath(entry.path, root) + if (suffix is None + or rel_path.endswith(suffix)) and list_file: + yield rel_path + elif osp.isdir(entry.path): + if list_dir: + rel_dir = osp.relpath(entry.path, root) + yield rel_dir + if recursive: + yield from _list_dir_or_file(entry.path, list_dir, + list_file, suffix, + recursive) + + return _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive) + + +class HTTPBackend(BaseStorageBackend): + """HTTP and HTTPS storage bachend.""" + + def get(self, filepath): + value_buf = urlopen(filepath).read() + return value_buf + + def get_text(self, filepath, encoding='utf-8'): + value_buf = urlopen(filepath).read() + return value_buf.decode(encoding) + + @contextmanager + def get_local_path(self, filepath: str) -> Iterable[str]: + """Download a file from ``filepath``. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Args: + filepath (str): Download a file from ``filepath``. + + Examples: + >>> client = HTTPBackend() + >>> # After existing from the ``with`` clause, + >>> # the path will be removed + >>> with client.get_local_path('http://path/of/your/file') as path: + ... # do something here + """ + try: + f = tempfile.NamedTemporaryFile(delete=False) + f.write(self.get(filepath)) + f.close() + yield f.name + finally: + os.remove(f.name) + + +class FileClient: + """A general file client to access files in different backends. 
+ + The client loads a file or text in a specified backend from its path + and returns it as a binary or text file. There are two ways to choose a + backend, the name of backend and the prefix of path. Although both of them + can be used to choose a storage backend, ``backend`` has a higher priority + that is if they are all set, the storage backend will be chosen by the + backend argument. If they are all `None`, the disk backend will be chosen. + Note that It can also register other backend accessor with a given name, + prefixes, and backend class. In addition, We use the singleton pattern to + avoid repeated object creation. If the arguments are the same, the same + object will be returned. + + Args: + backend (str, optional): The storage backend type. Options are "disk", + "ceph", "memcached", "lmdb", "http" and "petrel". Default: None. + prefix (str, optional): The prefix of the registered storage backend. + Options are "s3", "http", "https". Default: None. + + Examples: + >>> # only set backend + >>> file_client = FileClient(backend='petrel') + >>> # only set prefix + >>> file_client = FileClient(prefix='s3') + >>> # set both backend and prefix but use backend to choose client + >>> file_client = FileClient(backend='petrel', prefix='s3') + >>> # if the arguments are the same, the same object is returned + >>> file_client1 = FileClient(backend='petrel') + >>> file_client1 is file_client + True + + Attributes: + client (:obj:`BaseStorageBackend`): The backend object. 
+ """ + + _backends = { + 'disk': HardDiskBackend, + 'ceph': CephBackend, + 'memcached': MemcachedBackend, + 'lmdb': LmdbBackend, + 'petrel': PetrelBackend, + 'http': HTTPBackend, + } + # This collection is used to record the overridden backends, and when a + # backend appears in the collection, the singleton pattern is disabled for + # that backend, because if the singleton pattern is used, then the object + # returned will be the backend before overwriting + _overridden_backends = set() + _prefix_to_backends = { + 's3': PetrelBackend, + 'http': HTTPBackend, + 'https': HTTPBackend, + } + _overridden_prefixes = set() + + _instances = {} + + def __new__(cls, backend=None, prefix=None, **kwargs): + if backend is None and prefix is None: + backend = 'disk' + if backend is not None and backend not in cls._backends: + raise ValueError( + f'Backend {backend} is not supported. Currently supported ones' + f' are {list(cls._backends.keys())}') + if prefix is not None and prefix not in cls._prefix_to_backends: + raise ValueError( + f'prefix {prefix} is not supported. 
Currently supported ones ' + f'are {list(cls._prefix_to_backends.keys())}') + + # concatenate the arguments to a unique key for determining whether + # objects with the same arguments were created + arg_key = f'{backend}:{prefix}' + for key, value in kwargs.items(): + arg_key += f':{key}:{value}' + + # if a backend was overridden, it will create a new object + if (arg_key in cls._instances + and backend not in cls._overridden_backends + and prefix not in cls._overridden_prefixes): + _instance = cls._instances[arg_key] + else: + # create a new object and put it to _instance + _instance = super().__new__(cls) + if backend is not None: + _instance.client = cls._backends[backend](**kwargs) + else: + _instance.client = cls._prefix_to_backends[prefix](**kwargs) + + cls._instances[arg_key] = _instance + + return _instance + + @property + def name(self): + return self.client.name + + @property + def allow_symlink(self): + return self.client.allow_symlink + + @staticmethod + def parse_uri_prefix(uri: Union[str, Path]) -> Optional[str]: + """Parse the prefix of a uri. + + Args: + uri (str | Path): Uri to be parsed that contains the file prefix. + + Examples: + >>> FileClient.parse_uri_prefix('s3://path/of/your/file') + 's3' + + Returns: + str | None: Return the prefix of uri if the uri contains '://' + else ``None``. + """ + assert is_filepath(uri) + uri = str(uri) + if '://' not in uri: + return None + else: + prefix, _ = uri.split('://') + # In the case of PetrelBackend, the prefix may contains the cluster + # name like clusterName:s3 + if ':' in prefix: + _, prefix = prefix.split(':') + return prefix + + @classmethod + def infer_client(cls, + file_client_args: Optional[dict] = None, + uri: Optional[Union[str, Path]] = None) -> 'FileClient': + """Infer a suitable file client based on the URI and arguments. + + Args: + file_client_args (dict, optional): Arguments to instantiate a + FileClient. Default: None. 
+ uri (str | Path, optional): Uri to be parsed that contains the file + prefix. Default: None. + + Examples: + >>> uri = 's3://path/of/your/file' + >>> file_client = FileClient.infer_client(uri=uri) + >>> file_client_args = {'backend': 'petrel'} + >>> file_client = FileClient.infer_client(file_client_args) + + Returns: + FileClient: Instantiated FileClient object. + """ + assert file_client_args is not None or uri is not None + if file_client_args is None: + file_prefix = cls.parse_uri_prefix(uri) # type: ignore + return cls(prefix=file_prefix) + else: + return cls(**file_client_args) + + @classmethod + def _register_backend(cls, name, backend, force=False, prefixes=None): + if not isinstance(name, str): + raise TypeError('the backend name should be a string, ' + f'but got {type(name)}') + if not inspect.isclass(backend): + raise TypeError( + f'backend should be a class but got {type(backend)}') + if not issubclass(backend, BaseStorageBackend): + raise TypeError( + f'backend {backend} is not a subclass of BaseStorageBackend') + if not force and name in cls._backends: + raise KeyError( + f'{name} is already registered as a storage backend, ' + 'add "force=True" if you want to override it') + + if name in cls._backends and force: + cls._overridden_backends.add(name) + cls._backends[name] = backend + + if prefixes is not None: + if isinstance(prefixes, str): + prefixes = [prefixes] + else: + assert isinstance(prefixes, (list, tuple)) + for prefix in prefixes: + if prefix not in cls._prefix_to_backends: + cls._prefix_to_backends[prefix] = backend + elif (prefix in cls._prefix_to_backends) and force: + cls._overridden_prefixes.add(prefix) + cls._prefix_to_backends[prefix] = backend + else: + raise KeyError( + f'{prefix} is already registered as a storage backend,' + ' add "force=True" if you want to override it') + + @classmethod + def register_backend(cls, name, backend=None, force=False, prefixes=None): + """Register a backend to FileClient. 
+ + This method can be used as a normal class method or a decorator. + + .. code-block:: python + + class NewBackend(BaseStorageBackend): + + def get(self, filepath): + return filepath + + def get_text(self, filepath): + return filepath + + FileClient.register_backend('new', NewBackend) + + or + + .. code-block:: python + + @FileClient.register_backend('new') + class NewBackend(BaseStorageBackend): + + def get(self, filepath): + return filepath + + def get_text(self, filepath): + return filepath + + Args: + name (str): The name of the registered backend. + backend (class, optional): The backend class to be registered, + which must be a subclass of :class:`BaseStorageBackend`. + When this method is used as a decorator, backend is None. + Defaults to None. + force (bool, optional): Whether to override the backend if the name + has already been registered. Defaults to False. + prefixes (str or list[str] or tuple[str], optional): The prefixes + of the registered storage backend. Default: None. + `New in version 1.3.15.` + """ + if backend is not None: + cls._register_backend( + name, backend, force=force, prefixes=prefixes) + return + + def _register(backend_cls): + cls._register_backend( + name, backend_cls, force=force, prefixes=prefixes) + return backend_cls + + return _register + + def get(self, filepath: Union[str, Path]) -> Union[bytes, memoryview]: + """Read data from a given ``filepath`` with 'rb' mode. + + Note: + There are two types of return values for ``get``, one is ``bytes`` + and the other is ``memoryview``. The advantage of using memoryview + is that you can avoid copying, and if you want to convert it to + ``bytes``, you can use ``.tobytes()``. + + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes | memoryview: Expected bytes object or a memory view of the + bytes object. 
+ """ + return self.client.get(filepath) + + def get_text(self, filepath: Union[str, Path], encoding='utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + return self.client.get_text(filepath, encoding) + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'wb' mode. + + Note: + ``put`` should create a directory if the directory of ``filepath`` + does not exist. + + Args: + obj (bytes): Data to be written. + filepath (str or Path): Path to write data. + """ + self.client.put(obj, filepath) + + def put_text(self, obj: str, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'w' mode. + + Note: + ``put_text`` should create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str, optional): The encoding format used to open the + `filepath`. Default: 'utf-8'. + """ + self.client.put_text(obj, filepath) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str, Path): Path to be removed. + """ + self.client.remove(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + """ + return self.client.exists(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. 
+ """ + return self.client.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + return self.client.isfile(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of *filepaths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result of concatenation. + """ + return self.client.join_path(filepath, *filepaths) + + @contextmanager + def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + """Download data from ``filepath`` and write the data to local path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Note: + If the ``filepath`` is a local path, just return itself. + + .. warning:: + ``get_local_path`` is an experimental interface that may change in + the future. + + Args: + filepath (str or Path): Path to be read data. + + Examples: + >>> file_client = FileClient(prefix='s3') + >>> with file_client.get_local_path('s3://bucket/abc.jpg') as path: + ... # do something here + + Yields: + Iterable[str]: Only yield one path. + """ + with self.client.get_local_path(str(filepath)) as local_path: + yield local_path + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. 
+ + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + """ + yield from self.client.list_dir_or_file(dir_path, list_dir, list_file, + suffix, recursive) diff --git a/mmcv/fileio/handlers/__init__.py b/mmcv/fileio/handlers/__init__.py new file mode 100644 index 0000000..aa24d91 --- /dev/null +++ b/mmcv/fileio/handlers/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseFileHandler +from .json_handler import JsonHandler +from .pickle_handler import PickleHandler +from .yaml_handler import YamlHandler + +__all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler'] diff --git a/mmcv/fileio/handlers/base.py b/mmcv/fileio/handlers/base.py new file mode 100644 index 0000000..288878b --- /dev/null +++ b/mmcv/fileio/handlers/base.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + + +class BaseFileHandler(metaclass=ABCMeta): + # `str_like` is a flag to indicate whether the type of file object is + # str-like object or bytes-like object. Pickle only processes bytes-like + # objects but json only processes str-like object. If it is str-like + # object, `StringIO` will be used to process the buffer. 
+ str_like = True + + @abstractmethod + def load_from_fileobj(self, file, **kwargs): + pass + + @abstractmethod + def dump_to_fileobj(self, obj, file, **kwargs): + pass + + @abstractmethod + def dump_to_str(self, obj, **kwargs): + pass + + def load_from_path(self, filepath, mode='r', **kwargs): + with open(filepath, mode) as f: + return self.load_from_fileobj(f, **kwargs) + + def dump_to_path(self, obj, filepath, mode='w', **kwargs): + with open(filepath, mode) as f: + self.dump_to_fileobj(obj, f, **kwargs) diff --git a/mmcv/fileio/handlers/json_handler.py b/mmcv/fileio/handlers/json_handler.py new file mode 100644 index 0000000..18d4f15 --- /dev/null +++ b/mmcv/fileio/handlers/json_handler.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json + +import numpy as np + +from .base import BaseFileHandler + + +def set_default(obj): + """Set default json values for non-serializable values. + + It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list. + It also converts ``np.generic`` (including ``np.int32``, ``np.float32``, + etc.) into plain numbers of plain python built-in types. + """ + if isinstance(obj, (set, range)): + return list(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, np.generic): + return obj.item() + raise TypeError(f'{type(obj)} is unsupported for json dump') + + +class JsonHandler(BaseFileHandler): + + def load_from_fileobj(self, file): + return json.load(file) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('default', set_default) + json.dump(obj, file, **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('default', set_default) + return json.dumps(obj, **kwargs) diff --git a/mmcv/fileio/handlers/pickle_handler.py b/mmcv/fileio/handlers/pickle_handler.py new file mode 100644 index 0000000..b37c79b --- /dev/null +++ b/mmcv/fileio/handlers/pickle_handler.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import pickle + +from .base import BaseFileHandler + + +class PickleHandler(BaseFileHandler): + + str_like = False + + def load_from_fileobj(self, file, **kwargs): + return pickle.load(file, **kwargs) + + def load_from_path(self, filepath, **kwargs): + return super(PickleHandler, self).load_from_path( + filepath, mode='rb', **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('protocol', 2) + return pickle.dumps(obj, **kwargs) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('protocol', 2) + pickle.dump(obj, file, **kwargs) + + def dump_to_path(self, obj, filepath, **kwargs): + super(PickleHandler, self).dump_to_path( + obj, filepath, mode='wb', **kwargs) diff --git a/mmcv/fileio/handlers/yaml_handler.py b/mmcv/fileio/handlers/yaml_handler.py new file mode 100644 index 0000000..c5aa2ee --- /dev/null +++ b/mmcv/fileio/handlers/yaml_handler.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import yaml + +try: + from yaml import CLoader as Loader, CDumper as Dumper +except ImportError: + from yaml import Loader, Dumper + +from .base import BaseFileHandler # isort:skip + + +class YamlHandler(BaseFileHandler): + + def load_from_fileobj(self, file, **kwargs): + kwargs.setdefault('Loader', Loader) + return yaml.load(file, **kwargs) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('Dumper', Dumper) + yaml.dump(obj, file, **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('Dumper', Dumper) + return yaml.dump(obj, **kwargs) diff --git a/mmcv/fileio/io.py b/mmcv/fileio/io.py new file mode 100644 index 0000000..aaefde5 --- /dev/null +++ b/mmcv/fileio/io.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from io import BytesIO, StringIO +from pathlib import Path + +from ..utils import is_list_of, is_str +from .file_client import FileClient +from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler + +file_handlers = { + 'json': JsonHandler(), + 'yaml': YamlHandler(), + 'yml': YamlHandler(), + 'pickle': PickleHandler(), + 'pkl': PickleHandler() +} + + +def load(file, file_format=None, file_client_args=None, **kwargs): + """Load data from json/yaml/pickle files. + + This method provides a unified api for loading data from serialized files. + + Note: + In v1.3.16 and later, ``load`` supports loading data from serialized + files those can be storaged in different backends. + + Args: + file (str or :obj:`Path` or file-like object): Filename or a file-like + object. + file_format (str, optional): If not specified, the file format will be + inferred from the file extension, otherwise use the specified one. + Currently supported formats include "json", "yaml/yml" and + "pickle/pkl". + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> load('/path/of/your/file') # file is storaged in disk + >>> load('https://path/of/your/file') # file is storaged in Internet + >>> load('s3://path/of/your/file') # file is storaged in petrel + + Returns: + The content from the file. 
+ """ + if isinstance(file, Path): + file = str(file) + if file_format is None and is_str(file): + file_format = file.split('.')[-1] + if file_format not in file_handlers: + raise TypeError(f'Unsupported format: {file_format}') + + handler = file_handlers[file_format] + if is_str(file): + file_client = FileClient.infer_client(file_client_args, file) + if handler.str_like: + with StringIO(file_client.get_text(file)) as f: + obj = handler.load_from_fileobj(f, **kwargs) + else: + with BytesIO(file_client.get(file)) as f: + obj = handler.load_from_fileobj(f, **kwargs) + elif hasattr(file, 'read'): + obj = handler.load_from_fileobj(file, **kwargs) + else: + raise TypeError('"file" must be a filepath str or a file-object') + return obj + + +def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): + """Dump data to json/yaml/pickle strings or files. + + This method provides a unified api for dumping data as strings or to files, + and also supports custom arguments for each file format. + + Note: + In v1.3.16 and later, ``dump`` supports dumping data as strings or to + files which is saved to different backends. + + Args: + obj (any): The python object to be dumped. + file (str or :obj:`Path` or file-like object, optional): If not + specified, then the object is dumped to a str, otherwise to a file + specified by the filename or file-like object. + file_format (str, optional): Same as :func:`load`. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> dump('hello world', '/path/of/your/file') # disk + >>> dump('hello world', 's3://path/of/your/file') # ceph or petrel + + Returns: + bool: True for success, False otherwise. 
+ """ + if isinstance(file, Path): + file = str(file) + if file_format is None: + if is_str(file): + file_format = file.split('.')[-1] + elif file is None: + raise ValueError( + 'file_format must be specified since file is None') + if file_format not in file_handlers: + raise TypeError(f'Unsupported format: {file_format}') + + handler = file_handlers[file_format] + if file is None: + return handler.dump_to_str(obj, **kwargs) + elif is_str(file): + file_client = FileClient.infer_client(file_client_args, file) + if handler.str_like: + with StringIO() as f: + handler.dump_to_fileobj(obj, f, **kwargs) + file_client.put_text(f.getvalue(), file) + else: + with BytesIO() as f: + handler.dump_to_fileobj(obj, f, **kwargs) + file_client.put(f.getvalue(), file) + elif hasattr(file, 'write'): + handler.dump_to_fileobj(obj, file, **kwargs) + else: + raise TypeError('"file" must be a filename str or a file-object') + + +def _register_handler(handler, file_formats): + """Register a handler for some file extensions. + + Args: + handler (:obj:`BaseFileHandler`): Handler to be registered. + file_formats (str or list[str]): File formats to be handled by this + handler. + """ + if not isinstance(handler, BaseFileHandler): + raise TypeError( + f'handler must be a child of BaseFileHandler, not {type(handler)}') + if isinstance(file_formats, str): + file_formats = [file_formats] + if not is_list_of(file_formats, str): + raise TypeError('file_formats must be a str or a list of str') + for ext in file_formats: + file_handlers[ext] = handler + + +def register_handler(file_formats, **kwargs): + + def wrap(cls): + _register_handler(cls(**kwargs), file_formats) + return cls + + return wrap diff --git a/mmcv/fileio/parse.py b/mmcv/fileio/parse.py new file mode 100644 index 0000000..f60f0d6 --- /dev/null +++ b/mmcv/fileio/parse.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +from io import StringIO + +from .file_client import FileClient + + +def list_from_file(filename, + prefix='', + offset=0, + max_num=0, + encoding='utf-8', + file_client_args=None): + """Load a text file and parse the content as a list of strings. + + Note: + In v1.3.16 and later, ``list_from_file`` supports loading a text file + which can be storaged in different backends and parsing the content as + a list for strings. + + Args: + filename (str): Filename. + prefix (str): The prefix to be inserted to the beginning of each item. + offset (int): The offset of lines. + max_num (int): The maximum number of lines to be read, + zeros and negatives mean no limitation. + encoding (str): Encoding used to open the file. Default utf-8. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> list_from_file('/path/of/your/file') # disk + ['hello', 'world'] + >>> list_from_file('s3://path/of/your/file') # ceph or petrel + ['hello', 'world'] + + Returns: + list[str]: A list of strings. + """ + cnt = 0 + item_list = [] + file_client = FileClient.infer_client(file_client_args, filename) + with StringIO(file_client.get_text(filename, encoding)) as f: + for _ in range(offset): + f.readline() + for line in f: + if 0 < max_num <= cnt: + break + item_list.append(prefix + line.rstrip('\n\r')) + cnt += 1 + return item_list + + +def dict_from_file(filename, + key_type=str, + encoding='utf-8', + file_client_args=None): + """Load a text file and parse the content as a dict. + + Each line of the text file will be two or more columns split by + whitespaces or tabs. The first column will be parsed as dict keys, and + the following columns will be parsed as dict values. + + Note: + In v1.3.16 and later, ``dict_from_file`` supports loading a text file + which can be storaged in different backends and parsing the content as + a dict. + + Args: + filename(str): Filename. 
+ key_type(type): Type of the dict keys. str is user by default and + type conversion will be performed if specified. + encoding (str): Encoding used to open the file. Default utf-8. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> dict_from_file('/path/of/your/file') # disk + {'key1': 'value1', 'key2': 'value2'} + >>> dict_from_file('s3://path/of/your/file') # ceph or petrel + {'key1': 'value1', 'key2': 'value2'} + + Returns: + dict: The parsed contents. + """ + mapping = {} + file_client = FileClient.infer_client(file_client_args, filename) + with StringIO(file_client.get_text(filename, encoding)) as f: + for line in f: + items = line.rstrip('\n').split() + assert len(items) >= 2 + key = key_type(items[0]) + val = items[1:] if len(items) > 2 else items[1] + mapping[key] = val + return mapping diff --git a/mmcv/image/__init__.py b/mmcv/image/__init__.py index 92ecec4..d0051d6 100644 --- a/mmcv/image/__init__.py +++ b/mmcv/image/__init__.py @@ -9,10 +9,10 @@ from .geometric import (cutout, imcrop, imflip, imflip_, impad, from .io import imfrombytes, imread, imwrite, supported_backends, use_backend from .misc import tensor2imgs from .photometric import (adjust_brightness, adjust_color, adjust_contrast, - adjust_hue, adjust_lighting, adjust_sharpness, - auto_contrast, clahe, imdenormalize, imequalize, - iminvert, imnormalize, imnormalize_, lut_transform, - posterize, solarize) + adjust_lighting, adjust_sharpness, auto_contrast, + clahe, imdenormalize, imequalize, iminvert, + imnormalize, imnormalize_, lut_transform, posterize, + solarize) __all__ = [ 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', @@ -24,6 +24,5 @@ __all__ = [ 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize', 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', - 
'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting', - 'adjust_hue' + 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting' ] diff --git a/mmcv/image/colorspace.py b/mmcv/image/colorspace.py index 08f9952..8145339 100644 --- a/mmcv/image/colorspace.py +++ b/mmcv/image/colorspace.py @@ -1,11 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable, Union - import cv2 import numpy as np -def imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray: +def imconvert(img, src, dst): """Convert an image from the src colorspace to dst colorspace. Args: @@ -21,7 +19,7 @@ def imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray: return out_img -def bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: +def bgr2gray(img, keepdim=False): """Convert a BGR image to grayscale image. Args: @@ -38,7 +36,7 @@ def bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: return out_img -def rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: +def rgb2gray(img, keepdim=False): """Convert a RGB image to grayscale image. Args: @@ -55,7 +53,7 @@ def rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: return out_img -def gray2bgr(img: np.ndarray) -> np.ndarray: +def gray2bgr(img): """Convert a grayscale image to BGR image. Args: @@ -69,7 +67,7 @@ def gray2bgr(img: np.ndarray) -> np.ndarray: return out_img -def gray2rgb(img: np.ndarray) -> np.ndarray: +def gray2rgb(img): """Convert a grayscale image to RGB image. Args: @@ -83,7 +81,7 @@ def gray2rgb(img: np.ndarray) -> np.ndarray: return out_img -def _convert_input_type_range(img: np.ndarray) -> np.ndarray: +def _convert_input_type_range(img): """Convert the type and range of the input image. It converts the input image to np.float32 type and range of [0, 1]. 
@@ -111,8 +109,7 @@ def _convert_input_type_range(img: np.ndarray) -> np.ndarray: return img -def _convert_output_type_range( - img: np.ndarray, dst_type: Union[np.uint8, np.float32]) -> np.ndarray: +def _convert_output_type_range(img, dst_type): """Convert the type and range of the image according to dst_type. It converts the image to desired type and range. If `dst_type` is np.uint8, @@ -143,7 +140,7 @@ def _convert_output_type_range( return img.astype(dst_type) -def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: +def rgb2ycbcr(img, y_only=False): """Convert a RGB image to YCbCr image. This function produces the same results as Matlab's `rgb2ycbcr` function. @@ -163,7 +160,7 @@ def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: Returns: ndarray: The converted YCbCr image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) @@ -177,7 +174,7 @@ def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: return out_img -def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: +def bgr2ycbcr(img, y_only=False): """Convert a BGR image to YCbCr image. The bgr version of rgb2ycbcr. @@ -197,7 +194,7 @@ def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: Returns: ndarray: The converted YCbCr image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) @@ -211,7 +208,7 @@ def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: return out_img -def ycbcr2rgb(img: np.ndarray) -> np.ndarray: +def ycbcr2rgb(img): """Convert a YCbCr image to RGB image. This function produces the same results as Matlab's ycbcr2rgb function. @@ -230,7 +227,7 @@ def ycbcr2rgb(img: np.ndarray) -> np.ndarray: Returns: ndarray: The converted RGB image. The output image has the same type - and range as input image. 
+ and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) * 255 @@ -243,7 +240,7 @@ def ycbcr2rgb(img: np.ndarray) -> np.ndarray: return out_img -def ycbcr2bgr(img: np.ndarray) -> np.ndarray: +def ycbcr2bgr(img): """Convert a YCbCr image to BGR image. The bgr version of ycbcr2rgb. @@ -262,7 +259,7 @@ def ycbcr2bgr(img: np.ndarray) -> np.ndarray: Returns: ndarray: The converted BGR image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) * 255 @@ -275,11 +272,11 @@ def ycbcr2bgr(img: np.ndarray) -> np.ndarray: return out_img -def convert_color_factory(src: str, dst: str) -> Callable: +def convert_color_factory(src, dst): code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') - def convert_color(img: np.ndarray) -> np.ndarray: + def convert_color(img): out_img = cv2.cvtColor(img, code) return out_img diff --git a/mmcv/image/geometric.py b/mmcv/image/geometric.py index f35299b..cf97c20 100644 --- a/mmcv/image/geometric.py +++ b/mmcv/image/geometric.py @@ -1,11 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. import numbers -from typing import List, Optional, Tuple, Union, no_type_check import cv2 import numpy as np -from mmengine.utils import to_2tuple +from ..utils import to_2tuple from .io import imread_backend try: @@ -14,10 +13,7 @@ except ImportError: Image = None -def _scale_size( - size: Tuple[int, int], - scale: Union[float, int, tuple], -) -> Tuple[int, int]: +def _scale_size(size, scale): """Rescale a size by a ratio. 
Args: @@ -41,47 +37,23 @@ cv2_interp_codes = { 'lanczos': cv2.INTER_LANCZOS4 } -cv2_border_modes = { - 'constant': cv2.BORDER_CONSTANT, - 'replicate': cv2.BORDER_REPLICATE, - 'reflect': cv2.BORDER_REFLECT, - 'wrap': cv2.BORDER_WRAP, - 'reflect_101': cv2.BORDER_REFLECT_101, - 'transparent': cv2.BORDER_TRANSPARENT, - 'isolated': cv2.BORDER_ISOLATED -} - -# Pillow >=v9.1.0 use a slightly different naming scheme for filters. -# Set pillow_interp_codes according to the naming scheme used. if Image is not None: - if hasattr(Image, 'Resampling'): - pillow_interp_codes = { - 'nearest': Image.Resampling.NEAREST, - 'bilinear': Image.Resampling.BILINEAR, - 'bicubic': Image.Resampling.BICUBIC, - 'box': Image.Resampling.BOX, - 'lanczos': Image.Resampling.LANCZOS, - 'hamming': Image.Resampling.HAMMING - } - else: - pillow_interp_codes = { - 'nearest': Image.NEAREST, - 'bilinear': Image.BILINEAR, - 'bicubic': Image.BICUBIC, - 'box': Image.BOX, - 'lanczos': Image.LANCZOS, - 'hamming': Image.HAMMING - } - - -def imresize( - img: np.ndarray, - size: Tuple[int, int], - return_scale: bool = False, - interpolation: str = 'bilinear', - out: Optional[np.ndarray] = None, - backend: Optional[str] = None -) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: + pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } + + +def imresize(img, + size, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): """Resize image to a given size. Args: @@ -98,7 +70,7 @@ def imresize( Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. + `resized_img`. 
""" h, w = img.shape[:2] if backend is None: @@ -123,18 +95,15 @@ def imresize( return resized_img, w_scale, h_scale -@no_type_check -def imresize_to_multiple( - img: np.ndarray, - divisor: Union[int, Tuple[int, int]], - size: Union[int, Tuple[int, int], None] = None, - scale_factor: Union[float, Tuple[float, float], None] = None, - keep_ratio: bool = False, - return_scale: bool = False, - interpolation: str = 'bilinear', - out: Optional[np.ndarray] = None, - backend: Optional[str] = None -) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: +def imresize_to_multiple(img, + divisor, + size=None, + scale_factor=None, + keep_ratio=False, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): """Resize image according to a given size or scale factor and then rounds up the the resized or rescaled image size to the nearest value that can be divided by the divisor. @@ -161,7 +130,7 @@ def imresize_to_multiple( Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. + `resized_img`. """ h, w = img.shape[:2] if size is not None and scale_factor is not None: @@ -176,7 +145,7 @@ def imresize_to_multiple( size = _scale_size((w, h), scale_factor) divisor = to_2tuple(divisor) - size = tuple(int(np.ceil(s / d)) * d for s, d in zip(size, divisor)) + size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)]) resized_img, w_scale, h_scale = imresize( img, size, @@ -190,13 +159,11 @@ def imresize_to_multiple( return resized_img -def imresize_like( - img: np.ndarray, - dst_img: np.ndarray, - return_scale: bool = False, - interpolation: str = 'bilinear', - backend: Optional[str] = None -) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: +def imresize_like(img, + dst_img, + return_scale=False, + interpolation='bilinear', + backend=None): """Resize image to the same size of a given image. 
Args: @@ -208,15 +175,13 @@ def imresize_like( Returns: tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. + `resized_img`. """ h, w = dst_img.shape[:2] return imresize(img, (w, h), return_scale, interpolation, backend=backend) -def rescale_size(old_size: tuple, - scale: Union[float, int, tuple], - return_scale: bool = False) -> tuple: +def rescale_size(old_size, scale, return_scale=False): """Calculate the new size to be rescaled to. Args: @@ -253,13 +218,11 @@ def rescale_size(old_size: tuple, return new_size -def imrescale( - img: np.ndarray, - scale: Union[float, Tuple[int, int]], - return_scale: bool = False, - interpolation: str = 'bilinear', - backend: Optional[str] = None -) -> Union[np.ndarray, Tuple[np.ndarray, float]]: +def imrescale(img, + scale, + return_scale=False, + interpolation='bilinear', + backend=None): """Resize image while keeping the aspect ratio. Args: @@ -286,7 +249,7 @@ def imrescale( return rescaled_img -def imflip(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray: +def imflip(img, direction='horizontal'): """Flip an image horizontally or vertically. Args: @@ -306,7 +269,7 @@ def imflip(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray: return np.flip(img, axis=(0, 1)) -def imflip_(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray: +def imflip_(img, direction='horizontal'): """Inplace flip an image horizontally or vertically. Args: @@ -326,33 +289,30 @@ def imflip_(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray: return cv2.flip(img, -1, img) -def imrotate(img: np.ndarray, - angle: float, - center: Optional[Tuple[float, float]] = None, - scale: float = 1.0, - border_value: int = 0, - interpolation: str = 'bilinear', - auto_bound: bool = False, - border_mode: str = 'constant') -> np.ndarray: +def imrotate(img, + angle, + center=None, + scale=1.0, + border_value=0, + interpolation='bilinear', + auto_bound=False): """Rotate an image. 
Args: - img (np.ndarray): Image to be rotated. + img (ndarray): Image to be rotated. angle (float): Rotation angle in degrees, positive values mean clockwise rotation. center (tuple[float], optional): Center point (w, h) of the rotation in the source image. If not specified, the center of the image will be used. scale (float): Isotropic scale factor. - border_value (int): Border value used in case of a constant border. - Defaults to 0. + border_value (int): Border value. interpolation (str): Same as :func:`resize`. auto_bound (bool): Whether to adjust the image size to cover the whole rotated image. - border_mode (str): Pixel extrapolation method. Defaults to 'constant'. Returns: - np.ndarray: The rotated image. + ndarray: The rotated image. """ if center is not None and auto_bound: raise ValueError('`auto_bound` conflicts with `center`') @@ -375,12 +335,11 @@ def imrotate(img: np.ndarray, img, matrix, (w, h), flags=cv2_interp_codes[interpolation], - borderMode=cv2_border_modes[border_mode], borderValue=border_value) return rotated -def bbox_clip(bboxes: np.ndarray, img_shape: Tuple[int, int]) -> np.ndarray: +def bbox_clip(bboxes, img_shape): """Clip bboxes to fit the image shape. Args: @@ -398,9 +357,7 @@ def bbox_clip(bboxes: np.ndarray, img_shape: Tuple[int, int]) -> np.ndarray: return clipped_bboxes -def bbox_scaling(bboxes: np.ndarray, - scale: float, - clip_shape: Optional[Tuple[int, int]] = None) -> np.ndarray: +def bbox_scaling(bboxes, scale, clip_shape=None): """Scaling bboxes w.r.t the box center. Args: @@ -426,12 +383,7 @@ def bbox_scaling(bboxes: np.ndarray, return scaled_bboxes -def imcrop( - img: np.ndarray, - bboxes: np.ndarray, - scale: float = 1.0, - pad_fill: Union[float, list, None] = None -) -> Union[np.ndarray, List[np.ndarray]]: +def imcrop(img, bboxes, scale=1.0, pad_fill=None): """Crop image patches. 3 steps: scale the bboxes -> clip bboxes -> crop and pad. @@ -440,7 +392,7 @@ def imcrop( img (ndarray): Image to be cropped. 
bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes. scale (float, optional): Scale ratio of bboxes, the default value - 1.0 means no scaling. + 1.0 means no padding. pad_fill (Number | list[Number]): Value to be filled for padding. Default: None, which means no padding. @@ -464,12 +416,10 @@ def imcrop( patch = img[y1:y2 + 1, x1:x2 + 1, ...] else: _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :]) - patch_h = _y2 - _y1 + 1 - patch_w = _x2 - _x1 + 1 if chn == 1: - patch_shape = (patch_h, patch_w) + patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1) else: - patch_shape = (patch_h, patch_w, chn) # type: ignore + patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1, chn) patch = np.array( pad_fill, dtype=img.dtype) * np.ones( patch_shape, dtype=img.dtype) @@ -487,12 +437,12 @@ def imcrop( return patches -def impad(img: np.ndarray, +def impad(img, *, - shape: Optional[Tuple[int, int]] = None, - padding: Union[int, tuple, None] = None, - pad_val: Union[float, List] = 0, - padding_mode: str = 'constant') -> np.ndarray: + shape=None, + padding=None, + pad_val=0, + padding_mode='constant'): """Pad the given image to a certain shape or pad on all sides with specified padding mode and padding value. @@ -512,16 +462,16 @@ def impad(img: np.ndarray, reflect or symmetric. Default: constant. - constant: pads with a constant value, this value is specified - with pad_val. + with pad_val. - edge: pads with the last value at the edge of the image. - - reflect: pads with reflection of image without repeating the last - value on the edge. For example, padding [1, 2, 3, 4] with 2 - elements on both sides in reflect mode will result in - [3, 2, 1, 2, 3, 4, 3, 2]. - - symmetric: pads with reflection of image repeating the last value - on the edge. For example, padding [1, 2, 3, 4] with 2 elements on - both sides in symmetric mode will result in - [2, 1, 1, 2, 3, 4, 4, 3] + - reflect: pads with reflection of image without repeating the + last value on the edge. 
For example, padding [1, 2, 3, 4] + with 2 elements on both sides in reflect mode will result + in [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with + 2 elements on both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] Returns: ndarray: The padded image. @@ -529,9 +479,7 @@ def impad(img: np.ndarray, assert (shape is not None) ^ (padding is not None) if shape is not None: - width = max(shape[1] - img.shape[1], 0) - height = max(shape[0] - img.shape[0], 0) - padding = (0, 0, width, height) + padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0]) # check pad_val if isinstance(pad_val, tuple): @@ -571,9 +519,7 @@ def impad(img: np.ndarray, return img -def impad_to_multiple(img: np.ndarray, - divisor: int, - pad_val: Union[float, List] = 0) -> np.ndarray: +def impad_to_multiple(img, divisor, pad_val=0): """Pad an image to ensure each edge to be multiple to some number. Args: @@ -589,9 +535,7 @@ def impad_to_multiple(img: np.ndarray, return impad(img, shape=(pad_h, pad_w), pad_val=pad_val) -def cutout(img: np.ndarray, - shape: Union[int, Tuple[int, int]], - pad_val: Union[int, float, tuple] = 0) -> np.ndarray: +def cutout(img, shape, pad_val=0): """Randomly cut out a rectangle from the original img. Args: @@ -635,7 +579,7 @@ def cutout(img: np.ndarray, if img.ndim == 2: patch_shape = (y2 - y1, x2 - x1) else: - patch_shape = (y2 - y1, x2 - x1, channels) # type: ignore + patch_shape = (y2 - y1, x2 - x1, channels) img_cutout = img.copy() patch = np.array( @@ -646,8 +590,7 @@ def cutout(img: np.ndarray, return img_cutout -def _get_shear_matrix(magnitude: Union[int, float], - direction: str = 'horizontal') -> np.ndarray: +def _get_shear_matrix(magnitude, direction='horizontal'): """Generate the shear matrix for transformation. 
Args: @@ -665,11 +608,11 @@ def _get_shear_matrix(magnitude: Union[int, float], return shear_matrix -def imshear(img: np.ndarray, - magnitude: Union[int, float], - direction: str = 'horizontal', - border_value: Union[int, Tuple[int, int]] = 0, - interpolation: str = 'bilinear') -> np.ndarray: +def imshear(img, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): """Shear an image. Args: @@ -693,7 +636,7 @@ def imshear(img: np.ndarray, elif img.ndim == 3: channels = img.shape[-1] if isinstance(border_value, int): - border_value = tuple([border_value] * channels) # type: ignore + border_value = tuple([border_value] * channels) elif isinstance(border_value, tuple): assert len(border_value) == channels, \ 'Expected the num of elements in tuple equals the channels' \ @@ -711,13 +654,12 @@ def imshear(img: np.ndarray, # greater than 3 (e.g. shearing masks whose channels large # than 3) will raise TypeError in `cv2.warpAffine`. # Here simply slice the first 3 values in `border_value`. - borderValue=border_value[:3], # type: ignore + borderValue=border_value[:3], flags=cv2_interp_codes[interpolation]) return sheared -def _get_translate_matrix(offset: Union[int, float], - direction: str = 'horizontal') -> np.ndarray: +def _get_translate_matrix(offset, direction='horizontal'): """Generate the translate matrix. Args: @@ -735,11 +677,11 @@ def _get_translate_matrix(offset: Union[int, float], return translate_matrix -def imtranslate(img: np.ndarray, - offset: Union[int, float], - direction: str = 'horizontal', - border_value: Union[int, tuple] = 0, - interpolation: str = 'bilinear') -> np.ndarray: +def imtranslate(img, + offset, + direction='horizontal', + border_value=0, + interpolation='bilinear'): """Translate an image. Args: diff --git a/mmcv/image/io.py b/mmcv/image/io.py index e10d443..d47aaa8 100644 --- a/mmcv/image/io.py +++ b/mmcv/image/io.py @@ -1,16 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import io import os.path as osp -import warnings from pathlib import Path -from typing import Optional, Union import cv2 -import mmengine.fileio as fileio import numpy as np from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION, IMREAD_UNCHANGED) -from mmengine.utils import is_filepath, is_str + +from mmcv.utils import check_file_exist, is_str, mkdir_or_exist try: from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG @@ -42,7 +40,7 @@ imread_flags = { imread_backend = 'cv2' -def use_backend(backend: str) -> None: +def use_backend(backend): """Select a backend for image decoding. Args: @@ -68,7 +66,7 @@ def use_backend(backend: str) -> None: raise ImportError('`tifffile` is not installed') -def _jpegflag(flag: str = 'color', channel_order: str = 'bgr'): +def _jpegflag(flag='color', channel_order='bgr'): channel_order = channel_order.lower() if channel_order not in ['rgb', 'bgr']: raise ValueError('channel order must be either "rgb" or "bgr"') @@ -84,9 +82,7 @@ def _jpegflag(flag: str = 'color', channel_order: str = 'bgr'): raise ValueError('flag must be "color" or "grayscale"') -def _pillow2array(img, - flag: str = 'color', - channel_order: str = 'bgr') -> np.ndarray: +def _pillow2array(img, flag='color', channel_order='bgr'): """Convert a pillow image to numpy array. Args: @@ -141,13 +137,7 @@ def _pillow2array(img, return array -def imread(img_or_path: Union[np.ndarray, str, Path], - flag: str = 'color', - channel_order: str = 'bgr', - backend: Optional[str] = None, - file_client_args: Optional[dict] = None, - *, - backend_args: Optional[dict] = None) -> np.ndarray: +def imread(img_or_path, flag='color', channel_order='bgr', backend=None): """Read an image. Args: @@ -167,117 +157,78 @@ def imread(img_or_path: Union[np.ndarray, str, Path], `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. 
- file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmengine.fileio.FileClient` for details. - Default: None. It will be deprecated in future. Please use - ``backend_args`` instead. - Deprecated in version 2.0.0rc4. - backend_args (dict, optional): Instantiates the corresponding file - backend. It may contain `backend` key to specify the file - backend. If it contains, the file backend corresponding to this - value will be used and initialized with the remaining values, - otherwise the corresponding file backend will be selected - based on the prefix of the file path. Defaults to None. - New in version 2.0.0rc4. Returns: ndarray: Loaded image array. - - Examples: - >>> import mmcv - >>> img_path = '/path/to/img.jpg' - >>> img = mmcv.imread(img_path) - >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb', - ... backend='cv2') - >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr', - ... backend='pillow') - >>> s3_img_path = 's3://bucket/img.jpg' - >>> # infer the file backend by the prefix s3 - >>> img = mmcv.imread(s3_img_path) - >>> # manually set the file backend petrel - >>> img = mmcv.imread(s3_img_path, backend_args={ - ... 'backend': 'petrel'}) - >>> http_img_path = 'http://path/to/img.jpg' - >>> img = mmcv.imread(http_img_path) - >>> img = mmcv.imread(http_img_path, backend_args={ - ... 'backend': 'http'}) """ - if file_client_args is not None: - warnings.warn( - '"file_client_args" will be deprecated in future. ' - 'Please use "backend_args" instead', DeprecationWarning) - if backend_args is not None: - raise ValueError( - '"file_client_args" and "backend_args" cannot be set at the ' - 'same time.') + if backend is None: + backend = imread_backend + if backend not in supported_backends: + raise ValueError(f'backend: {backend} is not supported. 
Supported ' + "backends are 'cv2', 'turbojpeg', 'pillow'") if isinstance(img_or_path, Path): img_or_path = str(img_or_path) if isinstance(img_or_path, np.ndarray): return img_or_path elif is_str(img_or_path): - if file_client_args is not None: - file_client = fileio.FileClient.infer_client( - file_client_args, img_or_path) - img_bytes = file_client.get(img_or_path) + check_file_exist(img_or_path, + f'img file does not exist: {img_or_path}') + if backend == 'turbojpeg': + with open(img_or_path, 'rb') as in_file: + img = jpeg.decode(in_file.read(), + _jpegflag(flag, channel_order)) + if img.shape[-1] == 1: + img = img[:, :, 0] + return img + elif backend == 'pillow': + img = Image.open(img_or_path) + img = _pillow2array(img, flag, channel_order) + return img + elif backend == 'tifffile': + img = tifffile.imread(img_or_path) + return img else: - img_bytes = fileio.get(img_or_path, backend_args=backend_args) - return imfrombytes(img_bytes, flag, channel_order, backend) + flag = imread_flags[flag] if is_str(flag) else flag + img = cv2.imread(img_or_path, flag) + if flag == IMREAD_COLOR and channel_order == 'rgb': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + return img else: raise TypeError('"img" must be a numpy array or a str or ' 'a pathlib.Path object') -def imfrombytes(content: bytes, - flag: str = 'color', - channel_order: str = 'bgr', - backend: Optional[str] = None) -> np.ndarray: +def imfrombytes(content, flag='color', channel_order='bgr', backend=None): """Read an image from bytes. Args: content (bytes): Image bytes got from files or other streams. flag (str): Same as :func:`imread`. - channel_order (str): The channel order of the output, candidates - are 'bgr' and 'rgb'. Default to 'bgr'. backend (str | None): The image decoding backend type. Options are - `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is - None, the global imread_backend specified by ``mmcv.use_backend()`` - will be used. Default: None. + `cv2`, `pillow`, `turbojpeg`, `None`. 
If backend is None, the + global imread_backend specified by ``mmcv.use_backend()`` will be + used. Default: None. Returns: ndarray: Loaded image array. - - Examples: - >>> img_path = '/path/to/img.jpg' - >>> with open(img_path, 'rb') as f: - >>> img_buff = f.read() - >>> img = mmcv.imfrombytes(img_buff) - >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb') - >>> img = mmcv.imfrombytes(img_buff, backend='pillow') - >>> img = mmcv.imfrombytes(img_buff, backend='cv2') """ if backend is None: backend = imread_backend if backend not in supported_backends: - raise ValueError( - f'backend: {backend} is not supported. Supported ' - "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'") + raise ValueError(f'backend: {backend} is not supported. Supported ' + "backends are 'cv2', 'turbojpeg', 'pillow'") if backend == 'turbojpeg': - img = jpeg.decode( # type: ignore - content, _jpegflag(flag, channel_order)) + img = jpeg.decode(content, _jpegflag(flag, channel_order)) if img.shape[-1] == 1: img = img[:, :, 0] return img elif backend == 'pillow': - with io.BytesIO(content) as buff: - img = Image.open(buff) - img = _pillow2array(img, flag, channel_order) - return img - elif backend == 'tifffile': - with io.BytesIO(content) as buff: - img = tifffile.imread(buff) + buff = io.BytesIO(content) + img = Image.open(buff) + img = _pillow2array(img, flag, channel_order) return img else: img_np = np.frombuffer(content, np.uint8) @@ -288,77 +239,20 @@ def imfrombytes(content: bytes, return img -def imwrite(img: np.ndarray, - file_path: str, - params: Optional[list] = None, - auto_mkdir: Optional[bool] = None, - file_client_args: Optional[dict] = None, - *, - backend_args: Optional[dict] = None) -> bool: +def imwrite(img, file_path, params=None, auto_mkdir=True): """Write image to file. - Warning: - The parameter `auto_mkdir` will be deprecated in the future and every - file clients will make directory automatically. - Args: img (ndarray): Image array to be written. 
file_path (str): Image file path. params (None or list): Same as opencv :func:`imwrite` interface. auto_mkdir (bool): If the parent folder of `file_path` does not exist, - whether to create it automatically. It will be deprecated. - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmengine.fileio.FileClient` for details. - Default: None. It will be deprecated in future. Please use - ``backend_args`` instead. - Deprecated in version 2.0.0rc4. - backend_args (dict, optional): Instantiates the corresponding file - backend. It may contain `backend` key to specify the file - backend. If it contains, the file backend corresponding to this - value will be used and initialized with the remaining values, - otherwise the corresponding file backend will be selected - based on the prefix of the file path. Defaults to None. - New in version 2.0.0rc4. + whether to create it automatically. Returns: bool: Successful or not. - - Examples: - >>> # write to hard disk client - >>> ret = mmcv.imwrite(img, '/path/to/img.jpg') - >>> # infer the file backend by the prefix s3 - >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg') - >>> # manually set the file backend petrel - >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', backend_args={ - ... 'backend': 'petrel'}) """ - if file_client_args is not None: - warnings.warn( - '"file_client_args" will be deprecated in future. ' - 'Please use "backend_args" instead', DeprecationWarning) - if backend_args is not None: - raise ValueError( - '"file_client_args" and "backend_args" cannot be set at the ' - 'same time.') - - assert is_filepath(file_path) - file_path = str(file_path) - if auto_mkdir is not None: - warnings.warn( - 'The parameter `auto_mkdir` will be deprecated in the future and ' - 'every file clients will make directory automatically.') - - img_ext = osp.splitext(file_path)[-1] - # Encode image according to image suffix. 
- # For example, if image path is '/path/your/img.jpg', the encode - # format is '.jpg'. - flag, img_buff = cv2.imencode(img_ext, img, params) - - if file_client_args is not None: - file_client = fileio.FileClient.infer_client(file_client_args, - file_path) - file_client.put(img_buff.tobytes(), file_path) - else: - fileio.put(img_buff.tobytes(), file_path, backend_args=backend_args) - - return flag + if auto_mkdir: + dir_name = osp.abspath(osp.dirname(file_path)) + mkdir_or_exist(dir_name) + return cv2.imwrite(file_path, img, params) diff --git a/mmcv/image/misc.py b/mmcv/image/misc.py index e923cad..dfc4a9c 100644 --- a/mmcv/image/misc.py +++ b/mmcv/image/misc.py @@ -1,6 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional - import numpy as np import mmcv @@ -11,24 +9,18 @@ except ImportError: torch = None -def tensor2imgs(tensor, - mean: Optional[tuple] = None, - std: Optional[tuple] = None, - to_rgb: bool = True) -> list: - """Convert tensor to 3-channel images or 1-channel gray images. +def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): + """Convert tensor to 3-channel images. Args: tensor (torch.Tensor): Tensor that contains multiple images, shape ( - N, C, H, W). :math:`C` can be either 3 or 1. - mean (tuple[float], optional): Mean of images. If None, - (0, 0, 0) will be used for tensor with 3-channel, - while (0, ) for tensor with 1-channel. Defaults to None. - std (tuple[float], optional): Standard deviation of images. If None, - (1, 1, 1) will be used for tensor with 3-channel, - while (1, ) for tensor with 1-channel. Defaults to None. + N, C, H, W). + mean (tuple[float], optional): Mean of images. Defaults to (0, 0, 0). + std (tuple[float], optional): Standard deviation of images. + Defaults to (1, 1, 1). to_rgb (bool, optional): Whether the tensor was converted to RGB format in the first place. If so, convert it back to BGR. - For the tensor with 1 channel, it must be False. Defaults to True. 
+ Defaults to True. Returns: list[np.ndarray]: A list that contains multiple images. @@ -37,14 +29,8 @@ def tensor2imgs(tensor, if torch is None: raise RuntimeError('pytorch is not installed') assert torch.is_tensor(tensor) and tensor.ndim == 4 - channels = tensor.size(1) - assert channels in [1, 3] - if mean is None: - mean = (0, ) * channels - if std is None: - std = (1, ) * channels - assert (channels == len(mean) == len(std) == 3) or \ - (channels == len(mean) == len(std) == 1 and not to_rgb) + assert len(mean) == 3 + assert len(std) == 3 num_imgs = tensor.size(0) mean = np.array(mean, dtype=np.float32) diff --git a/mmcv/image/photometric.py b/mmcv/image/photometric.py index 12cbb90..5085d01 100644 --- a/mmcv/image/photometric.py +++ b/mmcv/image/photometric.py @@ -1,14 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -import warnings -from typing import Optional - import cv2 import numpy as np -from mmengine.utils import is_tuple_of -from PIL import Image, ImageEnhance +from ..utils import is_tuple_of from .colorspace import bgr2gray, gray2bgr -from .io import imread_backend def imnormalize(img, mean, std, to_rgb=True): @@ -102,7 +97,7 @@ def posterize(img, bits): return img -def adjust_color(img, alpha=1, beta=None, gamma=0, backend=None): +def adjust_color(img, alpha=1, beta=None, gamma=0): r"""It blends the source image and its gray image: .. math:: @@ -115,41 +110,22 @@ def adjust_color(img, alpha=1, beta=None, gamma=0, backend=None): If None, it's assigned the value (1 - `alpha`). gamma (int | float): Scalar added to each sum. Same as :func:`cv2.addWeighted`. Default 0. - backend (str | None): The image processing backend type. Options are - `cv2`, `pillow`, `None`. If backend is None, the global - ``imread_backend`` specified by ``mmcv.use_backend()`` will be - used. Defaults to None. Returns: ndarray: Colored image which has the same size and dtype as input. 
""" - if backend is None: - backend = imread_backend - if backend not in ['cv2', 'pillow']: - raise ValueError(f'backend: {backend} is not supported.' - f"Supported backends are 'cv2', 'pillow'") - - if backend == 'pillow': - assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' - warnings.warn("Only use 'alpha' for pillow backend.") - # Image.fromarray defaultly supports RGB, not BGR. - pil_image = Image.fromarray(img[..., ::-1], mode='RGB') - enhancer = ImageEnhance.Color(pil_image) - pil_image = enhancer.enhance(alpha) - return np.array(pil_image, dtype=img.dtype)[..., ::-1] - else: - gray_img = bgr2gray(img) - gray_img = np.tile(gray_img[..., None], [1, 1, 3]) - if beta is None: - beta = 1 - alpha - colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma) - if not colored_img.dtype == np.uint8: - # Note when the dtype of `img` is not the default `np.uint8` - # (e.g. np.float32), the value in `colored_img` got from cv2 - # is not guaranteed to be in range [0, 255], so here clip - # is needed. - colored_img = np.clip(colored_img, 0, 255) - return colored_img.astype(img.dtype) + gray_img = bgr2gray(img) + gray_img = np.tile(gray_img[..., None], [1, 1, 3]) + if beta is None: + beta = 1 - alpha + colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma) + if not colored_img.dtype == np.uint8: + # Note when the dtype of `img` is not the default `np.uint8` + # (e.g. np.float32), the value in `colored_img` got from cv2 + # is not guaranteed to be in range [0, 255], so here clip + # is needed. + colored_img = np.clip(colored_img, 0, 255) + return colored_img def imequalize(img): @@ -197,7 +173,7 @@ def imequalize(img): return equalized_img.astype(img.dtype) -def adjust_brightness(img, factor=1., backend=None): +def adjust_brightness(img, factor=1.): """Adjust image brightness. This function controls the brightness of an image. 
An @@ -214,40 +190,22 @@ def adjust_brightness(img, factor=1., backend=None): Factor 1.0 returns the original image, lower factors mean less color (brightness, contrast, etc), and higher values more. Default 1. - backend (str | None): The image processing backend type. Options are - `cv2`, `pillow`, `None`. If backend is None, the global - ``imread_backend`` specified by ``mmcv.use_backend()`` will be - used. Defaults to None. Returns: ndarray: The brightened image. """ - if backend is None: - backend = imread_backend - if backend not in ['cv2', 'pillow']: - raise ValueError(f'backend: {backend} is not supported.' - f"Supported backends are 'cv2', 'pillow'") - - if backend == 'pillow': - assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' - # Image.fromarray defaultly supports RGB, not BGR. - pil_image = Image.fromarray(img[..., ::-1], mode='RGB') - enhancer = ImageEnhance.Brightness(pil_image) - pil_image = enhancer.enhance(factor) - return np.array(pil_image, dtype=img.dtype)[..., ::-1] - else: - degenerated = np.zeros_like(img) - # Note manually convert the dtype to np.float32, to - # achieve as close results as PIL.ImageEnhance.Brightness. - # Set beta=1-factor, and gamma=0 - brightened_img = cv2.addWeighted( - img.astype(np.float32), factor, degenerated.astype(np.float32), - 1 - factor, 0) - brightened_img = np.clip(brightened_img, 0, 255) - return brightened_img.astype(img.dtype) - - -def adjust_contrast(img, factor=1., backend=None): + degenerated = np.zeros_like(img) + # Note manually convert the dtype to np.float32, to + # achieve as close results as PIL.ImageEnhance.Brightness. + # Set beta=1-factor, and gamma=0 + brightened_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + brightened_img = np.clip(brightened_img, 0, 255) + return brightened_img.astype(img.dtype) + + +def adjust_contrast(img, factor=1.): """Adjust image contrast. This function controls the contrast of an image. 
An @@ -261,38 +219,20 @@ def adjust_contrast(img, factor=1., backend=None): Args: img (ndarray): Image to be contrasted. BGR order. factor (float): Same as :func:`mmcv.adjust_brightness`. - backend (str | None): The image processing backend type. Options are - `cv2`, `pillow`, `None`. If backend is None, the global - ``imread_backend`` specified by ``mmcv.use_backend()`` will be - used. Defaults to None. Returns: ndarray: The contrasted image. """ - if backend is None: - backend = imread_backend - if backend not in ['cv2', 'pillow']: - raise ValueError(f'backend: {backend} is not supported.' - f"Supported backends are 'cv2', 'pillow'") - - if backend == 'pillow': - assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' - # Image.fromarray defaultly supports RGB, not BGR. - pil_image = Image.fromarray(img[..., ::-1], mode='RGB') - enhancer = ImageEnhance.Contrast(pil_image) - pil_image = enhancer.enhance(factor) - return np.array(pil_image, dtype=img.dtype)[..., ::-1] - else: - gray_img = bgr2gray(img) - hist = np.histogram(gray_img, 256, (0, 255))[0] - mean = round(np.sum(gray_img) / np.sum(hist)) - degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype) - degenerated = gray2bgr(degenerated) - contrasted_img = cv2.addWeighted( - img.astype(np.float32), factor, degenerated.astype(np.float32), - 1 - factor, 0) - contrasted_img = np.clip(contrasted_img, 0, 255) - return contrasted_img.astype(img.dtype) + gray_img = bgr2gray(img) + hist = np.histogram(gray_img, 256, (0, 255))[0] + mean = round(np.sum(gray_img) / np.sum(hist)) + degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype) + degenerated = gray2bgr(degenerated) + contrasted_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + contrasted_img = np.clip(contrasted_img, 0, 255) + return contrasted_img.astype(img.dtype) def auto_contrast(img, cutoff=0): @@ -486,76 +426,3 @@ def clahe(img, clip_limit=40.0, tile_grid_size=(8, 
8)): clahe = cv2.createCLAHE(clip_limit, tile_grid_size) return clahe.apply(np.array(img, dtype=np.uint8)) - - -def adjust_hue(img: np.ndarray, - hue_factor: float, - backend: Optional[str] = None) -> np.ndarray: - """Adjust hue of an image. - - The image hue is adjusted by converting the image to HSV and cyclically - shifting the intensities in the hue channel (H). The image is then - converted back to original image mode. - - `hue_factor` is the amount of shift in H channel and must be in the - interval `[-0.5, 0.5]`. - - Modified from - https://github.com/pytorch/vision/blob/main/torchvision/ - transforms/functional.py - - Args: - img (ndarray): Image to be adjusted. - hue_factor (float): How much to shift the hue channel. Should be in - [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in - HSV space in positive and negative direction respectively. - 0 means no shift. Therefore, both -0.5 and 0.5 will give an image - with complementary colors while 0 gives the original image. - backend (str | None): The image processing backend type. Options are - `cv2`, `pillow`, `None`. If backend is None, the global - ``imread_backend`` specified by ``mmcv.use_backend()`` will be - used. Defaults to None. - - Returns: - ndarray: Hue adjusted image. - """ - if backend is None: - backend = imread_backend - if backend not in ['cv2', 'pillow']: - raise ValueError(f'backend: {backend} is not supported.' - f"Supported backends are 'cv2', 'pillow'") - - if not (-0.5 <= hue_factor <= 0.5): - raise ValueError(f'hue_factor:{hue_factor} is not in [-0.5, 0.5].') - if not (isinstance(img, np.ndarray) and (img.ndim in {2, 3})): - raise TypeError('img should be ndarray with dim=[2 or 3].') - - if backend == 'pillow': - assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' - # Image.fromarray defaultly supports RGB, not BGR. 
- pil_image = Image.fromarray(img[..., ::-1], mode='RGB') - input_mode = pil_image.mode - if input_mode in {'L', '1', 'I', 'F'}: - return pil_image - - h, s, v = pil_image.convert('HSV').split() - - np_h = np.array(h, dtype=np.uint8) - # uint8 addition take cares of rotation across boundaries - with np.errstate(over='ignore'): - np_h += np.uint8(hue_factor * 255) - h = Image.fromarray(np_h, 'L') - - pil_image = Image.merge('HSV', (h, s, v)).convert(input_mode) - return np.array(pil_image, dtype=img.dtype)[..., ::-1] - else: - dtype = img.dtype - img = img.astype(np.uint8) - hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL) - h, s, v = cv2.split(hsv_img) - h = h.astype(np.uint8) - # uint8 addition take cares of rotation across boundaries - with np.errstate(over='ignore'): - h += np.uint8(hue_factor * 255) - hsv_img = cv2.merge([h, s, v]) - return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype) diff --git a/mmcv/model_zoo/deprecated.json b/mmcv/model_zoo/deprecated.json new file mode 100644 index 0000000..25cf6f2 --- /dev/null +++ b/mmcv/model_zoo/deprecated.json @@ -0,0 +1,6 @@ +{ + "resnet50_caffe": "detectron/resnet50_caffe", + "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr", + "resnet101_caffe": "detectron/resnet101_caffe", + "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr" +} diff --git a/mmcv/model_zoo/mmcls.json b/mmcv/model_zoo/mmcls.json new file mode 100644 index 0000000..c073a41 --- /dev/null +++ b/mmcv/model_zoo/mmcls.json @@ -0,0 +1,59 @@ +{ + "vgg11": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth", + "vgg13": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth", + "vgg16": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth", + "vgg19": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth", + "vgg11_bn": 
"https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth", + "vgg13_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth", + "vgg16_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth", + "vgg19_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth", + "resnet18": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.pth", + "resnet34": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_8xb32_in1k_20210831-f257d4e6.pth", + "resnet50": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.pth", + "resnet101": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth", + "resnet152": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_8xb32_in1k_20210901-4d7582fa.pth", + "resnet50_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_b32x8_imagenet_20210531-db14775a.pth", + "resnet101_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_b32x8_imagenet_20210531-6e13bcd3.pth", + "resnet152_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_b32x8_imagenet_20210531-278cf22a.pth", + "resnext50_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth", + "resnext101_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth", + "resnext101_32x8d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth", + "resnext152_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth", + 
"se-resnet50": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth", + "se-resnet101": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth", + "resnest50": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth", + "resnest101": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth", + "resnest200": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth", + "resnest269": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth", + "shufflenet_v1": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth", + "shufflenet_v2": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth", + "mobilenet_v2": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth", + "mobilenet_v3_small": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth", + "mobilenet_v3_large": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth", + "repvgg_A0": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_3rdparty_4xb64-coslr-120e_in1k_20210909-883ab98c.pth", + "repvgg_A1": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_3rdparty_4xb64-coslr-120e_in1k_20210909-24003a24.pth", + "repvgg_A2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_3rdparty_4xb64-coslr-120e_in1k_20210909-97d7695a.pth", + "repvgg_B0": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_3rdparty_4xb64-coslr-120e_in1k_20210909-446375f4.pth", + 
"repvgg_B1": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_3rdparty_4xb64-coslr-120e_in1k_20210909-750cdf67.pth", + "repvgg_B1g2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_3rdparty_4xb64-coslr-120e_in1k_20210909-344f6422.pth", + "repvgg_B1g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_3rdparty_4xb64-coslr-120e_in1k_20210909-d4c1a642.pth", + "repvgg_B2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_3rdparty_4xb64-coslr-120e_in1k_20210909-bd6b937c.pth", + "repvgg_B2g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-7b7955f0.pth", + "repvgg_B3": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-dda968bf.pth", + "repvgg_B3g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-4e54846a.pth", + "repvgg_D2se": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-D2se_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-cf3139b7.pth", + "res2net101_w26": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net101-w26-s4_3rdparty_8xb32_in1k_20210927-870b6c36.pth", + "res2net50_w14": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w14-s8_3rdparty_8xb32_in1k_20210927-bc967bf1.pth", + "res2net50_w26": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w26-s8_3rdparty_8xb32_in1k_20210927-f547a94b.pth", + "swin_tiny": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth", + "swin_small": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_small_224_b16x64_300e_imagenet_20210615_110219-7f9d988b.pth", + "swin_base": 
"https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window7_224_22kto1k-f967f799.pth", + "swin_large": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_large_patch4_window7_224_22kto1k-5f0996db.pth", + "t2t_vit_t_14": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_3rdparty_8xb64_in1k_20210928-b7c09b62.pth", + "t2t_vit_t_19": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_3rdparty_8xb64_in1k_20210928-7f1478d5.pth", + "t2t_vit_t_24": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_3rdparty_8xb64_in1k_20210928-fe95a61b.pth", + "tnt_small": "https://download.openmmlab.com/mmclassification/v0/tnt/tnt-small-p16_3rdparty_in1k_20210903-c56ee7df.pth", + "vit_base_p16": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth", + "vit_base_p32": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth", + "vit_large_p16": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth" +} diff --git a/mmcv/model_zoo/open_mmlab.json b/mmcv/model_zoo/open_mmlab.json new file mode 100644 index 0000000..8311db4 --- /dev/null +++ b/mmcv/model_zoo/open_mmlab.json @@ -0,0 +1,50 @@ +{ + "vgg16_caffe": "https://download.openmmlab.com/pretrain/third_party/vgg16_caffe-292e1171.pth", + "detectron/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth", + "detectron2/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth", + "detectron/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth", + "detectron2/resnet101_caffe": 
"https://download.openmmlab.com/pretrain/third_party/resnet101_msra-6cc46731.pth", + "detectron2/resnext101_32x8d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth", + "resnext50_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext50-32x4d-0ab1a123.pth", + "resnext101_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d-a5af3160.pth", + "resnext101_64x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth", + "contrib/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth", + "detectron/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn-9186a21c.pth", + "detectron/resnet101_gn": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn-cac0ab98.pth", + "jhu/resnet50_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_ws-15beedd8.pth", + "jhu/resnet101_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth", + "jhu/resnext50_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth", + "jhu/resnext101_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth", + "jhu/resnext50_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth", + "jhu/resnext101_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth", + "msra/hrnetv2_w18_small": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18_small-b5a04e21.pth", + "msra/hrnetv2_w18": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18-00eb2006.pth", + "msra/hrnetv2_w32": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth", + "msra/hrnetv2_w40": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w40-ed0b031c.pth", + "msra/hrnetv2_w48": 
"https://download.openmmlab.com/pretrain/third_party/hrnetv2_w48-d2186c55.pth", + "bninception_caffe": "https://download.openmmlab.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth", + "kin400/i3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth", + "kin400/nl3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth", + "res2net101_v1d_26w_4s": "https://download.openmmlab.com/pretrain/third_party/res2net101_v1d_26w_4s_mmdetv2-f0a600f9.pth", + "regnetx_400mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_400mf-a5b10d96.pth", + "regnetx_800mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth", + "regnetx_1.6gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_1.6gf-5791c176.pth", + "regnetx_3.2gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth", + "regnetx_4.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_4.0gf-a88f671e.pth", + "regnetx_6.4gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_6.4gf-006af45d.pth", + "regnetx_8.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_8.0gf-3c68abe7.pth", + "regnetx_12gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_12gf-4c2a3350.pth", + "resnet18_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet18_v1c-b5776b93.pth", + "resnet50_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet50_v1c-2cccc1ad.pth", + "resnet101_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet101_v1c-e67eebb6.pth", + "mmedit/vgg16": "https://download.openmmlab.com/mmediting/third_party/vgg_state_dict.pth", + "mmedit/res34_en_nomixup": "https://download.openmmlab.com/mmediting/third_party/model_best_resnet34_En_nomixup.pth", + "mmedit/mobilenet_v2": "https://download.openmmlab.com/mmediting/third_party/mobilenet_v2.pth", + 
"contrib/mobilenet_v3_large": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_large-bc2c3fd3.pth", + "contrib/mobilenet_v3_small": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_small-47085aa1.pth", + "resnest50": "https://download.openmmlab.com/pretrain/third_party/resnest50_d2-7497a55b.pth", + "resnest101": "https://download.openmmlab.com/pretrain/third_party/resnest101_d2-f3b931b2.pth", + "resnest200": "https://download.openmmlab.com/pretrain/third_party/resnest200_d2-ca88e41f.pth", + "darknet53": "https://download.openmmlab.com/pretrain/third_party/darknet53-a628ea1b.pth", + "mmdet/mobilenet_v2": "https://download.openmmlab.com/mmdetection/v2.0/third_party/mobilenet_v2_batch256_imagenet-ff34753d.pth" +} diff --git a/mmcv/onnx/__init__.py b/mmcv/onnx/__init__.py new file mode 100644 index 0000000..0d7eb5b --- /dev/null +++ b/mmcv/onnx/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .info import is_custom_op_loaded +from .symbolic import register_extra_symbolics + +__all__ = ['register_extra_symbolics', 'is_custom_op_loaded'] diff --git a/mmcv/onnx/info.py b/mmcv/onnx/info.py new file mode 100644 index 0000000..e599973 --- /dev/null +++ b/mmcv/onnx/info.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os + +import torch + + +def is_custom_op_loaded(): + flag = False + try: + from ..tensorrt import is_tensorrt_plugin_loaded + flag = is_tensorrt_plugin_loaded() + except (ImportError, ModuleNotFoundError): + pass + if not flag: + try: + from ..ops import get_onnxruntime_op_path + ort_lib_path = get_onnxruntime_op_path() + flag = os.path.exists(ort_lib_path) + except (ImportError, ModuleNotFoundError): + pass + return flag or torch.__version__ == 'parrots' diff --git a/mmcv/onnx/onnx_utils/__init__.py b/mmcv/onnx/onnx_utils/__init__.py new file mode 100644 index 0000000..ef101fe --- /dev/null +++ b/mmcv/onnx/onnx_utils/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmcv/onnx/onnx_utils/symbolic_helper.py b/mmcv/onnx/onnx_utils/symbolic_helper.py new file mode 100644 index 0000000..a9a31eb --- /dev/null +++ b/mmcv/onnx/onnx_utils/symbolic_helper.py @@ -0,0 +1,331 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Modified from https://github.com/pytorch/pytorch.""" +import warnings +from functools import wraps +from sys import maxsize + +import torch +import torch.onnx +# This import monkey-patches graph manipulation methods on Graph, used for the +# ONNX symbolics +import torch.onnx.utils +from torch._C import ListType + +# --------------------------------------------------------------------------------- +# Helper functions +# --------------------------------------------------------------------------------- + +# Save some builtins as locals, because we'll shadown them below +_sum = sum + + +def _parse_arg(value, desc): + if desc == 'none': + return value + if desc == 'v' or not _is_value(value): + return value + if value.node().mustBeNone(): + return None + if value.node().kind() == 'onnx::Constant': + tval = value.node()['value'] + if desc == 'i': + return int(tval) + elif desc == 'f': + return float(tval) + elif desc == 'b': + return bool(tval) + elif desc == 's': + return str(tval) + elif desc == 't': + return 
tval + elif desc == 'is': + return [int(v) for v in tval] + elif desc == 'fs': + return [float(v) for v in tval] + else: + raise RuntimeError( + "ONNX symbolic doesn't know to interpret Constant node") + elif value.node().kind() == 'prim::ListConstruct': + if desc == 'is': + for v in value.node().inputs(): + if v.node().kind() != 'onnx::Constant': + raise RuntimeError( + "Failed to export an ONNX attribute '" + + v.node().kind() + + "', since it's not constant, please try to make " + 'things (e.g., kernel size) static if possible') + return [int(v.node()['value']) for v in value.node().inputs()] + else: + raise RuntimeError( + "ONNX symbolic doesn't know to interpret ListConstruct node") + + raise RuntimeError('Unexpected node type: {}'.format(value.node().kind())) + + +def _maybe_get_const(value, desc): + if _is_value(value) and value.node().kind() == 'onnx::Constant': + return _parse_arg(value, desc) + return value + + +def _maybe_get_scalar(value): + value_t = _maybe_get_const(value, 't') + if isinstance(value_t, torch.Tensor) and value_t.shape == (): + return value_t + return value + + +def _get_const(value, desc, arg_name): + if _is_value(value) and value.node().kind() not in ('onnx::Constant', + 'prim::Constant'): + raise RuntimeError('ONNX symbolic expected a constant' + ' value of the {} argument, got `{}`'.format( + arg_name, value)) + return _parse_arg(value, desc) + + +def _unpack_list(list_value): + list_node = list_value.node() + assert list_node.kind() == 'prim::ListConstruct' + return list(list_node.inputs()) + + +# Check if list_value is output from prim::ListConstruct +# This is usually called before _unpack_list to ensure the list can be +# unpacked. 
+def _is_packed_list(list_value): + return _is_value( + list_value) and list_value.node().kind() == 'prim::ListConstruct' + + +def parse_args(*arg_descriptors): + + def decorator(fn): + fn._arg_descriptors = arg_descriptors + + def wrapper(g, *args): + # some args may be optional, so the length may be smaller + assert len(arg_descriptors) >= len(args) + args = [ + _parse_arg(arg, arg_desc) + for arg, arg_desc in zip(args, arg_descriptors) + ] + return fn(g, *args) + + # In Python 2 functools.wraps chokes on partially applied functions, so + # we need this as a workaround + try: + wrapper = wraps(fn)(wrapper) + except Exception: + pass + return wrapper + + return decorator + + +def _scalar(x): + """Convert a scalar tensor into a Python value.""" + assert x.numel() == 1 + return x.item() + + +def _if_scalar_type_as(g, self, tensor): + """Convert self into the same type of tensor, as necessary.""" + if isinstance(self, torch._C.Value): + return self + + scalar_type = tensor.type().scalarType() + if scalar_type: + ty = scalar_type.lower() + return getattr(self, ty)() + + return self + + +def _is_none(x): + return x.node().mustBeNone() + + +def _is_value(x): + return isinstance(x, torch._C.Value) + + +def _is_tensor_list(x): + return x.type().isSubtypeOf(ListType.ofTensors()) + + +def _unimplemented(op, msg): + warnings.warn('ONNX export failed on ' + op + ' because ' + msg + + ' not supported') + + +def _try_get_scalar_type(*args): + for arg in args: + try: + return arg.type().scalarType() + except RuntimeError: + pass + return None + + +def _topk_helper(g, input, k, dim, largest=True, sorted=False, out=None): + if out is not None: + _unimplemented('TopK', 'Out parameter is not supported') + if not _is_value(k): + k = g.op('Constant', value_t=torch.tensor([k], dtype=torch.int64)) + else: + k = g.op('Reshape', k, g.op('Constant', value_t=torch.tensor([1]))) + return g.op( + 'TopK', + input, + k, + axis_i=dim, + largest_i=largest, + sorted_i=sorted, + outputs=2) + + +def 
_slice_helper(g, + input, + axes, + starts, + ends, + steps=None, + dynamic_slice=False): + # TODO(ruobing): add support for opset<10 + from torch.onnx.symbolic_opset10 import _slice + return _slice(g, input, axes, starts, ends, steps, dynamic_slice) + + +def _unsqueeze_helper(g, input, dim): + from torch.onnx.symbolic_opset9 import unsqueeze + return unsqueeze(g, input, dim) + + +def _interpolate_size_to_scales(g, input, output_size, dim): + output_size = _maybe_get_const(output_size, 'is') + if _is_value(output_size): + offset = 2 + offsets = g.op( + 'Constant', value_t=torch.ones(offset, dtype=torch.float32)) + dividend = g.op( + 'Cast', output_size, to_i=cast_pytorch_to_onnx['Float']) + divisor = _slice_helper( + g, g.op('Shape', input), axes=[0], ends=[maxsize], starts=[offset]) + divisor = g.op('Cast', divisor, to_i=cast_pytorch_to_onnx['Float']) + scale_dims = g.op('Div', dividend, divisor) + scales = g.op('Concat', offsets, scale_dims, axis_i=0) + else: + scales_constant = [ + 1. 
if i < 2 else float(output_size[-(dim - i)]) / + float(input.type().sizes()[-(dim - i)]) for i in range(0, dim) + ] + scales = g.op( + 'Constant', + value_t=torch.tensor(scales_constant, dtype=torch.float32)) + return scales + + +def _interpolate_get_scales_if_available(g, scales): + if len(scales) == 0: + return None + # scales[0] is NoneType in Pytorch == 1.5.1 + # scales[0] is TensorType with sizes = [] in Pytorch == 1.6.0 + # scales[0] is ListType in Pytorch == 1.7.0 + # scales[0] is TensorType with sizes = [2] in Pytorch == 1.8.0 + scale_desc = 'fs' if scales[0].type().kind() == 'ListType' or ( + scales[0].type().kind() == 'TensorType' and + (sum(scales[0].type().sizes()) > 1)) else 'f' + available_scales = _maybe_get_const( + scales[0], scale_desc) != -1 and not _is_none(scales[0]) + + if not available_scales: + return None + + offsets = g.op('Constant', value_t=torch.ones(2, dtype=torch.float32)) + if scale_desc == 'fs': + scales_list = g.op( + 'Constant', + value_t=torch.tensor(_maybe_get_const(scales[0], scale_desc))) + # modify to support PyTorch==1.7.0 + # https://github.com/pytorch/pytorch/blob/75ee5756715e7161314ce037474843b68f69fc04/torch/onnx/symbolic_helper.py#L375 # noqa: E501 + scales = g.op('Concat', offsets, scales_list, axis_i=0) + else: + # for PyTorch < 1.7.0 + scales_list = [] + for scale in scales: + unsqueezed_scale = _unsqueeze_helper(g, scale, 0) + # ONNX only supports float for the scales. double -> float. 
+ unsqueezed_scale = g.op( + 'Cast', unsqueezed_scale, to_i=cast_pytorch_to_onnx['Float']) + scales_list.append(unsqueezed_scale) + scales = g.op('Concat', offsets, *scales_list, axis_i=0) + return scales + + +def _get_interpolate_attributes(g, mode, args): + if mode == 'nearest': + align_corners = None + scales = args[0:] + else: + align_corners = args[0] + scales = args[1:] + scales = _interpolate_get_scales_if_available(g, scales) + return scales, align_corners + + +def _interpolate_get_scales(g, scale_factor, dim): + offsets = g.op('Constant', value_t=torch.ones(2, dtype=torch.float32)) + if isinstance(scale_factor.type(), torch._C.ListType): + return g.op('Concat', offsets, scale_factor, axis_i=0) + else: + scale_factor = _unsqueeze_helper(g, scale_factor, 0) + scale_factor = g.op( + 'Cast', scale_factor, to_i=cast_pytorch_to_onnx['Float']) + scales = [scale_factor for i in range(dim - 2)] + scale_factor = g.op('Concat', offsets, *scales, axis_i=0) + return scale_factor + + +def _size_helper(g, self, dim): + full_shape = g.op('Shape', self) + from torch.onnx.symbolic_opset9 import select + return select(g, full_shape, g.op('Constant', value_t=torch.tensor([0])), + dim) + + +def _avgpool_helper(tuple_fn, padding, kernel_size, stride, divisor_override, + name): + if divisor_override and divisor_override.node().kind() != 'prim::Constant': + return _unimplemented(name, 'divisor_override') + if not stride: + stride = kernel_size + padding = tuple(tuple_fn(padding)) + return padding + + +# Metaprogram symbolics for each ATen native specialized cast operator. +# For e.g. 
we specify a function named `_cast_uint8_t` that instantiates an +# ONNX cast node with `to` attribute 'UINT8' +# +# TODO: remove these once we support Type's in the JIT IR and we can once again +# use the unified toType operator +cast_pytorch_to_onnx = { + 'Byte': torch.onnx.TensorProtoDataType.UINT8, + 'Char': torch.onnx.TensorProtoDataType.INT8, + 'Double': torch.onnx.TensorProtoDataType.DOUBLE, + 'Float': torch.onnx.TensorProtoDataType.FLOAT, + 'Half': torch.onnx.TensorProtoDataType.FLOAT16, + 'Int': torch.onnx.TensorProtoDataType.INT32, + 'Long': torch.onnx.TensorProtoDataType.INT64, + 'Short': torch.onnx.TensorProtoDataType.INT16, + 'Bool': torch.onnx.TensorProtoDataType.BOOL, + 'ComplexFloat': torch.onnx.TensorProtoDataType.COMPLEX64, + 'ComplexDouble': torch.onnx.TensorProtoDataType.COMPLEX128, + 'Undefined': torch.onnx.TensorProtoDataType.UNDEFINED, +} + +# Global set to store the list of quantized operators in the network. +# This is currently only used in the conversion of quantized ops from PT +# -> C2 via ONNX. +_quantized_ops = set() diff --git a/mmcv/onnx/symbolic.py b/mmcv/onnx/symbolic.py new file mode 100644 index 0000000..94cc1c6 --- /dev/null +++ b/mmcv/onnx/symbolic.py @@ -0,0 +1,496 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+"""Modified from https://github.com/pytorch/pytorch.""" +import os + +import numpy as np +import torch +from torch.nn.modules.utils import _pair, _single, _triple +from torch.onnx.symbolic_helper import parse_args +from torch.onnx.symbolic_registry import register_op + +from .onnx_utils import symbolic_helper as sym_help + + +def _interpolate(name, dim, interpolate_mode): + + def symbolic_fn(g, input, output_size, *args): + scales, align_corners = sym_help._get_interpolate_attributes( + g, interpolate_mode, args) + align_corners = sym_help._maybe_get_scalar(align_corners) + transformation_mode = 'asymmetric' \ + if interpolate_mode == 'nearest' \ + else 'align_corners' if align_corners else 'pytorch_half_pixel' + empty_tensor = g.op( + 'Constant', value_t=torch.tensor([], dtype=torch.float32)) + + if scales is None: + if 'ONNX_BACKEND' in os.environ and os.environ[ + 'ONNX_BACKEND'] == 'TensorRT': + input_size = input.type().sizes() + # slice the first two dim + input_size = input_size[:2] + # convert output_size to int type + output_size = sym_help._maybe_get_const(output_size, 'is') + input_size.extend(output_size) + output_size = g.op( + 'Constant', + value_t=torch.tensor(input_size, dtype=torch.int64)) + else: + input_size = g.op('Shape', input) + input_size_beg = sym_help._slice_helper( + g, input_size, axes=[0], ends=[2], starts=[0]) + output_size = g.op( + 'Cast', + output_size, + to_i=sym_help.cast_pytorch_to_onnx['Long']) + output_size = g.op( + 'Concat', input_size_beg, output_size, axis_i=0) + scales = g.op( + 'Constant', value_t=torch.tensor([], dtype=torch.float32)) + return g.op( + 'Resize', + input, + empty_tensor, + # roi only takes effect with + # coordinate_transformation_mode="tf_crop_and_resize" + scales, # scales is not needed since we are sending out_size + output_size, + coordinate_transformation_mode_s=transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=interpolate_mode, # nearest, linear, or cubic + 
nearest_mode_s='floor') # only valid when mode="nearest" + else: + return g.op( + 'Resize', + input, + empty_tensor, + # roi only takes effect with + # coordinate_transformation_mode="tf_crop_and_resize" + scales, # scales is not needed since we are sending out_size + coordinate_transformation_mode_s=transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=interpolate_mode, # nearest, linear, or cubic + nearest_mode_s='floor') # only valid when mode="nearest" + + return symbolic_fn + + +upsample_nearest1d = _interpolate('upsample_nearest1d', 3, 'nearest') +upsample_nearest2d = _interpolate('upsample_nearest2d', 4, 'nearest') +upsample_nearest3d = _interpolate('upsample_nearest3d', 5, 'nearest') +upsample_linear1d = _interpolate('upsample_linear1d', 3, 'linear') +upsample_bilinear2d = _interpolate('upsample_bilinear2d', 4, 'linear') +upsample_trilinear3d = _interpolate('upsample_trilinear3d', 5, 'linear') +upsample_bicubic2d = _interpolate('upsample_bicubic2d', 4, 'cubic') + + +@parse_args('v', 'v', 'i', 'i', 'i', 'none') +def topk(g, self, k, dim, largest, sorted, out=None): + return sym_help._topk_helper( + g, self, k, dim, largest=largest, sorted=sorted, out=out) + + +def masked_select(g, self, mask): + from torch.onnx.symbolic_opset9 import expand_as, nonzero + index = nonzero(g, expand_as(g, mask, self)) + return g.op('GatherND', self, index) + + +def _prepare_onnx_paddings(g, dim, pad): + pad_len = torch.onnx.symbolic_opset9.size( + g, pad, g.op('Constant', value_t=torch.tensor([0]))) + # Set extension = [0] * (dim * 2 - len(pad)) + extension = g.op( + 'Sub', + g.op('Mul', + g.op('Constant', value_t=torch.tensor(dim, dtype=torch.int64)), + g.op('Constant', value_t=torch.tensor(2, dtype=torch.int64))), + pad_len) + pad = g.op('Cast', pad, to_i=sym_help.cast_pytorch_to_onnx['Long']) + paddings = g.op( + 'Concat', + pad, + g.op( + 'ConstantOfShape', + extension, + value_t=torch.tensor([0], dtype=torch.int64)), + axis_i=0) + paddings = 
g.op('Reshape', paddings, + g.op('Constant', value_t=torch.tensor([-1, 2]))) + paddings = g.op( + 'Transpose', + torch.onnx.symbolic_opset10.flip(g, paddings, [0]), + perm_i=[1, 0]) + paddings = g.op('Reshape', paddings, + g.op('Constant', value_t=torch.tensor([-1]))) + padding_c = g.op( + 'Cast', paddings, to_i=sym_help.cast_pytorch_to_onnx['Long']) + return padding_c + + +def constant_pad_nd(g, input, padding, value=None): + mode = 'constant' + value = sym_help._maybe_get_scalar(value) + value = sym_help._if_scalar_type_as(g, value, input) + pad = _prepare_onnx_paddings(g, input.type().dim(), padding) + return g.op('Pad', input, pad, value, mode_s=mode) + + +def reflection_pad(g, input, padding): + mode = 'reflect' + paddings = _prepare_onnx_paddings(g, input.type().dim(), padding) + return g.op('Pad', input, paddings, mode_s=mode) + + +reflection_pad1d = reflection_pad +reflection_pad2d = reflection_pad +reflection_pad3d = reflection_pad + + +def _avg_pool(name, tuple_fn): + + @parse_args('v', 'is', 'is', 'is', 'i', 'i', 'none') + def symbolic_fn(g, + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override=None): + padding = sym_help._avgpool_helper(tuple_fn, padding, kernel_size, + stride, divisor_override, name) + if not stride: + stride = kernel_size + if count_include_pad: + input = g.op( + 'Pad', + input, + g.op( + 'Constant', + value_t=torch.tensor(((0, ) * 2 + padding) * 2)), + mode_s='constant') + padding = (0, ) * len(padding) + output = g.op( + 'AveragePool', + input, + kernel_shape_i=tuple_fn(kernel_size), + strides_i=tuple_fn(stride), + pads_i=padding * 2, + ceil_mode_i=ceil_mode) + return output + + return symbolic_fn + + +avg_pool1d = _avg_pool('avg_pool1d', _single) +avg_pool2d = _avg_pool('avg_pool2d', _pair) +avg_pool3d = _avg_pool('avg_pool3d', _triple) + + +def _get_im2col_indices_along_dim(g, input_d, kernel_size_d, dilation_d, + padding_d, stride_d): + # Input is always 4-D (N, C, H, W) + # Calculate 
indices of sliding blocks along spatial dimension + # Slide kernel over input each dim d: + # each dimension d ranges from 0 to + # input[d]+2xpadding[d]-dilation[d]x(kernel_size[d]-1) + # with steps = stride + + blocks_d = g.op('Add', input_d, + g.op('Constant', value_t=torch.tensor(padding_d * 2))) + blocks_d = g.op( + 'Sub', blocks_d, + g.op( + 'Constant', + value_t=torch.tensor(dilation_d * (kernel_size_d - 1)))) + + # Stride kernel over input and find starting indices along dim d + blocks_d_indices = g.op('Range', g.op('Constant', value_t=torch.tensor(0)), + blocks_d, + g.op('Constant', value_t=torch.tensor(stride_d))) + + # Apply dilation on kernel and find its indices along dim d + kernel_grid = np.arange(0, kernel_size_d * dilation_d, dilation_d) + kernel_grid = g.op('Constant', value_t=torch.tensor([kernel_grid])) + + # Broadcast and add kernel staring positions (indices) with + # kernel_grid along dim d, to get block indices along dim d + blocks_d_indices = g.op( + 'Unsqueeze', blocks_d_indices, axes_i=[0]) # Reshape to [1, -1] + kernel_mask = g.op('Reshape', kernel_grid, + g.op('Constant', value_t=torch.tensor([-1, 1]))) + block_mask = g.op('Add', blocks_d_indices, kernel_mask) + + return block_mask + + +def _get_im2col_padded_input(g, input, padding_h, padding_w): + # Input is always 4-D tensor (N, C, H, W) + # Padding tensor has the following format: (padding_h, padding_w) + # Reshape the padding to follow ONNX format: + # (dim1_begin, dim2_begin,...,dim1_end, dim2_end,...) 
+ pad = g.op( + 'Constant', value_t=torch.LongTensor([0, 0, padding_h, padding_w] * 2)) + return g.op('Pad', input, pad) + + +def _get_im2col_output_shape(g, input, kernel_h, kernel_w): + batch_dim = size(g, input, g.op('Constant', value_t=torch.tensor(0))) + channel_dim = size(g, input, g.op('Constant', value_t=torch.tensor(1))) + channel_unfolded = g.op( + 'Mul', channel_dim, + g.op('Constant', value_t=torch.tensor(kernel_h * kernel_w))) + + return g.op( + 'Concat', + g.op('Unsqueeze', batch_dim, axes_i=[0]), + g.op('Unsqueeze', channel_unfolded, axes_i=[0]), + g.op('Constant', value_t=torch.tensor([-1])), + axis_i=0) + + +def size(g, self, dim=None): + if dim is None: + return g.op('Shape', self) + return sym_help._size_helper(g, self, dim) + + +@parse_args('v', 'is', 'is', 'is', 'is') +def im2col(g, input, kernel_size, dilation, padding, stride): + # Input is always 4-D tensor (N, C, H, W) + # All other args are int[2] + + input_h = size(g, input, g.op('Constant', value_t=torch.tensor(2))) + input_w = size(g, input, g.op('Constant', value_t=torch.tensor(3))) + + stride_h, stride_w = stride[0], stride[1] + padding_h, padding_w = padding[0], padding[1] + dilation_h, dilation_w = dilation[0], dilation[1] + kernel_h, kernel_w = kernel_size[0], kernel_size[1] + + blocks_row_indices = _get_im2col_indices_along_dim(g, input_h, kernel_h, + dilation_h, padding_h, + stride_h) + blocks_col_indices = _get_im2col_indices_along_dim(g, input_w, kernel_w, + dilation_w, padding_w, + stride_w) + + output_shape = _get_im2col_output_shape(g, input, kernel_h, kernel_w) + padded_input = _get_im2col_padded_input(g, input, padding_h, padding_w) + + output = g.op('Gather', padded_input, blocks_row_indices, axis_i=2) + output = g.op('Gather', output, blocks_col_indices, axis_i=4) + output = g.op('Transpose', output, perm_i=[0, 1, 2, 4, 3, 5]) + return g.op('Reshape', output, output_shape) + + +@parse_args('v', 'i') +def one_hot(g, self, num_classes): + values = g.op('Constant', 
value_t=torch.LongTensor([0, 1])) + depth = g.op('Constant', value_t=torch.LongTensor([num_classes])) + return g.op('OneHot', self, depth, values, axis_i=-1) + + +@parse_args('v', 'i', 'none') +def softmax(g, input, dim, dtype=None): + input_dim = input.type().dim() + if input_dim: + # TODO: remove this as onnx opset 11 spec allows negative axes + if dim < 0: + dim = input_dim + dim + if input_dim == dim + 1: + softmax = g.op('Softmax', input, axis_i=dim) + if dtype and dtype.node().kind() != 'prim::Constant': + parsed_dtype = sym_help._get_const(dtype, 'i', 'dtype') + softmax = g.op( + 'Cast', + softmax, + to_i=sym_help.scalar_type_to_onnx[parsed_dtype]) + return softmax + + max_value = g.op('ReduceMax', input, axes_i=[dim], keepdims_i=1) + input = g.op('Sub', input, max_value) + exp = g.op('Exp', input) + sum = g.op('ReduceSum', exp, axes_i=[dim]) + softmax = g.op('Div', exp, sum) + if dtype and dtype.node().kind() != 'prim::Constant': + parsed_dtype = sym_help._get_const(dtype, 'i', 'dtype') + softmax = g.op( + 'Cast', softmax, to_i=sym_help.scalar_type_to_onnx[parsed_dtype]) + return softmax + + +def _adaptive_pool(name, type, tuple_fn, fn=None): + + @parse_args('v', 'is') + def symbolic_fn(g, input, output_size): + if output_size == [1] * len(output_size) and type == 'AveragePool': + return g.op('GlobalAveragePool', input) + if not input.isCompleteTensor(): + if output_size == [1] * len(output_size): + return g.op('GlobalMaxPool', input), None + raise NotImplementedError( + '[Adaptive pool]:input size not accessible') + dim = input.type().sizes()[2:] + if output_size == [1] * len(output_size) and type == 'MaxPool': + return g.op('GlobalMaxPool', input), None + + # compute stride = floor(input_size / output_size) + s = [int(dim[i] / output_size[i]) for i in range(0, len(dim))] + + # compute kernel_size = input_size - (output_size - 1) * stride + k = [dim[i] - (output_size[i] - 1) * s[i] for i in range(0, len(dim))] + + # call max_poolxd_with_indices to get 
indices in the output + if type == 'MaxPool': + return fn(g, input, k, k, (0, ) * len(dim), (1, ) * len(dim), + False) + output = g.op( + type, + input, + kernel_shape_i=tuple_fn(k), + strides_i=tuple_fn(s), + ceil_mode_i=False) + return output + + return symbolic_fn + + +adaptive_avg_pool1d = _adaptive_pool('adaptive_avg_pool1d', 'AveragePool', + _single) +adaptive_avg_pool2d = _adaptive_pool('adaptive_avg_pool2d', 'AveragePool', + _pair) +adaptive_avg_pool3d = _adaptive_pool('adaptive_avg_pool3d', 'AveragePool', + _triple) + + +def new_full(g, + self, + size, + fill_value, + dtype, + layout, + device, + pin_memory=False): + from torch.onnx.symbolic_opset9 import full + if dtype is None and self.isCompleteTensor(): + dtype = self.type().scalarType() + dtype = sym_help.scalar_type_to_onnx.index( + sym_help.cast_pytorch_to_onnx[dtype]) + return full(g, size, fill_value, dtype, layout, device, pin_memory) + + +@parse_args('v', 'v', 'i', 'i', 'i') +def grid_sampler(g, + input, + grid, + interpolation_mode, + padding_mode, + align_corners=False): + return g.op( + 'mmcv::grid_sampler', + input, + grid, + interpolation_mode_i=interpolation_mode, + padding_mode_i=padding_mode, + align_corners_i=align_corners) + + +@parse_args('v', 'i') +def cummax(g, input, dim): + return g.op('mmcv::cummax', input, dim_i=dim, outputs=2) + + +@parse_args('v', 'i') +def cummin(g, input, dim): + return g.op('mmcv::cummin', input, dim_i=dim, outputs=2) + + +@parse_args('v', 'v', 'is') +def roll(g, input, shifts, dims): + from torch.onnx.symbolic_opset9 import squeeze + from packaging import version + input_shape = g.op('Shape', input) + + need_flatten = len(dims) == 0 + # If dims is not specified, the tensor will be flattened before + # rolling and then restored to the original shape. 
+ if need_flatten: + resize_shape = input_shape + input = g.op('Reshape', input, + g.op('Constant', value_t=torch.LongTensor([1, -1]))) + input_shape = g.op('Shape', input) + dims = [1] + + for index, dim in enumerate(dims): + end_size = sym_help._slice_helper( + g, input_shape, axes=[0], ends=[dim + 1], starts=[dim]) + shift_size = sym_help._slice_helper( + g, shifts, axes=[0], ends=[index + 1], starts=[index]) + slice_size = g.op('Sub', end_size, shift_size) + + # Can not use Mod because tensorrt does not support + div_size = g.op('Div', slice_size, end_size) + slice_size = g.op('Sub', slice_size, g.op('Mul', end_size, div_size)) + + if version.parse(torch.__version__) >= version.parse('1.7.0'): + # add dim=0 for pytorch 1.9.0 + end_size = squeeze(g, end_size, 0) + slice_size = squeeze(g, slice_size, 0) + else: + end_size = g.op('Squeeze', end_size) + slice_size = g.op('Squeeze', slice_size) + dim = torch.LongTensor([dim]) + + input_slice0 = sym_help._slice_helper( + g, + input, + axes=dim, + starts=torch.LongTensor([0]), + ends=slice_size, + dynamic_slice=True) + input_slice1 = sym_help._slice_helper( + g, + input, + axes=dim, + ends=end_size, + starts=slice_size, + dynamic_slice=True) + + input = g.op('Concat', input_slice1, input_slice0, axis_i=dim) + + if need_flatten: + input = g.op('Reshape', input, resize_shape) + + return input + + +def register_extra_symbolics(opset=11): + register_op('one_hot', one_hot, '', opset) + register_op('im2col', im2col, '', opset) + register_op('topk', topk, '', opset) + register_op('softmax', softmax, '', opset) + register_op('constant_pad_nd', constant_pad_nd, '', opset) + register_op('reflection_pad1d', reflection_pad1d, '', opset) + register_op('reflection_pad2d', reflection_pad2d, '', opset) + register_op('reflection_pad3d', reflection_pad3d, '', opset) + register_op('avg_pool1d', avg_pool1d, '', opset) + register_op('avg_pool2d', avg_pool2d, '', opset) + register_op('avg_pool3d', avg_pool3d, '', opset) + 
register_op('adaptive_avg_pool1d', adaptive_avg_pool1d, '', opset) + register_op('adaptive_avg_pool2d', adaptive_avg_pool2d, '', opset) + register_op('adaptive_avg_pool3d', adaptive_avg_pool3d, '', opset) + register_op('masked_select', masked_select, '', opset) + register_op('upsample_nearest1d', upsample_nearest1d, '', opset) + register_op('upsample_nearest2d', upsample_nearest2d, '', opset) + register_op('upsample_nearest3d', upsample_nearest3d, '', opset) + register_op('upsample_linear1d', upsample_linear1d, '', opset) + register_op('upsample_bilinear2d', upsample_bilinear2d, '', opset) + register_op('upsample_trilinear3d', upsample_trilinear3d, '', opset) + register_op('upsample_bicubic2d', upsample_bicubic2d, '', opset) + register_op('new_full', new_full, '', opset) + register_op('grid_sampler', grid_sampler, '', opset) + register_op('cummax', cummax, '', opset) + register_op('cummin', cummin, '', opset) + register_op('roll', roll, '', opset) diff --git a/mmcv/ops/__init__.py b/mmcv/ops/__init__.py old mode 100755 new mode 100644 index cffbd23..999e090 --- a/mmcv/ops/__init__.py +++ b/mmcv/ops/__init__.py @@ -1,19 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .active_rotated_filter import active_rotated_filter from .assign_score_withk import assign_score_withk from .ball_query import ball_query from .bbox import bbox_overlaps -from .bezier_align import BezierAlign, bezier_align -from .bias_act import bias_act from .border_align import BorderAlign, border_align -from .box_iou_quadri import box_iou_quadri from .box_iou_rotated import box_iou_rotated from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive from .cc_attention import CrissCrossAttention -from .chamfer_distance import chamfer_distance from .contour_expand import contour_expand -from .conv2d_gradfix import conv2d, conv_transpose2d -from .convex_iou import convex_giou, convex_iou from .corner_pool import CornerPool from .correlation import Correlation from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d @@ -23,8 +16,6 @@ from .deprecated_wrappers import Conv2d_deprecated as Conv2d from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d from .deprecated_wrappers import Linear_deprecated as Linear from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d -from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d -from .filtered_lrelu import filtered_lrelu from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, sigmoid_focal_loss, softmax_focal_loss) from .furthest_point_sample import (furthest_point_sample, @@ -32,46 +23,35 @@ from .furthest_point_sample import (furthest_point_sample, from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu from .gather_points import gather_points from .group_points import GroupAll, QueryAndGroup, grouping_operation -from .info import get_compiler_version, get_compiling_cuda_version -from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d, - nms3d_normal, nms_bev, nms_normal_bev) +from .info import (get_compiler_version, get_compiling_cuda_version, + get_onnxruntime_op_path) +from .iou3d import 
boxes_iou_bev, nms_bev, nms_normal_bev from .knn import knn from .masked_conv import MaskedConv2d, masked_conv2d -from .min_area_polygons import min_area_polygons from .modulated_deform_conv import (ModulatedDeformConv2d, ModulatedDeformConv2dPack, modulated_deform_conv2d) from .multi_scale_deform_attn import MultiScaleDeformableAttention -from .nms import batched_nms, nms, nms_match, nms_quadri, nms_rotated, soft_nms +from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms from .pixel_group import pixel_group from .point_sample import (SimpleRoIAlign, point_sample, rel_roi_point_to_rel_img_point) from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, points_in_boxes_part) -from .points_in_polygons import points_in_polygons from .points_sampler import PointsSampler -from .prroi_pool import PrRoIPool, prroi_pool from .psa_mask import PSAMask -from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated from .roi_align import RoIAlign, roi_align from .roi_align_rotated import RoIAlignRotated, roi_align_rotated from .roi_pool import RoIPool, roi_pool from .roiaware_pool3d import RoIAwarePool3d from .roipoint_pool3d import RoIPointPool3d -from .rotated_feature_align import rotated_feature_align from .saconv import SAConv2d from .scatter_points import DynamicScatter, dynamic_scatter -from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, - SparseConvTranspose3d, SparseInverseConv2d, - SparseInverseConv3d, SubMConv2d, SubMConv3d) -from .sparse_modules import SparseModule, SparseSequential -from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d -from .sparse_structure import SparseConvTensor, scatter_nd from .sync_bn import SyncBatchNorm from .three_interpolate import three_interpolate from .three_nn import three_nn from .tin_shift import TINShift, tin_shift -from .upfirdn2d import filter2d, upfirdn2d, upsample2d +from .upfirdn2d import upfirdn2d from .voxelize import Voxelization, voxelization __all__ = [ 
@@ -80,32 +60,22 @@ __all__ = [ 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack', 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss', 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss', - 'get_compiler_version', 'get_compiling_cuda_version', 'MaskedConv2d', - 'masked_conv2d', 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack', + 'get_compiler_version', 'get_compiling_cuda_version', + 'get_onnxruntime_op_path', 'MaskedConv2d', 'masked_conv2d', + 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack', 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d', 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask', 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk', - 'box_iou_rotated', 'box_iou_quadri', 'RoIPointPool3d', 'nms_rotated', - 'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU', - 'fused_bias_leakyrelu', 'rotated_feature_align', 'RiRoIAlignRotated', - 'riroi_align_rotated', 'RoIAlignRotated', 'roi_align_rotated', - 'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation', - 'contour_expand', 'three_nn', 'three_interpolate', - 'MultiScaleDeformableAttention', 'BorderAlign', 'border_align', - 'gather_points', 'furthest_point_sample', 'nms_quadri', + 'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query', + 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu', + 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup', + 'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn', + 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign', + 'border_align', 'gather_points', 'furthest_point_sample', 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', - 'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev', - 'nms_normal_bev', 'nms3d', 'nms3d_normal', 
'Voxelization', 'voxelization', - 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d', - 'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d', - 'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d', - 'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d', - 'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part', - 'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons', - 'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou', - 'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance', - 'PrRoIPool', 'prroi_pool', 'bias_act', 'filtered_lrelu', 'conv2d', - 'conv_transpose2d', 'filter2d', 'upsample2d', 'BezierAlign', 'bezier_align' + 'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization', + 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', + 'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all' ] diff --git a/mmcv/ops/active_rotated_filter.py b/mmcv/ops/active_rotated_filter.py deleted file mode 100644 index b8ba43d..0000000 --- a/mmcv/ops/active_rotated_filter.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple - -import torch -from torch.autograd import Function -from torch.autograd.function import once_differentiable - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', - ['active_rotated_filter_forward', 'active_rotated_filter_backward']) - - -class ActiveRotatedFilterFunction(Function): - """Encoding the orientation information and generating orientation- - sensitive features. - - The details are described in the paper `Align Deep Features for Oriented - Object Detection _`. - """ - - @staticmethod - def forward(ctx, input: torch.Tensor, - indices: torch.Tensor) -> torch.Tensor: - """ - Args: - input (torch.Tensor): Input features with shape - [num_output_planes, num_input_planes, num_orientations, H, W]. 
- indices (torch.Tensor): Indices with shape - [num_orientations, H, W, num_rotations]. - - Returns: - torch.Tensor: Refined features with shape [num_output_planes * - num_rotations, num_input_planes * num_orientations, H, W]. - """ - ctx.save_for_backward(input, indices) - op, ip, o, h, w = input.size() - o, h, w, r = indices.size() - output = input.new_zeros((op * r, ip * o, h, w)) - ext_module.active_rotated_filter_forward(input, indices, output) - - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]: - """ - Args: - grad_output (torch.Tensor): The gradient of output features - with shape [num_output_planes * num_rotations, - num_input_planes * num_orientations, H, W]. - - Returns: - torch.Tensor: The gradient of input features with shape - [num_output_planes, num_input_planes, num_orientations, H, W]. - """ - input, indices = ctx.saved_tensors - grad_in = torch.zeros_like(input) - ext_module.active_rotated_filter_backward(grad_out, indices, grad_in) - return grad_in, None - - -active_rotated_filter = ActiveRotatedFilterFunction.apply diff --git a/mmcv/ops/assign_score_withk.py b/mmcv/ops/assign_score_withk.py index deca089..4906ada 100644 --- a/mmcv/ops/assign_score_withk.py +++ b/mmcv/ops/assign_score_withk.py @@ -1,6 +1,3 @@ -from typing import Tuple - -import torch from torch.autograd import Function from ..utils import ext_loader @@ -30,11 +27,11 @@ class AssignScoreWithK(Function): @staticmethod def forward(ctx, - scores: torch.Tensor, - point_features: torch.Tensor, - center_features: torch.Tensor, - knn_idx: torch.Tensor, - aggregate: str = 'sum') -> torch.Tensor: + scores, + point_features, + center_features, + knn_idx, + aggregate='sum'): """ Args: scores (torch.Tensor): (B, npoint, K, M), predicted scores to @@ -81,20 +78,15 @@ class AssignScoreWithK(Function): return output @staticmethod - def backward( - ctx, grad_out: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, 
torch.Tensor, None, None]: + def backward(ctx, grad_out): """ Args: grad_out (torch.Tensor): (B, out_dim, npoint, K) Returns: - tuple[torch.Tensor]: A tuple contains five elements. The first one - is the gradient of ``scores`` whose shape is (B, npoint, K, M). The - second is the gradient of ``point_features`` whose shape is - (B, N, M, out_dim). The third is the gradient of - ``center_features`` with the shape of (B, N, M, out_dim). The last - two are ``None``. + grad_scores (torch.Tensor): (B, npoint, K, M) + grad_point_features (torch.Tensor): (B, N, M, out_dim) + grad_center_features (torch.Tensor): (B, N, M, out_dim) """ _, point_features, center_features, scores, knn_idx = ctx.saved_tensors diff --git a/mmcv/ops/ball_query.py b/mmcv/ops/ball_query.py index a89b36b..d046684 100644 --- a/mmcv/ops/ball_query.py +++ b/mmcv/ops/ball_query.py @@ -1,86 +1,54 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Tuple - import torch from torch.autograd import Function from ..utils import ext_loader -ext_module = ext_loader.load_ext( - '_ext', ['ball_query_forward', 'stack_ball_query_forward']) +ext_module = ext_loader.load_ext('_ext', ['ball_query_forward']) class BallQuery(Function): """Find nearby points in spherical space.""" @staticmethod - def forward( - ctx, - min_radius: float, - max_radius: float, - sample_num: int, - xyz: torch.Tensor, - center_xyz: torch.Tensor, - xyz_batch_cnt: Optional[torch.Tensor] = None, - center_xyz_batch_cnt: Optional[torch.Tensor] = None - ) -> torch.Tensor: + def forward(ctx, min_radius: float, max_radius: float, sample_num: int, + xyz: torch.Tensor, center_xyz: torch.Tensor) -> torch.Tensor: """ Args: min_radius (float): minimum radius of the balls. max_radius (float): maximum radius of the balls. sample_num (int): maximum number of features in the balls. - xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features, - or staked input (N1 + N2 ..., 3). 
- center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball - query, or staked input (M1 + M2 ..., 3). - xyz_batch_cnt: (batch_size): Stacked input xyz coordinates nums in - each batch, just like (N1, N2, ...). Defaults to None. - New in version 1.7.0. - center_xyz_batch_cnt: (batch_size): Stacked centers coordinates - nums in each batch, just line (M1, M2, ...). Defaults to None. - New in version 1.7.0. + xyz (Tensor): (B, N, 3) xyz coordinates of the features. + center_xyz (Tensor): (B, npoint, 3) centers of the ball query. Returns: - torch.Tensor: (B, npoint, nsample) tensor with the indices of the - features that form the query balls. + Tensor: (B, npoint, nsample) tensor with the indices of + the features that form the query balls. """ assert center_xyz.is_contiguous() assert xyz.is_contiguous() assert min_radius < max_radius - if xyz_batch_cnt is not None and center_xyz_batch_cnt is not None: - assert xyz_batch_cnt.dtype == torch.int - assert center_xyz_batch_cnt.dtype == torch.int - idx = center_xyz.new_zeros((center_xyz.shape[0], sample_num), - dtype=torch.int32) - ext_module.stack_ball_query_forward( - center_xyz, - center_xyz_batch_cnt, - xyz, - xyz_batch_cnt, - idx, - max_radius=max_radius, - nsample=sample_num, - ) - else: - B, N, _ = xyz.size() - npoint = center_xyz.size(1) - idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int32) - ext_module.ball_query_forward( - center_xyz, - xyz, - idx, - b=B, - n=N, - m=npoint, - min_radius=min_radius, - max_radius=max_radius, - nsample=sample_num) + + B, N, _ = xyz.size() + npoint = center_xyz.size(1) + idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int) + + ext_module.ball_query_forward( + center_xyz, + xyz, + idx, + b=B, + n=N, + m=npoint, + min_radius=min_radius, + max_radius=max_radius, + nsample=sample_num) if torch.__version__ != 'parrots': ctx.mark_non_differentiable(idx) return idx @staticmethod - def backward(ctx, a=None) -> Tuple[None, None, None, None]: + def backward(ctx, a=None): 
return None, None, None, None diff --git a/mmcv/ops/bbox.py b/mmcv/ops/bbox.py index 4ba93d6..0c4d58b 100644 --- a/mmcv/ops/bbox.py +++ b/mmcv/ops/bbox.py @@ -1,57 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -import torch - from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps']) -def _bbox_overlaps_cpu(bboxes1: torch.Tensor, - bboxes2: torch.Tensor, - mode: str = 'iou', - aligned: bool = False, - offset: int = 0) -> torch.Tensor: - assert mode in ['iou', 'iof'] - - if aligned: - lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2] - rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2] - - wh = (rb - lt + offset).clamp(min=0) # [rows, 2] - overlap = wh[:, 0] * wh[:, 1] - area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * ( - bboxes1[:, 3] - bboxes1[:, 1] + offset) - - if mode == 'iou': - area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * ( - bboxes2[:, 3] - bboxes2[:, 1] + offset) - ious = overlap / (area1 + area2 - overlap) - else: - ious = overlap / area1 - else: - lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2] - rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2] - - wh = (rb - lt + offset).clamp(min=0) # [rows, cols, 2] - overlap = wh[:, :, 0] * wh[:, :, 1] - area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * ( - bboxes1[:, 3] - bboxes1[:, 1] + offset) - - if mode == 'iou': - area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * ( - bboxes2[:, 3] - bboxes2[:, 1] + offset) - ious = overlap / (area1[:, None] + area2 - overlap) - else: - ious = overlap / (area1[:, None]) - - return ious - - -def bbox_overlaps(bboxes1: torch.Tensor, - bboxes2: torch.Tensor, - mode: str = 'iou', - aligned: bool = False, - offset: int = 0) -> torch.Tensor: +def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0): """Calculate overlap between two set of bboxes. 
If ``aligned`` is ``False``, then calculate the ious between each bbox @@ -59,16 +12,14 @@ def bbox_overlaps(bboxes1: torch.Tensor, bboxes1 and bboxes2. Args: - bboxes1 (torch.Tensor): shape (m, 4) in format or - empty. - bboxes2 (torch.Tensor): shape (n, 4) in format or - empty. If aligned is ``True``, then m and n must be equal. + bboxes1 (Tensor): shape (m, 4) in format or empty. + bboxes2 (Tensor): shape (n, 4) in format or empty. + If aligned is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or iof (intersection over foreground). Returns: - torch.Tensor: Return the ious betweens boxes. If ``aligned`` is - ``False``, the shape of ious is (m, n) else (m, 1). + ious(Tensor): shape (m, n) if aligned == False else shape (m, 1) Example: >>> bboxes1 = torch.FloatTensor([ @@ -106,17 +57,16 @@ def bbox_overlaps(bboxes1: torch.Tensor, rows = bboxes1.size(0) cols = bboxes2.size(0) - if aligned: assert rows == cols - ious = bboxes1.new_zeros(rows) - else: - ious = bboxes1.new_zeros((rows, cols)) if rows * cols == 0: - return ious + return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols) + if aligned: + ious = bboxes1.new_zeros(rows) + else: + ious = bboxes1.new_zeros((rows, cols)) ext_module.bbox_overlaps( bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset) - return ious diff --git a/mmcv/ops/bezier_align.py b/mmcv/ops/bezier_align.py deleted file mode 100644 index 6db7f5c..0000000 --- a/mmcv/ops/bezier_align.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Tuple, Union - -import torch -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['bezier_align_forward', 'bezier_align_backward']) - - -class BezierAlignFunction(Function): - - @staticmethod - def forward(ctx, - input: torch.Tensor, - beziers: torch.Tensor, - output_size: Union[int, Tuple[int, int]], - spatial_scale: Union[int, float] = 1.0, - sampling_ratio: int = 0, - aligned: bool = True) -> torch.Tensor: - ctx.output_size = _pair(output_size) - ctx.spatial_scale = spatial_scale - ctx.input_shape = input.size() - ctx.sampling_ratio = sampling_ratio - ctx.aligned = aligned - - assert beziers.size(1) == 17 - output_shape = (beziers.size(0), input.size(1), ctx.output_size[0], - ctx.output_size[1]) - output = input.new_zeros(output_shape) - ext_module.bezier_align_forward( - input, - beziers, - output, - aligned_height=ctx.output_size[0], - aligned_width=ctx.output_size[1], - spatial_scale=ctx.spatial_scale, - sampling_ratio=ctx.sampling_ratio, - aligned=ctx.aligned) - - ctx.save_for_backward(beziers) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output: torch.Tensor): - beziers = ctx.saved_tensors[0] - grad_input = grad_output.new_zeros(ctx.input_shape) - grad_output = grad_output.contiguous() - ext_module.bezier_align_backward( - grad_output, - beziers, - grad_input, - aligned_height=ctx.output_size[0], - aligned_width=ctx.output_size[1], - spatial_scale=ctx.spatial_scale, - sampling_ratio=ctx.sampling_ratio, - aligned=ctx.aligned) - return grad_input, None, None, None, None, None - - -bezier_align = BezierAlignFunction.apply - - -class BezierAlign(nn.Module): - """Bezier align pooling layer. 
- - Args: - output_size (tuple): h, w - spatial_scale (float): scale the input boxes by this number - sampling_ratio (int): number of inputs samples to take for each - output sample. 0 to take samples densely for current models. - aligned (bool): if False, use the legacy implementation in - MMDetection. If True, align the results more perfectly. - - Note: - The implementation of BezierAlign is modified from - https://github.com/aim-uofa/AdelaiDet - - The meaning of aligned=True: - - Given a continuous coordinate c, its two neighboring pixel - indices (in our pixel model) are computed by floor(c - 0.5) and - ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete - indices [0] and [1] (which are sampled from the underlying signal - at continuous coordinates 0.5 and 1.5). But the original roi_align - (aligned=False) does not subtract the 0.5 when computing - neighboring pixel indices and therefore it uses pixels with a - slightly incorrect alignment (relative to our pixel model) when - performing bilinear interpolation. - - With `aligned=True`, - we first appropriately scale the ROI and then shift it by -0.5 - prior to calling roi_align. This produces the correct neighbors; - - The difference does not make a difference to the model's - performance if ROIAlign is used together with conv layers. - """ - - def __init__( - self, - output_size: Tuple, - spatial_scale: Union[int, float], - sampling_ratio: int, - aligned: bool = True, - ) -> None: - super().__init__() - - self.output_size = _pair(output_size) - self.spatial_scale = float(spatial_scale) - self.sampling_ratio = int(sampling_ratio) - self.aligned = aligned - - def forward(self, input: torch.Tensor, - beziers: torch.Tensor) -> torch.Tensor: - """BezierAlign forward. - - Args: - inputs (Tensor): input features. - beziers (Tensor): beziers for align. 
- """ - return bezier_align(input, beziers, self.output_size, - self.spatial_scale, self.sampling_ratio, - self.aligned) - - def __repr__(self): - s = self.__class__.__name__ - s += f'(output_size={self.output_size}, ' - s += f'spatial_scale={self.spatial_scale})' - s += f'sampling_ratio={self.sampling_ratio})' - s += f'aligned={self.aligned})' - return s diff --git a/mmcv/ops/bias_act.py b/mmcv/ops/bias_act.py deleted file mode 100644 index 3dfa557..0000000 --- a/mmcv/ops/bias_act.py +++ /dev/null @@ -1,375 +0,0 @@ -# Modified from -# https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.py - -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# NVIDIA CORPORATION and its licensors retain all intellectual property -# and proprietary rights in and to this software, related documentation -# and any modifications thereto. Any use, reproduction, disclosure or -# distribution of this software and related documentation without an express -# license agreement from NVIDIA CORPORATION is strictly prohibited. 
- -# source: https://github.com/open-mmlab/mmediting/blob/dev-1.x/mmedit/models/editors/stylegan3/stylegan3_ops/ops/bias_act.py # noqa -"""Custom PyTorch ops for efficient bias and activation.""" - -from typing import Any, Dict, Optional, Union - -import numpy as np -import torch - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['bias_act']) - - -class EasyDict(dict): - """Convenience class that behaves like a dict but allows access with the - attribute syntax.""" - - def __getattr__(self, name: str) -> Any: - try: - return self[name] - except KeyError: - raise AttributeError(name) - - def __setattr__(self, name: str, value: Any) -> None: - self[name] = value - - def __delattr__(self, name: str) -> None: - del self[name] - - -activation_funcs = { - 'linear': - EasyDict( - func=lambda x, **_: x, - def_alpha=0, - def_gain=1, - cuda_idx=1, - ref='', - has_2nd_grad=False), - 'relu': - EasyDict( - func=lambda x, **_: torch.nn.functional.relu(x), - def_alpha=0, - def_gain=np.sqrt(2), - cuda_idx=2, - ref='y', - has_2nd_grad=False), - 'lrelu': - EasyDict( - func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha), - def_alpha=0.2, - def_gain=np.sqrt(2), - cuda_idx=3, - ref='y', - has_2nd_grad=False), - 'tanh': - EasyDict( - func=lambda x, **_: torch.tanh(x), - def_alpha=0, - def_gain=1, - cuda_idx=4, - ref='y', - has_2nd_grad=True), - 'sigmoid': - EasyDict( - func=lambda x, **_: torch.sigmoid(x), - def_alpha=0, - def_gain=1, - cuda_idx=5, - ref='y', - has_2nd_grad=True), - 'elu': - EasyDict( - func=lambda x, **_: torch.nn.functional.elu(x), - def_alpha=0, - def_gain=1, - cuda_idx=6, - ref='y', - has_2nd_grad=True), - 'selu': - EasyDict( - func=lambda x, **_: torch.nn.functional.selu(x), - def_alpha=0, - def_gain=1, - cuda_idx=7, - ref='y', - has_2nd_grad=True), - 'softplus': - EasyDict( - func=lambda x, **_: torch.nn.functional.softplus(x), - def_alpha=0, - def_gain=1, - cuda_idx=8, - ref='y', - has_2nd_grad=True), - 'swish': - 
EasyDict( - func=lambda x, **_: torch.sigmoid(x) * x, - def_alpha=0, - def_gain=np.sqrt(2), - cuda_idx=9, - ref='x', - has_2nd_grad=True), -} - -_null_tensor = torch.empty([0]) - - -def bias_act(input: torch.Tensor, - bias: Optional[torch.Tensor] = None, - dim: int = 1, - act: str = 'linear', - alpha: Optional[Union[float, int]] = None, - gain: Optional[float] = None, - clamp: Optional[float] = None, - use_custom_op: bool = True): - r"""Fused bias and activation function. - - Adds `bias` to activation tensor `input`, and evaluates activation - function `act`, and scales the result by `gain`. Each of the steps is - optional. - - In most cases, the fused op is considerably more efficient than performing - the same calculation using standard PyTorch ops. It supports first and - second order gradients, but not third order gradients. - - Args: - input (torch.Tensor): Input activation tensor. Can be of any shape. - bias (torch.Tensor): Bias vector, or `None` to disable. - Must be a 1D tensor of the same type as `input`. The shape must - be known, and it must match the dimension of `input` corresponding - to `dim`. Defaults to None. - dim (int): The dimension in `input` corresponding to the elements of - `bias`. The value of `dim` is ignored if `b` is not specified. - Defaults to 1. - act (str): Name of the activation function to evaluate, or `"linear"` - to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", - "swish", etc. See `activation_funcs` for a full list. `None` is not - allowed. Defaults to `linear`. - alpha (float or int): Shape parameter for the activation - function, or `None` to use the default. Defaults to None. - gain (float): Scaling factor for the output tensor, or `None` - to use default. See `activation_funcs` for the default scaling of - each activation function. If unsure, consider specifying 1. - Defaults to None. - clamp (float): Clamp the output values to `[-clamp, +clamp]`, - or `None` to disable the clamping (default). Defaults to None. 
- use_custom_op (bool): Whether to use customized op. - Defaults to True. - - Returns: - torch.Tensor: Tensor of the same shape and datatype as `input`. - """ - assert isinstance(input, torch.Tensor) - if use_custom_op and input.is_cuda: - return _bias_act_cuda( - dim=dim, act=act, alpha=alpha, gain=gain, - clamp=clamp).apply(input, bias) - return _bias_act_ref( - input=input, - bias=bias, - dim=dim, - act=act, - alpha=alpha, - gain=gain, - clamp=clamp) - - -def _bias_act_ref(input: torch.Tensor, - bias: Optional[torch.Tensor] = None, - dim: int = 1, - act: str = 'linear', - alpha: Optional[Union[float, int]] = None, - gain: Optional[float] = None, - clamp: Optional[float] = None): - """Slow reference implementation of `bias_act()` using standard PyTorch - ops. - - Adds `bias` to activation tensor `input`, and evaluates activation - function `act`, and scales the result by `gain`. Each of the steps is - optional. - - In most cases, the fused op is considerably more efficient than performing - the same calculation using standard PyTorch ops. It supports first and - second order gradients, but not third order gradients. - - Args: - input (torch.Tensor): Input activation tensor. Can be of any shape. - bias (torch.Tensor): Bias vector, or `None` to disable. - Must be a 1D tensor of the same type as `input`. The shape must - be known, and it must match the dimension of `input` corresponding - to `dim`. Defaults to None. - dim (int): The dimension in `input` corresponding to the elements of - `bias`. The value of `dim` is ignored if `b` is not specified. - Defaults to 1. - act (str): Name of the activation function to evaluate, or `"linear"` - to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", - "swish", etc. See `activation_funcs` for a full list. `None` is not - allowed. Defaults to `linear`. - alpha (float or int): Shape parameter for the activation - function, or `None` to use the default. Defaults to None. 
- gain (float): Scaling factor for the output tensor, or `None` - to use default. See `activation_funcs` for the default scaling of - each activation function. If unsure, consider specifying 1. - Defaults to None. - clamp (float): Clamp the output values to - `[-clamp, +clamp]`, or `None` to disable the clamping (default). - Defaults to None. - - Returns: - torch.Tensor: Tensor of the same shape and datatype as `input`. - """ - assert isinstance(input, torch.Tensor) - assert clamp is None or clamp >= 0 - spec = activation_funcs[act] - alpha = float(alpha if alpha is not None else spec.def_alpha) - gain = float(gain if gain is not None else spec.def_gain) - clamp = float(clamp if clamp is not None else -1) - - # Add bias. - if bias is not None: - assert isinstance(bias, torch.Tensor) and bias.ndim == 1 - assert 0 <= dim < input.ndim - assert bias.shape[0] == input.shape[dim] - input = input + bias.reshape( - [-1 if i == dim else 1 for i in range(input.ndim)]) - - # Evaluate activation function. - alpha = float(alpha) - output = spec.func(input, alpha=alpha) - - # Scale by gain. - gain = float(gain) - if gain != 1: - output = output * gain - - # Clamp. - if clamp >= 0: - # pylint: disable=invalid-unary-operand-type - output = output.clamp(-clamp, clamp) - return output - - -_bias_act_cuda_cache: Dict = dict() - - -def _bias_act_cuda(dim: int = 1, - act: str = 'linear', - alpha: Optional[Union[float, int]] = None, - gain: Optional[float] = None, - clamp: Optional[float] = None): - """"Fast CUDA implementation of `bias_act()` using custom ops. - - Args: - dim (int): The dimension in `x` corresponding to the elements of `b`. - The value of `dim` is ignored if `b` is not specified. - Defaults to 1. - act (str): Name of the activation function to evaluate, or `"linear"` - to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", - "swish", etc. See `activation_funcs` for a full list. `None` is not - allowed. Defaults to `linear`. 
- alpha (float | int): Shape parameter for the activation - function, or `None` to use the default. Defaults to None. - gain (float): Scaling factor for the output tensor, or `None` - to use default. See `activation_funcs` for the default scaling of - each activation function. If unsure, consider specifying 1. - Defaults to None. - clamp (float): Clamp the output values to `[-clamp, +clamp]`, - or `None` to disable the clamping (default). Defaults to None. - - Returns: - torch.Tensor: Tensor of the same shape and datatype as `x`. - """ - # Parse arguments. - assert clamp is None or clamp >= 0 - spec = activation_funcs[act] - alpha = float(alpha if alpha is not None else spec.def_alpha) - gain = float(gain if gain is not None else spec.def_gain) - clamp = float(clamp if clamp is not None else -1) - - # Lookup from cache. - key = (dim, act, alpha, gain, clamp) - if key in _bias_act_cuda_cache: - return _bias_act_cuda_cache[key] - - # Forward op. - class BiasActCuda(torch.autograd.Function): - - @staticmethod - def forward(ctx, x, b): # pylint: disable=arguments-differ - ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride( - 1) == 1 else torch.contiguous_format - x = x.contiguous(memory_format=ctx.memory_format) - b = b.contiguous() if b is not None else _null_tensor.to(x.device) - y = x - if act != 'linear' or gain != 1 or clamp >= 0 or ( - b is not _null_tensor.to(x.device)): - y = ext_module.bias_act(x, b, _null_tensor.to(x.device), - _null_tensor.to(x.device), - _null_tensor.to(x.device), 0, dim, - spec.cuda_idx, alpha, gain, clamp) - ctx.save_for_backward( - x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor.to( - x.device), b if 'x' in spec.ref or spec.has_2nd_grad else - _null_tensor.to(x.device), - y if 'y' in spec.ref else _null_tensor.to(x.device)) - return y - - @staticmethod - def backward(ctx, dy): # pylint: disable=arguments-differ - dy = dy.contiguous(memory_format=ctx.memory_format) - x, b, y = ctx.saved_tensors - dx = None - db 
= None - - if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: - dx = dy - if act != 'linear' or gain != 1 or clamp >= 0: - dx = BiasActCudaGrad.apply(dy, x, b, y) - - if ctx.needs_input_grad[1]: - db = dx.sum([i for i in range(dx.ndim) if i != dim]) - - return dx, db - - # Backward op. - class BiasActCudaGrad(torch.autograd.Function): - - @staticmethod - def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ - ctx.memory_format = torch.channels_last if dy.ndim > 2 and ( - dy.stride(1) == 1) else torch.contiguous_format - dx = ext_module.bias_act(dy, b, x, y, _null_tensor.to(x.device), 1, - dim, spec.cuda_idx, alpha, gain, clamp) - ctx.save_for_backward( - dy if spec.has_2nd_grad else _null_tensor.to(x.device), x, b, - y) - return dx - - @staticmethod - def backward(ctx, d_dx): # pylint: disable=arguments-differ - d_dx = d_dx.contiguous(memory_format=ctx.memory_format) - dy, x, b, y = ctx.saved_tensors - d_dy = None - d_x = None - d_b = None - d_y = None - - if ctx.needs_input_grad[0]: - d_dy = BiasActCudaGrad.apply(d_dx, x, b, y) - - if spec.has_2nd_grad and (ctx.needs_input_grad[1] - or ctx.needs_input_grad[2]): - d_x = ext_module.bias_act(d_dx, b, x, y, dy, 2, dim, - spec.cuda_idx, alpha, gain, clamp) - - if spec.has_2nd_grad and ctx.needs_input_grad[2]: - d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim]) - - return d_dy, d_x, d_b, d_y - - # Add to cache. 
- _bias_act_cuda_cache[key] = BiasActCuda - return BiasActCuda diff --git a/mmcv/ops/border_align.py b/mmcv/ops/border_align.py index c09501b..ff305be 100644 --- a/mmcv/ops/border_align.py +++ b/mmcv/ops/border_align.py @@ -2,8 +2,6 @@ # modified from # https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py -from typing import Tuple - import torch import torch.nn as nn from torch.autograd import Function @@ -23,8 +21,7 @@ class BorderAlignFunction(Function): 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) @staticmethod - def forward(ctx, input: torch.Tensor, boxes: torch.Tensor, - pool_size: int) -> torch.Tensor: + def forward(ctx, input, boxes, pool_size): ctx.pool_size = pool_size ctx.input_shape = input.size() @@ -48,8 +45,7 @@ class BorderAlignFunction(Function): @staticmethod @once_differentiable - def backward(ctx, - grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]: + def backward(ctx, grad_output): boxes, argmax_idx = ctx.saved_tensors grad_input = grad_output.new_zeros(ctx.input_shape) # complex head architecture may cause grad_output uncontiguous @@ -76,25 +72,24 @@ class BorderAlign(nn.Module): For each border line (e.g. top, left, bottom or right) of each box, border_align does the following: - - 1. uniformly samples ``pool_size`` +1 positions on this line, involving - the start and end points. - 2. the corresponding features on these points are computed by bilinear - interpolation. - 3. max pooling over all the ``pool_size`` +1 positions are used for - computing pooled feature. + 1. uniformly samples `pool_size`+1 positions on this line, involving \ + the start and end points. + 2. the corresponding features on these points are computed by \ + bilinear interpolation. + 3. max pooling over all the `pool_size`+1 positions are used for \ + computing pooled feature. Args: pool_size (int): number of positions sampled over the boxes' borders (e.g. top, bottom, left, right). 
+ """ - def __init__(self, pool_size: int): - super().__init__() + def __init__(self, pool_size): + super(BorderAlign, self).__init__() self.pool_size = pool_size - def forward(self, input: torch.Tensor, - boxes: torch.Tensor) -> torch.Tensor: + def forward(self, input, boxes): """ Args: input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), @@ -103,8 +98,8 @@ class BorderAlign(nn.Module): boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). Returns: - torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is - (top,left,bottom,right) for the last dimension. + Tensor: Pooled features with shape [N,C,H*W,4]. The order is + (top,left,bottom,right) for the last dimension. """ return border_align(input, boxes, self.pool_size) diff --git a/mmcv/ops/box_iou_quadri.py b/mmcv/ops/box_iou_quadri.py deleted file mode 100644 index 89747fd..0000000 --- a/mmcv/ops/box_iou_quadri.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['box_iou_quadri']) - - -def box_iou_quadri(bboxes1: torch.Tensor, - bboxes2: torch.Tensor, - mode: str = 'iou', - aligned: bool = False) -> torch.Tensor: - """Return intersection-over-union (Jaccard index) of boxes. - - Both sets of boxes are expected to be in - (x1, y1, ..., x4, y4) format. - - If ``aligned`` is ``False``, then calculate the ious between each bbox - of bboxes1 and bboxes2, otherwise the ious between each aligned pair of - bboxes1 and bboxes2. - - Args: - bboxes1 (torch.Tensor): quadrilateral bboxes 1. It has shape (N, 8), - indicating (x1, y1, ..., x4, y4) for each row. - bboxes2 (torch.Tensor): quadrilateral bboxes 2. It has shape (M, 8), - indicating (x1, y1, ..., x4, y4) for each row. - mode (str): "iou" (intersection over union) or iof (intersection over - foreground). - - Returns: - torch.Tensor: Return the ious betweens boxes. 
If ``aligned`` is - ``False``, the shape of ious is (N, M) else (N,). - """ - assert mode in ['iou', 'iof'] - mode_dict = {'iou': 0, 'iof': 1} - mode_flag = mode_dict[mode] - rows = bboxes1.size(0) - cols = bboxes2.size(0) - if aligned: - ious = bboxes1.new_zeros(rows) - else: - ious = bboxes1.new_zeros(rows * cols) - bboxes1 = bboxes1.contiguous() - bboxes2 = bboxes2.contiguous() - ext_module.box_iou_quadri( - bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned) - if not aligned: - ious = ious.view(rows, cols) - return ious diff --git a/mmcv/ops/box_iou_rotated.py b/mmcv/ops/box_iou_rotated.py index 2443af2..2d78015 100644 --- a/mmcv/ops/box_iou_rotated.py +++ b/mmcv/ops/box_iou_rotated.py @@ -1,16 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -import torch - from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) -def box_iou_rotated(bboxes1: torch.Tensor, - bboxes2: torch.Tensor, - mode: str = 'iou', - aligned: bool = False, - clockwise: bool = True) -> torch.Tensor: +def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False): """Return intersection-over-union (Jaccard index) of boxes. Both sets of boxes are expected to be in @@ -20,110 +14,18 @@ def box_iou_rotated(bboxes1: torch.Tensor, of bboxes1 and bboxes2, otherwise the ious between each aligned pair of bboxes1 and bboxes2. - .. note:: - The operator assumes: - - 1) The positive direction along x axis is left -> right. - - 2) The positive direction along y axis is top -> down. - - 3) The w border is in parallel with x axis when angle = 0. - - However, there are 2 opposite definitions of the positive angular - direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports - both definitions and uses CW by default. - - Please set ``clockwise=False`` if you are using the CCW definition. - - The coordinate system when ``clockwise`` is ``True`` (default) - - .. 
code-block:: none - - 0-------------------> x (0 rad) - | A-------------B - | | | - | | box h - | | angle=0 | - | D------w------C - v - y (pi/2 rad) - - In such coordination system the rotation matrix is - - .. math:: - \\begin{pmatrix} - \\cos\\alpha & -\\sin\\alpha \\\\ - \\sin\\alpha & \\cos\\alpha - \\end{pmatrix} - - The coordinates of the corner point A can be calculated as: - - .. math:: - P_A= - \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} - = - \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + - \\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\ - \\sin\\alpha & \\cos\\alpha\\end{pmatrix} - \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ - = - \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha - \\\\ - y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} - - - The coordinate system when ``clockwise`` is ``False`` - - .. code-block:: none - - 0-------------------> x (0 rad) - | A-------------B - | | | - | | box h - | | angle=0 | - | D------w------C - v - y (-pi/2 rad) - - In such coordination system the rotation matrix is - - .. math:: - \\begin{pmatrix} - \\cos\\alpha & \\sin\\alpha \\\\ - -\\sin\\alpha & \\cos\\alpha - \\end{pmatrix} - - The coordinates of the corner point A can be calculated as: - - .. math:: - P_A= - \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} - = - \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + - \\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\ - -\\sin\\alpha & \\cos\\alpha\\end{pmatrix} - \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ - = - \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha - \\\\ - y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} - - Args: - boxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5), - indicating (x, y, w, h, theta) for each row. Note that theta is in - radian. - boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5), - indicating (x, y, w, h, theta) for each row. Note that theta is in - radian. 
+ Arguments: + boxes1 (Tensor): rotated bboxes 1. \ + It has shape (N, 5), indicating (x, y, w, h, theta) for each row. + Note that theta is in radian. + boxes2 (Tensor): rotated bboxes 2. \ + It has shape (M, 5), indicating (x, y, w, h, theta) for each row. + Note that theta is in radian. mode (str): "iou" (intersection over union) or iof (intersection over foreground). - clockwise (bool): flag indicating whether the positive angular - orientation is clockwise. default True. - `New in version 1.4.3.` Returns: - torch.Tensor: Return the ious betweens boxes. If ``aligned`` is - ``False``, the shape of ious is (N, M) else (N,). + ious(Tensor): shape (N, M) if aligned == False else shape (N,) """ assert mode in ['iou', 'iof'] mode_dict = {'iou': 0, 'iof': 1} @@ -133,12 +35,7 @@ def box_iou_rotated(bboxes1: torch.Tensor, if aligned: ious = bboxes1.new_zeros(rows) else: - ious = bboxes1.new_zeros(rows * cols) - if not clockwise: - flip_mat = bboxes1.new_ones(bboxes1.shape[-1]) - flip_mat[-1] = -1 - bboxes1 = bboxes1 * flip_mat - bboxes2 = bboxes2 * flip_mat + ious = bboxes1.new_zeros((rows * cols)) bboxes1 = bboxes1.contiguous() bboxes2 = bboxes2.contiguous() ext_module.box_iou_rotated( diff --git a/mmcv/ops/carafe.py b/mmcv/ops/carafe.py index f7e79c2..5154cb3 100644 --- a/mmcv/ops/carafe.py +++ b/mmcv/ops/carafe.py @@ -1,15 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Tuple - import torch import torch.nn as nn import torch.nn.functional as F -from mmengine.model import normal_init, xavier_init -from mmengine.registry import MODELS -from torch import Tensor from torch.autograd import Function from torch.nn.modules.module import Module +from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', [ @@ -21,8 +17,7 @@ ext_module = ext_loader.load_ext('_ext', [ class CARAFENaiveFunction(Function): @staticmethod - def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, - group_size: int, scale_factor: int) -> Tensor: + def symbolic(g, features, masks, kernel_size, group_size, scale_factor): return g.op( 'mmcv::MMCVCARAFENaive', features, @@ -32,8 +27,7 @@ class CARAFENaiveFunction(Function): scale_factor_f=scale_factor) @staticmethod - def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, - group_size: int, scale_factor: int) -> Tensor: + def forward(ctx, features, masks, kernel_size, group_size, scale_factor): assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor @@ -56,15 +50,12 @@ class CARAFENaiveFunction(Function): group_size=group_size, scale_factor=scale_factor) - if features.requires_grad or masks.requires_grad or \ - torch.__version__ == 'parrots': + if features.requires_grad or masks.requires_grad: ctx.save_for_backward(features, masks) return output @staticmethod - def backward( - ctx, - grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: + def backward(ctx, grad_output): assert grad_output.is_cuda features, masks = ctx.saved_tensors @@ -92,8 +83,8 @@ carafe_naive = CARAFENaiveFunction.apply class CARAFENaive(Module): - def __init__(self, kernel_size: int, group_size: int, scale_factor: int): - super().__init__() + def __init__(self, kernel_size, group_size, scale_factor): + super(CARAFENaive, 
self).__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) @@ -101,7 +92,7 @@ class CARAFENaive(Module): self.group_size = group_size self.scale_factor = scale_factor - def forward(self, features: Tensor, masks: Tensor) -> Tensor: + def forward(self, features, masks): return carafe_naive(features, masks, self.kernel_size, self.group_size, self.scale_factor) @@ -109,8 +100,7 @@ class CARAFENaive(Module): class CARAFEFunction(Function): @staticmethod - def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, - group_size: int, scale_factor: int) -> Tensor: + def symbolic(g, features, masks, kernel_size, group_size, scale_factor): return g.op( 'mmcv::MMCVCARAFE', features, @@ -120,8 +110,7 @@ class CARAFEFunction(Function): scale_factor_f=scale_factor) @staticmethod - def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, - group_size: int, scale_factor: int) -> Tensor: + def forward(ctx, features, masks, kernel_size, group_size, scale_factor): assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor @@ -150,15 +139,14 @@ class CARAFEFunction(Function): group_size=group_size, scale_factor=scale_factor) - if features.requires_grad or masks.requires_grad or \ - torch.__version__ == 'parrots': + if features.requires_grad or masks.requires_grad: ctx.save_for_backward(features, masks, rfeatures) return output @staticmethod - def backward( - ctx, - grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: + def backward(ctx, grad_output): + assert grad_output.is_cuda + features, masks, rfeatures = ctx.saved_tensors kernel_size = ctx.kernel_size group_size = ctx.group_size @@ -192,8 +180,7 @@ carafe = CARAFEFunction.apply class CARAFE(Module): """ CARAFE: Content-Aware ReAssembly of FEatures - Please refer to `CARAFE: Content-Aware ReAssembly of FEatures - `_ for more details. 
+ Please refer to https://arxiv.org/abs/1905.02188 for more details. Args: kernel_size (int): reassemble kernel size @@ -204,8 +191,8 @@ class CARAFE(Module): upsampled feature map """ - def __init__(self, kernel_size: int, group_size: int, scale_factor: int): - super().__init__() + def __init__(self, kernel_size, group_size, scale_factor): + super(CARAFE, self).__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) @@ -213,19 +200,19 @@ class CARAFE(Module): self.group_size = group_size self.scale_factor = scale_factor - def forward(self, features: Tensor, masks: Tensor) -> Tensor: + def forward(self, features, masks): return carafe(features, masks, self.kernel_size, self.group_size, self.scale_factor) -@MODELS.register_module(name='carafe') +@UPSAMPLE_LAYERS.register_module(name='carafe') class CARAFEPack(nn.Module): """A unified package of CARAFE upsampler that contains: 1) channel compressor 2) content encoder 3) CARAFE op. Official implementation of ICCV 2019 paper - `CARAFE: Content-Aware ReAssembly of FEatures - `_. + CARAFE: Content-Aware ReAssembly of FEatures + Please refer to https://arxiv.org/abs/1905.02188 for more details. 
Args: channels (int): input feature channels @@ -241,14 +228,14 @@ class CARAFEPack(nn.Module): """ def __init__(self, - channels: int, - scale_factor: int, - up_kernel: int = 5, - up_group: int = 1, - encoder_kernel: int = 3, - encoder_dilation: int = 1, - compressed_channels: int = 64): - super().__init__() + channels, + scale_factor, + up_kernel=5, + up_group=1, + encoder_kernel=3, + encoder_dilation=1, + compressed_channels=64): + super(CARAFEPack, self).__init__() self.channels = channels self.scale_factor = scale_factor self.up_kernel = up_kernel @@ -274,7 +261,7 @@ class CARAFEPack(nn.Module): xavier_init(m, distribution='uniform') normal_init(self.content_encoder, std=0.001) - def kernel_normalizer(self, mask: Tensor) -> Tensor: + def kernel_normalizer(self, mask): mask = F.pixel_shuffle(mask, self.scale_factor) n, mask_c, h, w = mask.size() # use float division explicitly, @@ -287,11 +274,11 @@ class CARAFEPack(nn.Module): return mask - def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor: + def feature_reassemble(self, x, mask): x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) return x - def forward(self, x: Tensor) -> Tensor: + def forward(self, x): compressed_x = self.channel_compressor(x) mask = self.content_encoder(compressed_x) mask = self.kernel_normalizer(mask) diff --git a/mmcv/ops/cc_attention.py b/mmcv/ops/cc_attention.py index efde7b7..ff8dd4c 100644 --- a/mmcv/ops/cc_attention.py +++ b/mmcv/ops/cc_attention.py @@ -2,12 +2,11 @@ import torch import torch.nn as nn import torch.nn.functional as F -from mmengine.registry import MODELS -from mmcv.cnn import Scale +from mmcv.cnn import PLUGIN_LAYERS, Scale -def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: +def NEG_INF_DIAG(n, device): """Returns a diagonal matrix of size [n, n]. The diagonal are all "-inf". 
This is for avoiding calculating the @@ -16,7 +15,7 @@ def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0) -@MODELS.register_module() +@PLUGIN_LAYERS.register_module() class CrissCrossAttention(nn.Module): """Criss-Cross Attention Module. @@ -42,7 +41,7 @@ class CrissCrossAttention(nn.Module): in_channels (int): Channels of the input feature map. """ - def __init__(self, in_channels: int) -> None: + def __init__(self, in_channels): super().__init__() self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) @@ -50,15 +49,14 @@ class CrissCrossAttention(nn.Module): self.gamma = Scale(0.) self.in_channels = in_channels - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x): """forward function of Criss-Cross Attention. Args: - x (torch.Tensor): Input feature with the shape of - (batch_size, in_channels, height, width). - + x (Tensor): Input feature. \ + shape (batch_size, in_channels, height, width) Returns: - torch.Tensor: Output of the layer, with the shape of + Tensor: Output of the layer, with shape of \ (batch_size, in_channels, height, width) """ B, C, H, W = x.size() @@ -79,7 +77,7 @@ class CrissCrossAttention(nn.Module): return out - def __repr__(self) -> str: + def __repr__(self): s = self.__class__.__name__ s += f'(in_channels={self.in_channels})' return s diff --git a/mmcv/ops/chamfer_distance.py b/mmcv/ops/chamfer_distance.py deleted file mode 100644 index 1f908a5..0000000 --- a/mmcv/ops/chamfer_distance.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Sequence, Tuple - -import torch -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext( - '_ext', ['chamfer_distance_forward', 'chamfer_distance_backward']) - - -class ChamferDistanceFunction(Function): - """This is an implementation of the 2D Chamfer Distance. - - It has been used in the paper `Oriented RepPoints for Aerial Object - Detection (CVPR 2022) _`. - """ - - @staticmethod - def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]: - """ - Args: - xyz1 (Tensor): Point set with shape (B, N, 2). - xyz2 (Tensor): Point set with shape (B, N, 2). - - Returns: - Sequence[Tensor]: - - - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with - shape (B, N). - - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with - shape (B, N). - - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2) - with shape (B, N), which be used in compute gradient. - - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz2) - with shape (B, N), which be used in compute gradient. - """ - batch_size, n, _ = xyz1.size() - _, m, _ = xyz2.size() - device = xyz1.device - xyz1 = xyz1.contiguous() - xyz2 = xyz2.contiguous() - - dist1 = torch.zeros(batch_size, n).to(device) - dist2 = torch.zeros(batch_size, m).to(device) - idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device) - idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device) - - ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1, - idx2) - ctx.save_for_backward(xyz1, xyz2, idx1, idx2) - return dist1, dist2, idx1, idx2 - - @staticmethod - @once_differentiable - def backward(ctx, - grad_dist1: Tensor, - grad_dist2: Tensor, - grad_idx1=None, - grad_idx2=None) -> Tuple[Tensor, Tensor]: - """ - - Args: - grad_dist1 (Tensor): Gradient of chamfer distance - (xyz1 to xyz2) with shape (B, N). 
- grad_dist2 (Tensor): Gradient of chamfer distance - (xyz2 to xyz1) with shape (B, N). - - Returns: - Tuple[Tensor, Tensor]: - - - grad_xyz1 (Tensor): Gradient of the point set with shape \ - (B, N, 2). - - grad_xyz2 (Tensor):Gradient of the point set with shape \ - (B, N, 2). - """ - xyz1, xyz2, idx1, idx2 = ctx.saved_tensors - device = grad_dist1.device - grad_dist1 = grad_dist1.contiguous() - grad_dist2 = grad_dist2.contiguous() - grad_xyz1 = torch.zeros(xyz1.size()).to(device) - grad_xyz2 = torch.zeros(xyz2.size()).to(device) - - ext_module.chamfer_distance_backward(xyz1, xyz2, idx1, idx2, - grad_dist1, grad_dist2, grad_xyz1, - grad_xyz2) - return grad_xyz1, grad_xyz2 - - -chamfer_distance = ChamferDistanceFunction.apply diff --git a/mmcv/ops/contour_expand.py b/mmcv/ops/contour_expand.py index 7184609..ea1111e 100644 --- a/mmcv/ops/contour_expand.py +++ b/mmcv/ops/contour_expand.py @@ -1,6 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Union - import numpy as np import torch @@ -9,22 +7,21 @@ from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['contour_expand']) -def contour_expand(kernel_mask: Union[np.array, torch.Tensor], - internal_kernel_label: Union[np.array, torch.Tensor], - min_kernel_area: int, kernel_num: int) -> list: +def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, + kernel_num): """Expand kernel contours so that foreground pixels are assigned into instances. - Args: - kernel_mask (np.array or torch.Tensor): The instance kernel mask with + Arguments: + kernel_mask (np.array or Tensor): The instance kernel mask with size hxw. - internal_kernel_label (np.array or torch.Tensor): The instance internal + internal_kernel_label (np.array or Tensor): The instance internal kernel label with size hxw. min_kernel_area (int): The minimum kernel area. kernel_num (int): The instance kernel number. Returns: - list: The instance index map with size hxw. 
+ label (list): The instance index map with size hxw. """ assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) @@ -45,7 +42,7 @@ def contour_expand(kernel_mask: Union[np.array, torch.Tensor], internal_kernel_label, min_kernel_area=min_kernel_area, kernel_num=kernel_num) - label = label.tolist() # type: ignore + label = label.tolist() else: label = ext_module.contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, kernel_num) diff --git a/mmcv/ops/conv2d_gradfix.py b/mmcv/ops/conv2d_gradfix.py deleted file mode 100644 index 9d4ef6e..0000000 --- a/mmcv/ops/conv2d_gradfix.py +++ /dev/null @@ -1,346 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# NVIDIA CORPORATION and its licensors retain all intellectual property -# and proprietary rights in and to this software, related documentation -# and any modifications thereto. Any use, reproduction, disclosure or -# distribution of this software and related documentation without an express -# license agreement from NVIDIA CORPORATION is strictly prohibited. 
- -# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py # noqa -"""Custom replacement for `torch.nn.functional.conv2d` that supports -arbitrarily high order gradients with zero performance penalty.""" - -import contextlib -import warnings -from typing import Dict, Optional, Tuple, Union - -import torch -from mmengine.utils import digit_version - -enabled = True -weight_gradients_disabled = False - - -@contextlib.contextmanager -def no_weight_gradients(disable=True): - global weight_gradients_disabled - old = weight_gradients_disabled - if disable: - weight_gradients_disabled = True - yield - weight_gradients_disabled = old - - -def conv2d(input: torch.Tensor, - weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, - stride: Union[int, Tuple[int, ...]] = 1, - padding: Union[int, Tuple[int, ...]] = 0, - dilation: Union[int, Tuple[int, ...]] = 1, - groups: int = 1): - flag = True - if digit_version(torch.__version__) >= digit_version('1.10.0'): - warnings.warn('Since ' - 'aten:cudnn_convolution_backward_weight is ' - f'not supported in torch=={torch.__version__},' - ' rolling back to `torch.nn.functional.conv2d`') - flag = False - if _should_use_custom_op(input) and flag: - return _conv2d_gradfix( - transpose=False, - weight_shape=weight.shape, - stride=stride, - padding=padding, - output_padding=0, - dilation=dilation, - groups=groups).apply(input, weight, bias) - return torch.nn.functional.conv2d( - input=input, - weight=weight, - bias=bias, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups) - - -def conv_transpose2d(input: torch.Tensor, - weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, - stride: Union[int, Tuple[int, ...]] = 1, - padding: Union[int, Tuple[int, ...]] = 0, - output_padding: Union[int, Tuple[int, ...]] = 0, - groups: int = 1, - dilation: Union[int, Tuple[int, ...]] = 1): - if _should_use_custom_op(input): - return _conv2d_gradfix( - transpose=True, - 
weight_shape=weight.shape, - stride=stride, - padding=padding, - output_padding=output_padding, - groups=groups, - dilation=dilation).apply(input, weight, bias) - return torch.nn.functional.conv_transpose2d( - input=input, - weight=weight, - bias=bias, - stride=stride, - padding=padding, - output_padding=output_padding, - groups=groups, - dilation=dilation) - - -def _should_use_custom_op(input): - assert isinstance(input, torch.Tensor) - if (not enabled) or (not torch.backends.cudnn.enabled): - return False - if input.device.type != 'cuda': - return False - return True - - -def _to_tuple(x, ndim): - xs = tuple(x) if isinstance(x, (tuple, list)) else (x, ) * ndim - assert len(xs) == ndim - assert all(isinstance(x, int) for x in xs) - return xs - - -_conv2d_gradfix_cache: Dict = dict() -_null_tensor = torch.empty([0]) - - -def _conv2d_gradfix( - transpose: bool, - weight_shape: Tuple[int, ...], - stride: Union[int, Tuple[int, ...]], - padding: Union[int, Tuple[int, ...]], - output_padding: Union[int, Tuple[int, ...]], - dilation: Union[int, Tuple[int, ...]], - groups: int, -): - # Parse arguments. - ndim = 2 - weight_shape = tuple(weight_shape) - stride = _to_tuple(stride, ndim) - padding = _to_tuple(padding, ndim) - output_padding = _to_tuple(output_padding, ndim) - dilation = _to_tuple(dilation, ndim) - - # Lookup from cache. - key = (transpose, weight_shape, stride, padding, output_padding, dilation, - groups) - if key in _conv2d_gradfix_cache: - return _conv2d_gradfix_cache[key] - - # Validate arguments. 
- - assert groups >= 1 - assert len(weight_shape) == ndim + 2 - assert all(stride[i] >= 1 for i in range(ndim)) # type: ignore - assert all(padding[i] >= 0 for i in range(ndim)) # type: ignore - assert all(dilation[i] >= 0 for i in range(ndim)) # type: ignore - if not transpose: - assert all(output_padding[i] == 0 for i in range(ndim)) # type: ignore - else: # transpose - for i in range(ndim): - assert 0 <= output_padding[i] < max( # type: ignore - stride[i], # type: ignore - dilation[i]) # type: ignore - - # Helpers. - common_kwargs = dict( - stride=stride, padding=padding, dilation=dilation, groups=groups) - - def calc_output_padding(input_shape, output_shape): - if transpose: - return [0, 0] - return [ - input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] - - (1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1) - for i in range(ndim) - ] - - # Forward & backward. - class Conv2d(torch.autograd.Function): - - @staticmethod - def forward(ctx, input, weight, bias): - assert weight.shape == weight_shape - ctx.save_for_backward( - input if weight.requires_grad else _null_tensor, - weight if input.requires_grad else _null_tensor, - ) - ctx.input_shape = input.shape - - # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere). - if weight_shape[2:] == stride == dilation == ( - 1, 1) and padding == ( - 0, 0) and torch.cuda.get_device_capability( - input.device) < (8, 0): - a = weight.reshape(groups, weight_shape[0] // groups, - weight_shape[1]) - b = input.reshape(input.shape[0], groups, - input.shape[1] // groups, -1) - c = (a.transpose(1, 2) if transpose else a) @ b.permute( - 1, 2, 0, 3).flatten(2) - c = c.reshape(-1, input.shape[0], - *input.shape[2:]).transpose(0, 1) - c = c if bias is None else c + bias.unsqueeze(0).unsqueeze( - 2).unsqueeze(3) - return c.contiguous( - memory_format=(torch.channels_last if input.stride(1) == - 1 else torch.contiguous_format)) - - # General case => cuDNN. 
- if transpose: - return torch.nn.functional.conv_transpose2d( - input=input, - weight=weight, - bias=bias, - output_padding=output_padding, - **common_kwargs) - return torch.nn.functional.conv2d( - input=input, weight=weight, bias=bias, **common_kwargs) - - @staticmethod - def backward(ctx, grad_output): - input, weight = ctx.saved_tensors - input_shape = ctx.input_shape - grad_input = None - grad_weight = None - grad_bias = None - - if ctx.needs_input_grad[0]: - p = calc_output_padding( - input_shape=input_shape, output_shape=grad_output.shape) - op = _conv2d_gradfix( - transpose=(not transpose), - weight_shape=weight_shape, - output_padding=p, - **common_kwargs) - grad_input = op.apply(grad_output, weight, None) - assert grad_input.shape == input_shape - - if ctx.needs_input_grad[1] and not weight_gradients_disabled: - grad_weight = Conv2dGradWeight.apply(grad_output, input) - assert grad_weight.shape == weight_shape - - if ctx.needs_input_grad[2]: - grad_bias = grad_output.sum([0, 2, 3]) - - return grad_input, grad_weight, grad_bias - - # Gradient with respect to the weights. - class Conv2dGradWeight(torch.autograd.Function): - - @staticmethod - def forward(ctx, grad_output, input): - ctx.save_for_backward( - grad_output if input.requires_grad else _null_tensor, - input if grad_output.requires_grad else _null_tensor, - ) - ctx.grad_output_shape = grad_output.shape - ctx.input_shape = input.shape - - # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere). 
- if weight_shape[2:] == stride == dilation == ( - 1, 1) and padding == (0, 0): - a = grad_output.reshape(grad_output.shape[0], groups, - grad_output.shape[1] // groups, - -1).permute(1, 2, 0, 3).flatten(2) - b = input.reshape(input.shape[0], groups, - input.shape[1] // groups, - -1).permute(1, 2, 0, 3).flatten(2) - c = (b @ a.transpose(1, 2) if transpose else - a @ b.transpose(1, 2)).reshape(weight_shape) - return c.contiguous( - memory_format=(torch.channels_last if input.stride(1) == - 1 else torch.contiguous_format)) - - # PyTorch consolidated convolution backward API in PR: - # https://github.com/pytorch/pytorch/commit/3dc3651e0ee3623f669c3a2c096408dbc476d122 # noqa: E501 - # Enhance the code referring to the discussion: - # https://github.com/pytorch/pytorch/issues/74437 - if digit_version(torch.__version__) >= digit_version('1.11.0'): - empty_weight = torch.tensor( - 0.0, dtype=input.dtype, - device=input.device).expand(weight_shape) - output_padding = calc_output_padding(input.shape, - grad_output.shape) - return torch.ops.aten.convolution_backward( - grad_output, - input, - empty_weight, - None, - stride=stride, - dilation=dilation, - transposed=transpose, - padding=padding, - groups=groups, - output_padding=output_padding, - output_mask=[0, 1, 0])[1] - else: - is_rocm_pytorch = False - try: - from torch.utils.cpp_extension import ROCM_HOME - is_rocm_pytorch = True if ((torch.version.hip is not None) and - (ROCM_HOME is not None)) else False - except ImportError: - pass - name='' - flags=[] - if is_rocm_pytorch: - name = ('aten::miopen_convolution_transpose_backward_weight' - if transpose else - 'aten::miopen_convolution_backward_weight') - flags = [ - torch.backends.cudnn.benchmark, - torch.backends.cudnn.deterministic - ] - else: - # General case => cuDNN. 
- name = ('aten::cudnn_convolution_transpose_backward_weight' - if transpose else - 'aten::cudnn_convolution_backward_weight') - flags = [ - torch.backends.cudnn.benchmark, - torch.backends.cudnn.deterministic, - torch.backends.cudnn.allow_tf32 - ] - return torch._C._jit_get_operation(name)(weight_shape, - grad_output, input, - padding, stride, - dilation, groups, - *flags) - - @staticmethod - def backward(ctx, grad2_grad_weight): - grad_output, input = ctx.saved_tensors - grad_output_shape = ctx.grad_output_shape - input_shape = ctx.input_shape - grad2_grad_output = None - grad2_input = None - - if ctx.needs_input_grad[0]: - grad2_grad_output = Conv2d.apply(input, grad2_grad_weight, - None) - assert grad2_grad_output.shape == grad_output_shape - - if ctx.needs_input_grad[1]: - p = calc_output_padding( - input_shape=input_shape, output_shape=grad_output_shape) - op = _conv2d_gradfix( - transpose=(not transpose), - weight_shape=weight_shape, - output_padding=p, - **common_kwargs) - grad2_input = op.apply(grad_output, grad2_grad_weight, None) - assert grad2_input.shape == input_shape - - return grad2_grad_output, grad2_input - - _conv2d_gradfix_cache[key] = Conv2d - return Conv2d diff --git a/mmcv/ops/convex_iou.py b/mmcv/ops/convex_iou.py deleted file mode 100644 index 5005036..0000000 --- a/mmcv/ops/convex_iou.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple - -import torch - -from ..utils import ext_loader - -ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou']) - - -def convex_giou(pointsets: torch.Tensor, - polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Return generalized intersection-over-union (Jaccard index) between point - sets and polygons. - - Args: - pointsets (torch.Tensor): It has shape (N, 18), - indicating (x1, y1, x2, y2, ..., x9, y9) for each row. - polygons (torch.Tensor): It has shape (N, 8), - indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. 
- - Returns: - tuple[torch.Tensor, torch.Tensor]: The first element is the gious - between point sets and polygons with the shape (N,). The second - element is the gradient of point sets with the shape (N, 18). - """ - output = pointsets.new_zeros((pointsets.size(0), 19)) - ext_module.convex_giou(pointsets, polygons, output) - convex_giou = output[:, -1] - points_grad = output[:, 0:-1] - return convex_giou, points_grad - - -def convex_iou(pointsets: torch.Tensor, - polygons: torch.Tensor) -> torch.Tensor: - """Return intersection-over-union (Jaccard index) between point sets and - polygons. - - Args: - pointsets (torch.Tensor): It has shape (N, 18), - indicating (x1, y1, x2, y2, ..., x9, y9) for each row. - polygons (torch.Tensor): It has shape (K, 8), - indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. - - Returns: - torch.Tensor: Return the ious between point sets and polygons with the - shape (N, K). - """ - N, K = pointsets.size(0), polygons.size(0) - ious = pointsets.new_zeros((N, K)) - ext_module.convex_iou(pointsets, polygons, ious) - return ious diff --git a/mmcv/ops/corner_pool.py b/mmcv/ops/corner_pool.py index f18e92d..a33d798 100644 --- a/mmcv/ops/corner_pool.py +++ b/mmcv/ops/corner_pool.py @@ -1,37 +1,101 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import torch -from torch import Tensor, nn -from mmengine.utils import digit_version +from torch import nn +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward', + 'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward', + 'right_pool_forward', 'right_pool_backward' +]) + _mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} -def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor: - size = x.size(dim) - output = x.clone() +class TopPoolFunction(Function): - ind = 1 - while ind < size: - if flip: - cur_start = 0 - cur_len = size - ind - next_start = ind - next_len = size - ind - else: - cur_start = ind - cur_len = size - ind - next_start = 0 - next_len = size - ind + @staticmethod + def symbolic(g, input): + output = g.op( + 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top'])) + return output + + @staticmethod + def forward(ctx, input): + output = ext_module.top_pool_forward(input) + ctx.save_for_backward(input) + return output + + @staticmethod + def backward(ctx, grad_output): + input, = ctx.saved_tensors + output = ext_module.top_pool_backward(input, grad_output) + return output + + +class BottomPoolFunction(Function): + + @staticmethod + def symbolic(g, input): + output = g.op( + 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom'])) + return output + + @staticmethod + def forward(ctx, input): + output = ext_module.bottom_pool_forward(input) + ctx.save_for_backward(input) + return output + + @staticmethod + def backward(ctx, grad_output): + input, = ctx.saved_tensors + output = ext_module.bottom_pool_backward(input, grad_output) + return output - # max_temp should be cloned for backward computation - max_temp = output.narrow(dim, cur_start, cur_len).clone() - cur_temp = output.narrow(dim, cur_start, cur_len) - next_temp = output.narrow(dim, next_start, next_len) - cur_temp[...] 
= torch.where(max_temp > next_temp, max_temp, next_temp) +class LeftPoolFunction(Function): - ind = ind << 1 + @staticmethod + def symbolic(g, input): + output = g.op( + 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left'])) + return output - return output + @staticmethod + def forward(ctx, input): + output = ext_module.left_pool_forward(input) + ctx.save_for_backward(input) + return output + + @staticmethod + def backward(ctx, grad_output): + input, = ctx.saved_tensors + output = ext_module.left_pool_backward(input, grad_output) + return output + + +class RightPoolFunction(Function): + + @staticmethod + def symbolic(g, input): + output = g.op( + 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right'])) + return output + + @staticmethod + def forward(ctx, input): + output = ext_module.right_pool_forward(input) + ctx.save_for_backward(input) + return output + + @staticmethod + def backward(ctx, grad_output): + input, = ctx.saved_tensors + output = ext_module.right_pool_backward(input, grad_output) + return output class CornerPool(nn.Module): @@ -40,13 +104,11 @@ class CornerPool(nn.Module): Corner Pooling is a new type of pooling layer that helps a convolutional network better localize corners of bounding boxes. - Please refer to `CornerNet: Detecting Objects as Paired Keypoints - `_ for more details. - + Please refer to https://arxiv.org/abs/1808.01244 for more details. Code is modified from https://github.com/princeton-vl/CornerNet-Lite. Args: - mode (str): Pooling orientation for the pooling layer + mode(str): Pooling orientation for the pooling layer - 'bottom': Bottom Pooling - 'left': Left Pooling @@ -57,6 +119,13 @@ class CornerPool(nn.Module): Feature map after pooling. 
""" + pool_functions = { + 'bottom': BottomPoolFunction, + 'left': LeftPoolFunction, + 'right': RightPoolFunction, + 'top': TopPoolFunction, + } + cummax_dim_flip = { 'bottom': (2, False), 'left': (3, True), @@ -64,13 +133,23 @@ class CornerPool(nn.Module): 'top': (2, True), } - def __init__(self, mode: str): - super().__init__() - assert mode in self.cummax_dim_flip + def __init__(self, mode): + super(CornerPool, self).__init__() + assert mode in self.pool_functions self.mode = mode + self.corner_pool = self.pool_functions[mode] + + def forward(self, x): + if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0': + if torch.onnx.is_in_onnx_export(): + assert torch.__version__ >= '1.7.0', \ + 'When `cummax` serves as an intermediate component whose '\ + 'outputs is used as inputs for another modules, it\'s '\ + 'expected that pytorch version must be >= 1.7.0, '\ + 'otherwise Error appears like: `RuntimeError: tuple '\ + 'appears in op that does not forward tuples, unsupported '\ + 'kind: prim::PythonOp`.' - def forward(self, x: Tensor) -> Tensor: - if torch.__version__ != 'parrots' and digit_version(torch.__version__) >= digit_version('1.5.0'): dim, flip = self.cummax_dim_flip[self.mode] if flip: x = x.flip(dim) @@ -79,5 +158,4 @@ class CornerPool(nn.Module): pool_tensor = pool_tensor.flip(dim) return pool_tensor else: - dim, flip = self.cummax_dim_flip[self.mode] - return _corner_pool(x, dim, flip) + return self.corner_pool.apply(x) diff --git a/mmcv/ops/correlation.py b/mmcv/ops/correlation.py index 319b764..3d0b79c 100644 --- a/mmcv/ops/correlation.py +++ b/mmcv/ops/correlation.py @@ -1,6 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Tuple - import torch from torch import Tensor, nn from torch.autograd import Function @@ -17,14 +15,14 @@ class CorrelationFunction(Function): @staticmethod def forward(ctx, - input1: Tensor, - input2: Tensor, - kernel_size: int = 1, - max_displacement: int = 1, - stride: int = 1, - padding: int = 1, - dilation: int = 1, - dilation_patch: int = 1) -> Tensor: + input1, + input2, + kernel_size=1, + max_displacement=1, + stride=1, + padding=1, + dilation=1, + dilation_patch=1): ctx.save_for_backward(input1, input2) @@ -62,9 +60,7 @@ class CorrelationFunction(Function): @staticmethod @once_differentiable - def backward( - ctx, grad_output: Tensor - ) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]: + def backward(ctx, grad_output): input1, input2 = ctx.saved_tensors kH, kW = ctx.kernel_size diff --git a/mmcv/ops/csrc/README.md b/mmcv/ops/csrc/README.md index 8fcc6eb..3bc0200 100644 --- a/mmcv/ops/csrc/README.md +++ b/mmcv/ops/csrc/README.md @@ -13,150 +13,158 @@ This folder contains all non-python code for MMCV custom ops. Please follow the │ ├── pytorch_cpp_helper.hpp │ ├── pytorch_cuda_helper.hpp │ ├── pytorch_device_registry.hpp -│   ├── cuda -│   │ ├── common_cuda_helper.hpp -│   │ ├── parrots_cudawarpfunction.cuh -│   │ ├── ... -│   │ └── ops_cuda_kernel.cuh -|   ├── mps -│   │ ├── MPSLibrary.h -│   │ ├── ... -│   │ └── MPSUtils.h -|   ├── mlu -│   │ └── ... -|   └── utils -│   │ └── ... +│   └── cuda +│   ├── common_cuda_helper.hpp +│   ├── parrots_cudawarpfunction.cuh +│   ├── ... +│   └── ops_cuda_kernel.cuh +├── onnxruntime +│   ├── onnxruntime_register.h +│   ├── onnxruntime_session_options_config_keys.h +│   ├── ort_mmcv_utils.h +│   ├── ... +│   ├── onnx_ops.h +│   └── cpu +│ ├── onnxruntime_register.cpp +│      ├── ... +│      └── onnx_ops_impl.cpp ├── parrots │   ├── ... │   ├── ops.cpp │   ├── ops_parrots.cpp │   └── ops_pytorch.h -└── pytorch -    ├── info.cpp -    ├── pybind.cpp -    ├── ... 
-    ├── ops.cpp -    ├── cuda -    │   ├── ... -    │   └── ops_cuda.cu -    ├── cpu -    │   ├── ... -    │   └── ops.cpp -    ├── mps -    │   ├── ... -    |   └── op_mps.mm -    └── mlu -       ├── ... -       └── op_mlu.cpp +├── pytorch +│   ├── info.cpp +│   ├── pybind.cpp +│   ├── ... +│   ├── ops.cpp +│   ├── cuda +│   │   ├── ... +│   │   └── ops_cuda.cu +│   └── cpu +│      ├── ... +│      └── ops.cpp +└── tensorrt + ├── trt_cuda_helper.cuh + ├── trt_plugin_helper.hpp + ├── trt_plugin.hpp + ├── trt_serialize.hpp + ├── ... + ├── trt_ops.hpp + └── plugins +    ├── trt_cuda_helper.cu +    ├── trt_plugin.cpp +    ├── ... +    ├── trt_ops.cpp +    └── trt_ops_kernel.cu ``` ## Components - `common`: This directory contains all tools and shared codes. - `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax. - - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**. - - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device. - - `utils`: The kernels and utils of spconv. +- `onnxruntime`: **ONNX Runtime** support for custom ops. + - `cpu`: CPU implementation of supported ops. - `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory. - `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory. - `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops. - `cpu`: This directory contain cpu implementations of corresponding custom ops. - - `mlu`: This directory contain launchers of each MLU kernels. - - `mps`: MPS ops implementation and launchers. +- `tensorrt`: **TensorRT** support for custom ops. 
+ - `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`. ## How to add new PyTorch ops? 1. (Optional) Add shared kernel in `common` to support special hardware platform. - ```c++ - // src/common/cuda/new_ops_cuda_kernel.cuh - - template - __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) { - // forward here - } - - ``` - - Add cuda kernel launcher in `pytorch/cuda`. - - ```c++ - // src/pytorch/cuda - #include - - void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){ - // initialize - at::cuda::CUDAGuard device_guard(input.device()); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - ... - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] { - new_ops_forward_cuda_kernel - <<>>( - input.data_ptr(), output.data_ptr(),...); - })); - AT_CUDA_CHECK(cudaGetLastError()); - } - ``` + ```c++ + // src/common/cuda/new_ops_cuda_kernel.cuh + + template + __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) { + // forward here + } + + ``` + + Add cuda kernel launcher in `pytorch/cuda`. + + ```c++ + // src/pytorch/cuda + #include + + void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){ + // initialize + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + ... + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] { + new_ops_forward_cuda_kernel + <<>>( + input.data_ptr(), output.data_ptr(),...); + })); + AT_CUDA_CHECK(cudaGetLastError()); + } + ``` 2. Register implementation for different devices. - ```c++ - // src/pytorch/cuda/cudabind.cpp - ... + ```c++ + // src/pytorch/cuda/cudabind.cpp + ... 
- Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){ - // implement cuda forward here - // use `NewOpsForwardCUDAKernelLauncher` here - } - // declare interface here. - Tensor new_ops_forward_impl(Tensor input, Tensor output, ...); - // register the implementation for given device (CUDA here). - REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda); - ``` + Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){ + // implement cuda forward here + // use `NewOpsForwardCUDAKernelLauncher` here + } + // declare interface here. + Tensor new_ops_forward_impl(Tensor input, Tensor output, ...); + // register the implementation for given device (CUDA here). + REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda); + ``` 3. Add ops implementation in `pytorch` directory. Select different implementations according to device type. - ```c++ - // src/pytorch/new_ops.cpp - Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){ - // dispatch the implementation according to the device type of input. - DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...); - } - ... + ```c++ + // src/pytorch/new_ops.cpp + Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){ + // dispatch the implementation according to the device type of input. + DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...); + } + ... - Tensor new_ops_forward(Tensor input, Tensor output, ...){ - return new_ops_forward_impl(input, output, ...); - } - ``` + Tensor new_ops_forward(Tensor input, Tensor output, ...){ + return new_ops_forward_impl(input, output, ...); + } + ``` 4. Binding the implementation in `pytorch/pybind.cpp` - ```c++ - // src/pytorch/pybind.cpp + ```c++ + // src/pytorch/pybind.cpp - ... + ... - Tensor new_ops_forward(Tensor input, Tensor output, ...); + Tensor new_ops_forward(Tensor input, Tensor output, ...); - ... + ... 
- // bind with pybind11 - m.def("new_ops_forward", &new_ops_forward, "new_ops_forward", - py::arg("input"), py::arg("output"), ...); + // bind with pybind11 + m.def("new_ops_forward", &new_ops_forward, "new_ops_forward", + py::arg("input"), py::arg("output"), ...); - ... + ... - ``` + ``` 5. Build MMCV again. Enjoy new ops in python - ```python - from ..utils import ext_loader - ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) + ```python + from ..utils import ext_loader + ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) - ... + ... - ext_module.new_ops_forward(input, output, ...) + ext_module.new_ops_forward(input, output, ...) - ``` + ``` diff --git a/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp b/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp index a8453ea..67190dc 100644 --- a/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp +++ b/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp @@ -220,10 +220,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], return temp > 0; } }); - // compute distance to origin after sort, since the points are now different. 
- for (int i = 0; i < num_in; i++) { - dist[i] = dot_2d(q[i], q[i]); - } #endif // Step 4: @@ -270,17 +266,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], return m; } -template -HOST_DEVICE_INLINE T quadri_box_area(const Point (&q)[4]) { - T area = 0; -#pragma unroll - for (int i = 1; i < 3; i++) { - area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); - } - - return area / 2.0; -} - template HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int& m) { if (m <= 2) { @@ -319,25 +304,6 @@ HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox& box1, return polygon_area(orderedPts, num_convex); } -template -HOST_DEVICE_INLINE T quadri_boxes_intersection(const Point (&pts1)[4], - const Point (&pts2)[4]) { - // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned - // from rotated_rect_intersection_pts - Point intersectPts[24], orderedPts[24]; - - int num = get_intersection_points(pts1, pts2, intersectPts); - - if (num <= 2) { - return 0.0; - } - - // Convex Hull to order the intersection points in clockwise order and find - // the contour area. 
- int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); - return polygon_area(orderedPts, num_convex); -} - } // namespace template @@ -375,52 +341,3 @@ HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw, const T iou = intersection / baseS; return iou; } - -template -HOST_DEVICE_INLINE T single_box_iou_quadri(T const* const pts1_raw, - T const* const pts2_raw, - const int mode_flag) { - // shift center to the middle point to achieve higher precision in result - Point pts1[4], pts2[4]; - - auto center_shift_x = - (pts1_raw[0] + pts2_raw[0] + pts1_raw[2] + pts2_raw[2] + pts1_raw[4] + - pts2_raw[4] + pts1_raw[6] + pts2_raw[6]) / - 8.0; - auto center_shift_y = - (pts1_raw[1] + pts2_raw[1] + pts1_raw[3] + pts2_raw[3] + pts1_raw[5] + - pts2_raw[5] + pts1_raw[7] + pts2_raw[7]) / - 8.0; - pts1[0].x = pts1_raw[0] - center_shift_x; - pts1[0].y = pts1_raw[1] - center_shift_y; - pts1[1].x = pts1_raw[2] - center_shift_x; - pts1[1].y = pts1_raw[3] - center_shift_y; - pts1[2].x = pts1_raw[4] - center_shift_x; - pts1[2].y = pts1_raw[5] - center_shift_y; - pts1[3].x = pts1_raw[6] - center_shift_x; - pts1[3].y = pts1_raw[7] - center_shift_y; - pts2[0].x = pts2_raw[0] - center_shift_x; - pts2[0].y = pts2_raw[1] - center_shift_y; - pts2[1].x = pts2_raw[2] - center_shift_x; - pts2[1].y = pts2_raw[3] - center_shift_y; - pts2[2].x = pts2_raw[4] - center_shift_x; - pts2[2].y = pts2_raw[5] - center_shift_y; - pts2[3].x = pts2_raw[6] - center_shift_x; - pts2[3].y = pts2_raw[7] - center_shift_y; - - const T area1 = quadri_box_area(pts1); - const T area2 = quadri_box_area(pts2); - if (area1 < 1e-14 || area2 < 1e-14) { - return 0.f; - } - - const T intersection = quadri_boxes_intersection(pts1, pts2); - T baseS = 1.0; - if (mode_flag == 0) { - baseS = (area1 + area2 - intersection); - } else if (mode_flag == 1) { - baseS = area1; - } - const T iou = intersection / baseS; - return iou; -} diff --git 
a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh deleted file mode 100644 index 36e4110..0000000 --- a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. -// Modified from -// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu -#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH -#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH - -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif - -template -__global__ void active_rotated_filter_forward_cuda_kernel( - const int nthreads, const scalar_t* weight_data, const int* indices_data, - const int num_input_planes, const int num_output_planes, - const int num_orientations, const int num_rotations, const int nEntry, - scalar_t* output_data) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - int l = index % nEntry; - int j = (index / nEntry) % num_input_planes; - int i = index / nEntry / num_input_planes; - int k; - scalar_t val = *(weight_data + index); - for (k = 0; k < num_rotations; k++) { - int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; - scalar_t* target = output_data + - i * (num_rotations * num_input_planes * nEntry) + - k * (num_input_planes * nEntry) + j * (nEntry) + idx; - *target = val; - } - } -} - -template -__global__ void active_rotated_filter_backward_cuda_kernel( - const int nthreads, const scalar_t* gradWeight_data, - const int* indices_data, const int num_input_planes, - const int num_output_planes, const int num_orientations, - const int num_rotations, const int nEntry, scalar_t* weight_data) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - int l = index % nEntry; - int j = (index / nEntry) % num_input_planes; - int i = index / nEntry / num_input_planes; - int k; - scalar_t* val = weight_data + index; - *val = 0; - scalar_t tmp = 0; 
- for (k = 0; k < num_rotations; k++) { - int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; - scalar_t target = - *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) + - k * (num_input_planes * nEntry) + j * (nEntry) + idx); - tmp = tmp + target; - } - *val = tmp; - } -} -#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh index 9f92508..056d123 100644 --- a/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh @@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel( const int O, const int aggregate, const T* points, const T* centers, const T* scores, const int64_t* knn_idx, T* output) { // ----- parallel loop for B, N1, K and O --------- - CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) { - // ------- loop for M ---------- - const int b = (int)(i / (O * N1 * K)); - const int o = (int)(i % (O * N1 * K) / (N1 * K)); - const int n = (int)(i % (N1 * K) / K); - const int k = (int)(i % K); - const int cn = (int)knn_idx[b * K * N1 + n * K + - 0]; // The first neighbor is the center point - const int kn = (int)knn_idx[b * K * N1 + n * K + k]; - if (kn >= N0 || - kn < 0) { // if index overflows, it is out of the neighborhood range - return; - } - assert(b < B); - assert(kn < N0); - assert(cn < N0); - assert(o < O); - assert(n < N1); - const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; - T val = output[out_idx]; - for (int m = 0; m < M; m++) { - val += points[b * N0 * M * O + kn * M * O + m * O + o] * - scores[b * N1 * K * M + n * K * M + k * M + m] - - centers[b * N0 * M * O + cn * M * O + m * O + o] * - scores[b * N1 * K * M + n * K * M + k * M + m]; - } - output[out_idx] = val; + long i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= B * N1 * K * O) return; + // ------- loop for M ---------- + const int b = (int)(i / 
(O * N1 * K)); + const int o = (int)(i % (O * N1 * K) / (N1 * K)); + const int n = (int)(i % (N1 * K) / K); + const int k = (int)(i % K); + const int cn = (int)knn_idx[b * K * N1 + n * K + + 0]; // The first neighbor is the center point + const int kn = (int)knn_idx[b * K * N1 + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } + assert(b < B); + assert(kn < N0); + assert(cn < N0); + assert(o < O); + assert(n < N1); + const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; + T val = output[out_idx]; + for (int m = 0; m < M; m++) { + val += points[b * N0 * M * O + kn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m] - + centers[b * N0 * M * O + cn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m]; } + output[out_idx] = val; } template @@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel( const int O, const int aggregate, const T* grad_out, const T* scores, const int64_t* knn_idx, T* grad_points, T* grad_centers) { // ----- parallel loop for B, M, O --------- - CUDA_1D_KERNEL_LOOP(i, B * M * O) { - int b = (int)(i / (M * O)); - int m = (int)(i % (M * O) / O); - int o = (int)(i % O); + long i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= B * M * O) return; + int b = (int)(i / (M * O)); + int m = (int)(i % (M * O) / O); + int o = (int)(i % O); - // ----- loop for N,K --------- - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k++) { - int kn = knn_idx[b * N * K + n * K + k]; - int cn = knn_idx[b * N * K + n * K + 0]; - if (kn >= N0 || kn < 0) { // if index overflows, it is out of the - // neighborhood range - continue; - } - atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, - scores[b * N * K * M + n * K * M + k * M + m] * - grad_out[b * O * N * K + o * N * K + n * K + k]); - atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o, - -scores[b * N * K * M + n * K * M + k * M + m] * - 
grad_out[b * O * N * K + o * N * K + n * K + k]); + // ----- loop for N,K --------- + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + int kn = knn_idx[b * N * K + n * K + k]; + int cn = knn_idx[b * N * K + n * K + 0]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + continue; } + atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, + scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); + atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o, + -scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); } } } @@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel( const int O, const int aggregate, const T* grad_out, const T* points, const T* centers, const int64_t* knn_idx, T* grad_scores) { // ----- parallel loop for B, N, K, M --------- - CUDA_1D_KERNEL_LOOP(i, B * N * K * M) { - const int b = (int)(i / (N * M * K)); - const int n = (int)(i % (N * M * K) / M / K); - const int k = (int)(i % (M * K) / M); - const int m = (int)(i % M); - const int cn = knn_idx[b * N * K + n * K + 0]; - const int kn = knn_idx[b * N * K + n * K + k]; - if (kn >= N0 || - kn < 0) { // if index overflows, it is out of the neighborhood range - return; - } + long i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= B * N * K * M) return; + const int b = (int)(i / (N * M * K)); + const int n = (int)(i % (N * M * K) / M / K); + const int k = (int)(i % (M * K) / M); + const int m = (int)(i % M); + const int cn = knn_idx[b * N * K + n * K + 0]; + const int kn = knn_idx[b * N * K + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } - // -------------- loop for O ------------------------ - const int out_idx = b * N * K * M + n * K * M + k * M + m; - T val = grad_scores[out_idx]; - for (int o = 0; o < O; o++) { - val += (points[b * N0 * M * 
O + kn * M * O + m * O + o] - - centers[b * N0 * M * O + cn * M * O + m * O + o]) * - grad_out[b * O * N * K + o * N * K + n * K + k]; - } - grad_scores[out_idx] = val; + // -------------- loop for O ------------------------ + const int out_idx = b * N * K * M + n * K * M + k * M + m; + T val = grad_scores[out_idx]; + for (int o = 0; o < O; o++) { + val += (points[b * N0 * M * O + kn * M * O + m * O + o] - + centers[b * N0 * M * O + cn * M * O + m * O + o]) * + grad_out[b * O * N * K + o * N * K + n * K + k]; } + grad_scores[out_idx] = val; } #endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh index 632b5c4..ba2af01 100644 --- a/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh @@ -21,36 +21,35 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m, // output: // idx: (B, M, nsample) int bs_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(pt_idx, m) { - if (bs_idx >= b) return; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || pt_idx >= m) return; - new_xyz += bs_idx * m * 3 + pt_idx * 3; - xyz += bs_idx * n * 3; - idx += bs_idx * m * nsample + pt_idx * nsample; + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; - float max_radius2 = max_radius * max_radius; - float min_radius2 = min_radius * min_radius; - T new_x = new_xyz[0]; - T new_y = new_xyz[1]; - T new_z = new_xyz[2]; + float max_radius2 = max_radius * max_radius; + float min_radius2 = min_radius * min_radius; + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; - int cnt = 0; - for (int k = 0; k < n; ++k) { - T x = xyz[k * 3 + 0]; - T y = xyz[k * 3 + 1]; - T z = xyz[k * 3 + 2]; - T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + - (new_z - z) * (new_z - z); - if (d2 == 0 || (d2 >= min_radius2 && d2 < 
max_radius2)) { - if (cnt == 0) { - for (int l = 0; l < nsample; ++l) { - idx[l] = k; - } + int cnt = 0; + for (int k = 0; k < n; ++k) { + T x = xyz[k * 3 + 0]; + T y = xyz[k * 3 + 1]; + T z = xyz[k * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[l] = k; } - idx[cnt] = k; - ++cnt; - if (cnt >= nsample) break; } + idx[cnt] = k; + ++cnt; + if (cnt >= nsample) break; } } } diff --git a/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh index 15bd91e..249c9e8 100644 --- a/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh @@ -8,27 +8,6 @@ #include "pytorch_cuda_helper.hpp" #endif -template -__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1, - T& y1, T& x2, T& y2) { - x1 = bbox[base]; - y1 = bbox[base + 1]; - x2 = bbox[base + 2]; - y2 = bbox[base + 3]; -} - -template <> -__device__ __forceinline__ void load_bbox(const float* bbox, - const int base, float& x1, - float& y1, float& x2, - float& y2) { - const float4 bbox_offset = reinterpret_cast(bbox + base)[0]; - x1 = bbox_offset.x; - y1 = bbox_offset.y; - x2 = bbox_offset.z; - y2 = bbox_offset.w; -} - template __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, T* ious, const int num_bbox1, @@ -37,111 +16,69 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, const int offset) { if (aligned) { CUDA_1D_KERNEL_LOOP(index, num_bbox1) { - const int b1 = index; - const int b2 = index; - - const int base1 = b1 << 2; // b1 * 4 - T b1_x1, b1_y1, b1_x2, b1_y2; - load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); - const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); - - const int base2 = b2 << 2; // b2 * 4 - T b2_x1, b2_y1, b2_x2, 
b2_y2; - load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); - const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); - - const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); - const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); - const T width = fmaxf(right - left + offset, 0.f); - const T height = fmaxf(bottom - top + offset, 0.f); - const T interS = width * height; - - const T baseS = - fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); + int b1 = index; + int b2 = index; + + int base1 = b1 * 4; + T b1_x1 = bbox1[base1]; + T b1_y1 = bbox1[base1 + 1]; + T b1_x2 = bbox1[base1 + 2]; + T b1_y2 = bbox1[base1 + 3]; + T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + int base2 = b2 * 4; + T b2_x1 = bbox2[base2]; + T b2_y1 = bbox2[base2 + 1]; + T b2_x2 = bbox2[base2 + 2]; + T b2_y2 = bbox2[base2 + 3]; + T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); + + T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + T width = fmaxf(right - left + offset, 0.f); + T height = fmaxf(bottom - top + offset, 0.f); + T interS = width * height; + T baseS = 1.0; + if (mode == 0) { + baseS = fmaxf(b1_area + b2_area - interS, T(offset)); + } else if (mode == 1) { + baseS = fmaxf(b1_area, T(offset)); + } ious[index] = interS / baseS; } } else { CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) { - const int b1 = index / num_bbox2; - const int b2 = index % num_bbox2; - - const int base1 = b1 << 2; // b1 * 4 - T b1_x1, b1_y1, b1_x2, b1_y2; - load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); - const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); - - const int base2 = b2 << 2; // b2 * 4 - T b2_x1, b2_y1, b2_x2, b2_y2; - load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); - const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); - - const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); - 
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); - const T width = fmaxf(right - left + offset, 0.f); - const T height = fmaxf(bottom - top + offset, 0.f); - const T interS = width * height; - - const T baseS = - fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); + int b1 = index / num_bbox2; + int b2 = index % num_bbox2; + + int base1 = b1 * 4; + T b1_x1 = bbox1[base1]; + T b1_y1 = bbox1[base1 + 1]; + T b1_x2 = bbox1[base1 + 2]; + T b1_y2 = bbox1[base1 + 3]; + T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + int base2 = b2 * 4; + T b2_x1 = bbox2[base2]; + T b2_y1 = bbox2[base2 + 1]; + T b2_x2 = bbox2[base2 + 2]; + T b2_y2 = bbox2[base2 + 3]; + T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); + + T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + T width = fmaxf(right - left + offset, 0.f); + T height = fmaxf(bottom - top + offset, 0.f); + T interS = width * height; + T baseS = 1.0; + if (mode == 0) { + baseS = fmaxf(b1_area + b2_area - interS, T(offset)); + } else if (mode == 1) { + baseS = fmaxf(b1_area, T(offset)); + } ious[index] = interS / baseS; } } } -#if __CUDA_ARCH__ >= 530 -__device__ __forceinline__ __half __half_area(const __half x1, const __half y1, - const __half x2, const __half y2, - const __half offset) { - const __half half_w = __hadd(__hsub(x2, x1), offset); - const __half half_h = __hadd(__hsub(y2, y1), offset); - return __hmul(half_w, half_h); -} - -__device__ __forceinline__ __half __half_max(const __half a, const __half b) { - return __hge(a, b) ? a : b; -} - -__device__ __forceinline__ __half __half_min(const __half a, const __half b) { - return __hle(a, b) ? a : b; -} - -// fp16 won't provide much increase when aligned==true. It is useful when -// aligned==false, which would give you ~40% bonus. 
-__device__ void bbox_overlaps_cuda_kernel_half( - const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1, - const int num_bbox2, const int mode, const bool aligned, const int offset) { - const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2; - const __half h_offset = __int2half_rn(offset); - CUDA_1D_KERNEL_LOOP(index, num_output) { - const int b1 = aligned ? index : index / num_bbox2; - const int b2 = aligned ? index : index % num_bbox2; - - const int base1 = b1 << 2; - __half b1_x1, b1_y1, b1_x2, b1_y2; - load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); - const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset); - - const int base2 = b2 << 2; - __half b2_x1, b2_y1, b2_x2, b2_y2; - load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); - const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset); - - const __half left = __half_max(b1_x1, b2_x1), - right = __half_min(b1_x2, b2_x2); - const __half top = __half_max(b1_y1, b2_y1), - bottom = __half_min(b1_y2, b2_y2); - const __half width = - __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f)); - const __half height = - __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f)); - const __half interS = __hmul(width, height); - - const __half baseS = __half_max( - mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area, - h_offset); - ious[index] = __hdiv(interS, baseS); - } -} -#endif // __CUDA_ARCH__ >= 530 - #endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh deleted file mode 100644 index 5376104..0000000 --- a/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) OpenMMLab. 
All rights reserved -// Modified from -// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu -#ifndef BEZIER_ALIGN_CUDA_KERNEL_CUH -#define BEZIER_ALIGN_CUDA_KERNEL_CUH - -#include -#ifdef MMCV_WITH_TRT -#include "common_cuda_helper.hpp" -#else // MMCV_WITH_TRT -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else // MMCV_USE_PARROTS -#include "pytorch_cuda_helper.hpp" -#endif // MMCV_USE_PARROTS -#endif // MMCV_WITH_TRT - -template -__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3, - const T u) { - return ((1. - u) * (1. - u) * (1. - u) * p0 + - 3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. - u) * p2 + - u * u * u * p3); -} - -template -__global__ void bezier_align_forward_cuda_kernel( - const int nthreads, - const T *bottom_data, // inputs - const T *bottom_rois, // bottom rois contains the bezier curve - T *top_data, // outputs - const int pooled_height, const int pooled_width, const T spatial_scale, - const int sampling_ratio, bool aligned, const int channels, - const int height, const int width) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - // beziers have size Nx(1+8*2) = Nx17 - const T *offset_bottom_rois = bottom_rois + n * 17; - int roi_batch_ind = offset_bottom_rois[0]; - - // Do not use rounding; this implementation detail is critical - T offset = aligned ? 
(T)0.5 : (T)0.0; - - // TODO: avoid this by using parallel annotation, for good - T p0_x = offset_bottom_rois[1] * spatial_scale; - T p0_y = offset_bottom_rois[2] * spatial_scale; - T p1_x = offset_bottom_rois[3] * spatial_scale; - T p1_y = offset_bottom_rois[4] * spatial_scale; - T p2_x = offset_bottom_rois[5] * spatial_scale; - T p2_y = offset_bottom_rois[6] * spatial_scale; - T p3_x = offset_bottom_rois[7] * spatial_scale; - T p3_y = offset_bottom_rois[8] * spatial_scale; - T p4_x = offset_bottom_rois[15] * spatial_scale; - T p4_y = offset_bottom_rois[16] * spatial_scale; - T p5_x = offset_bottom_rois[13] * spatial_scale; - T p5_y = offset_bottom_rois[14] * spatial_scale; - T p6_x = offset_bottom_rois[11] * spatial_scale; - T p6_y = offset_bottom_rois[12] * spatial_scale; - T p7_x = offset_bottom_rois[9] * spatial_scale; - T p7_y = offset_bottom_rois[10] * spatial_scale; - - // compute the coords - const T u = pw / static_cast(pooled_width); - const T v = ph / static_cast(pooled_height); - const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u); - const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u); - const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u); - const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u); - const T x_center = x1 * v + x0 * (1. - v) - offset; - const T y_center = y1 * v + y0 * (1. - v) - offset; - - T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x)); - T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y)); - if (!aligned) { // for backward-compatibility only - roi_width = max(roi_width, (T)1.); - roi_height = max(roi_height, (T)1.); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - const T *offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sampling_ratio > 0) - ? 
sampling_ratio - : ceil(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - // We do average (integral) pooling inside a bin - // When the grid is empty, output zeros == 0/1, instead of NaN. - const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 - - T output_val = 0.; - for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 - { - const T y = y_center - (T)0.5 * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = x_center - (T)0.5 * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, - index); - output_val += val; - } - } - output_val /= count; - - top_data[index] = output_val; - } -} - -template -__global__ void bezier_align_backward_cuda_kernel( - const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff, - const int pooled_height, const int pooled_width, const T spatial_scale, - const int sampling_ratio, bool aligned, const int channels, - const int height, const int width) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - // beziers have size Nx(1+8*2) = Nx17 - const T *offset_bottom_rois = bottom_rois + n * 17; - int roi_batch_ind = offset_bottom_rois[0]; - - // Do not use rounding; this implementation detail is critical - T offset = aligned ? 
(T)0.5 : (T)0.0; - T p0_x = offset_bottom_rois[1] * spatial_scale; - T p0_y = offset_bottom_rois[2] * spatial_scale; - T p1_x = offset_bottom_rois[3] * spatial_scale; - T p1_y = offset_bottom_rois[4] * spatial_scale; - T p2_x = offset_bottom_rois[5] * spatial_scale; - T p2_y = offset_bottom_rois[6] * spatial_scale; - T p3_x = offset_bottom_rois[7] * spatial_scale; - T p3_y = offset_bottom_rois[8] * spatial_scale; - T p4_x = offset_bottom_rois[15] * spatial_scale; - T p4_y = offset_bottom_rois[16] * spatial_scale; - T p5_x = offset_bottom_rois[13] * spatial_scale; - T p5_y = offset_bottom_rois[14] * spatial_scale; - T p6_x = offset_bottom_rois[11] * spatial_scale; - T p6_y = offset_bottom_rois[12] * spatial_scale; - T p7_x = offset_bottom_rois[9] * spatial_scale; - T p7_y = offset_bottom_rois[10] * spatial_scale; - - // compute the coords - const T u = pw / static_cast(pooled_width); - const T v = ph / static_cast(pooled_height); - const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u); - const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u); - const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u); - const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u); - const T x_center = x1 * v + x0 * (1. - v) - offset; - const T y_center = y1 * v + y0 * (1. 
- v) - offset; - - T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x)); - T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y)); - if (!aligned) { // for backward-compatibility only - roi_width = max(roi_width, (T)1.); - roi_height = max(roi_height, (T)1.); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - T *offset_bottom_diff = - bottom_diff + (roi_batch_ind * channels + c) * height * width; - - int top_offset = (n * channels + c) * pooled_height * pooled_width; - const T *offset_top_diff = top_diff + top_offset; - const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - // We do average (integral) pooling inside a bin - const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 - - for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 - { - const T y = y_center - (T)0.5 * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = x_center - (T)0.5 * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - - bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, - x_low, x_high, y_low, y_high, index); - - T g1 = top_diff_this_bin * w1 / count; - T g2 = top_diff_this_bin * w2 / count; - T g3 = top_diff_this_bin * w3 / count; - T g4 = top_diff_this_bin * w4 / count; - - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - atomicAdd(offset_bottom_diff + y_low * width + x_low, - static_cast(g1)); - atomicAdd(offset_bottom_diff + y_low * width + x_high, - static_cast(g2)); - atomicAdd(offset_bottom_diff + y_high * width + x_low, - static_cast(g3)); - atomicAdd(offset_bottom_diff + y_high * width + x_high, - static_cast(g4)); - } // if - } // ix - } // iy - } // CUDA_1D_KERNEL_LOOP -} // BezierAlignBackward - -#endif // BEZIER_ALIGN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh b/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh deleted file mode 100644 index cf8ad5e..0000000 --- a/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved -#ifndef BOX_IOU_QUADRI_CUDA_CUH -#define BOX_IOU_QUADRI_CUDA_CUH - -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif -#include "box_iou_rotated_utils.hpp" - -// 2D block with 32 * 16 = 512 threads per block -const int BLOCK_DIM_X = 32; -const int BLOCK_DIM_Y = 16; - -inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } - -template -__global__ void box_iou_quadri_cuda_kernel( - const int n_boxes1, const int n_boxes2, const T* dev_boxes1, - const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) { - if (aligned) { - CUDA_1D_KERNEL_LOOP(index, n_boxes1) { - int b1 = index; - int b2 = index; - - int base1 = b1 * 8; - - float block_boxes1[8]; - float block_boxes2[8]; - - block_boxes1[0] = dev_boxes1[base1 + 0]; - block_boxes1[1] = dev_boxes1[base1 + 1]; - block_boxes1[2] = dev_boxes1[base1 + 2]; - block_boxes1[3] = dev_boxes1[base1 + 3]; - block_boxes1[4] = dev_boxes1[base1 + 4]; - block_boxes1[5] = dev_boxes1[base1 + 5]; - block_boxes1[6] = dev_boxes1[base1 + 6]; - block_boxes1[7] = dev_boxes1[base1 + 7]; - - int base2 = b2 * 8; - - block_boxes2[0] = dev_boxes2[base2 + 0]; - block_boxes2[1] = dev_boxes2[base2 + 1]; - block_boxes2[2] = dev_boxes2[base2 + 2]; - block_boxes2[3] = dev_boxes2[base2 + 3]; - block_boxes2[4] = dev_boxes2[base2 + 4]; - block_boxes2[5] = dev_boxes2[base2 + 5]; - block_boxes2[6] = dev_boxes2[base2 + 6]; - block_boxes2[7] = dev_boxes2[base2 + 7]; - - dev_ious[index] = - single_box_iou_quadri(block_boxes1, block_boxes2, mode_flag); - } - } else { - CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) { - int b1 = index / n_boxes2; - int b2 = index % n_boxes2; - - int base1 = b1 * 8; - - float block_boxes1[8]; - float block_boxes2[8]; - - block_boxes1[0] = dev_boxes1[base1 + 0]; - block_boxes1[1] = dev_boxes1[base1 + 1]; - block_boxes1[2] = dev_boxes1[base1 + 2]; - block_boxes1[3] = dev_boxes1[base1 + 3]; - block_boxes1[4] = 
dev_boxes1[base1 + 4]; - block_boxes1[5] = dev_boxes1[base1 + 5]; - block_boxes1[6] = dev_boxes1[base1 + 6]; - block_boxes1[7] = dev_boxes1[base1 + 7]; - - int base2 = b2 * 8; - - block_boxes2[0] = dev_boxes2[base2 + 0]; - block_boxes2[1] = dev_boxes2[base2 + 1]; - block_boxes2[2] = dev_boxes2[base2 + 2]; - block_boxes2[3] = dev_boxes2[base2 + 3]; - block_boxes2[4] = dev_boxes2[base2 + 4]; - block_boxes2[5] = dev_boxes2[base2 + 5]; - block_boxes2[6] = dev_boxes2[base2 + 6]; - block_boxes2[7] = dev_boxes2[base2 + 7]; - - dev_ious[index] = - single_box_iou_quadri(block_boxes1, block_boxes2, mode_flag); - } - } -} - -#endif diff --git a/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh index 20fd617..07beeda 100644 --- a/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh @@ -8,7 +8,7 @@ #include "pytorch_cuda_helper.hpp" #endif -#ifdef MMCV_WITH_HIP +#ifdef HIP_DIFF #define WARP_SIZE 64 #else #define WARP_SIZE 32 @@ -29,22 +29,22 @@ __device__ inline int Loc2Index(const int n, const int c, const int h, int index = w + (h + (c + n * channel_num) * height) * width; return index; } -#ifndef MMCV_WITH_HIP +#ifndef HIP_DIFF /* TODO: move this to a common place */ template -__device__ inline scalar_t min(scalar_t a, scalar_t b) { +__device__ inline scalar_t mmcv_min(scalar_t a, scalar_t b) { return a < b ? a : b; } template -__device__ inline scalar_t max(scalar_t a, scalar_t b) { +__device__ inline scalar_t mmcv_max(scalar_t a, scalar_t b) { return a > b ? 
a : b; } #endif template __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) -#ifdef MMCV_WITH_HIP +#ifdef HIP_DIFF val += __shfl_down(val, offset); #else val += __shfl_down_sync(FULL_MASK, val, offset); @@ -55,11 +55,11 @@ __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { template <> __device__ __forceinline__ phalf warpReduceSum(phalf val) { for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) -#ifdef MMCV_WITH_HIP - __PHALF(val) += __shfl_down(val, offset); +#ifdef HIP_DIFF + __PHALF(val) += __shfl_down(FULL_MASK, val, offset); #else __PHALF(val) += - __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset); + __shfl_down_sync(FULL_MASK, static_cast<__half>(__PHALF(val)), offset); #endif return val; } @@ -316,7 +316,7 @@ __global__ void CARAFEBackward_Mask(const int num_kernels, output_val += top_diff[top_id] * bottom_data[bottom_id]; } } -#ifdef MMCV_WITH_HIP +#ifdef HIP_DIFF __syncthreads(); #else __syncwarp(); diff --git a/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh deleted file mode 100644 index 89feea4..0000000 --- a/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. 
-// Modified from -// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu -#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH -#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH - -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif - -#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 - -template -__global__ void chamfer_distance_forward_cuda_kernel(int b, int n, - const scalar_t* xyz, int m, - const scalar_t* xyz2, - scalar_t* result, - int* result_i) { - __shared__ scalar_t buf[MAX_SHARED_SCALAR_T]; - for (int i = blockIdx.x; i < b; i += gridDim.x) { - for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) { - int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2; - for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) { - buf[j] = xyz2[(i * m + k2) * 2 + j]; - } - __syncthreads(); - for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { - scalar_t x1 = xyz[(i * n + j) * 2 + 0]; - scalar_t y1 = xyz[(i * n + j) * 2 + 1]; - int best_i = 0; - scalar_t best = 1e10; - int end_ka = end_k & (~2); - if (end_ka == THREADS_PER_BLOCK) { - for (int k = 0; k < THREADS_PER_BLOCK; k += 4) { -#pragma unroll - for (int j = 0; j < 4; ++j) { - scalar_t x2 = buf[(k + j) * 2] - x1; - scalar_t y2 = buf[(k + j) * 2 + 1] - y1; - scalar_t d = x2 * x2 + y2 * y2; - if (d < best) { - best = d; - best_i = k + k2 + j; - } - } - } - } else { - for (int k = 0; k < end_ka; k += 4) { -#pragma unroll - for (int j = 0; j < 4; ++j) { - scalar_t x2 = buf[(k + j) * 2] - x1; - scalar_t y2 = buf[(k + j) * 2 + 1] - y1; - scalar_t d = x2 * x2 + y2 * y2; - if (d < best) { - best = d; - best_i = k + k2 + j; - } - } - } - } - for (int k = end_ka; k < end_k; k++) { - scalar_t x2 = buf[k * 2 + 0] - x1; - scalar_t y2 = buf[k * 2 + 1] - y1; - scalar_t d = x2 * x2 + y2 * y2; - if (k == 0 || d < best) { - best = d; - best_i = k + k2; - } - } - if (k2 == 0 || result[(i * n + j)] > best) { - result[(i * n + j)] = best; - 
result_i[(i * n + j)] = best_i; - } - } - __syncthreads(); - } - } -} - -template -__global__ void chamfer_distance_backward_cuda_kernel( - int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2, - const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1, - scalar_t* grad_xyz2) { - for (int i = blockIdx.x; i < b; i += gridDim.x) { - for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { - scalar_t x1 = xyz1[(i * n + j) * 2 + 0]; - scalar_t y1 = xyz1[(i * n + j) * 2 + 1]; - int j2 = idx1[i * n + j]; - scalar_t x2 = xyz2[(i * m + j2) * 2 + 0]; - scalar_t y2 = xyz2[(i * m + j2) * 2 + 1]; - scalar_t g = grad_dist1[i * n + j] * 2; - atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2)); - atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2)); - atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2))); - atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2))); - } - } -} -#endif // CHAMFER_DISTANCE_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp b/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp index b12aa9a..dc5df17 100644 --- a/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp +++ b/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp @@ -7,20 +7,12 @@ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) -#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - i += blockDim.x * gridDim.x) \ - for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \ - j += blockDim.y * gridDim.y) - -#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \ - for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \ - for (size_t j = blockIdx.y; j < (m); j += gridDim.y) - #define THREADS_PER_BLOCK 512 -inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) { - int optimal_block_num = (N + num_threads - 1) / num_threads; +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +inline int 
GET_BLOCKS(const int N) { + int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; int max_block_num = 4096; return min(optimal_block_num, max_block_num); } diff --git a/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh deleted file mode 100644 index 2af96f7..0000000 --- a/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh +++ /dev/null @@ -1,831 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved -#ifndef CONVEX_IOU_CUDA_KERNEL_CUH -#define CONVEX_IOU_CUDA_KERNEL_CUH - -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif - -#define MAXN 100 -#define NMAX 512 -__device__ const double EPS = 1E-8; - -__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); } - -struct Point { - double x, y; - __device__ Point() {} - __device__ Point(double x, double y) : x(x), y(y) {} -}; - -__device__ inline bool point_same(Point& a, Point& b) { - return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0; -} - -__device__ inline void swap1(Point* a, Point* b) { - Point temp; - temp.x = a->x; - temp.y = a->y; - - a->x = b->x; - a->y = b->y; - - b->x = temp.x; - b->y = temp.y; -} - -__device__ inline void reverse1(Point* a, const int n) { - for (int i = 0; i < (n - 1) / 2.0; i++) { - Point* j = &(a[i]); - Point* k = &(a[n - 1 - i]); - swap1(j, k); - } -} - -__device__ inline double cross(Point o, Point a, Point b) { - return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); -} - -__device__ inline double dis(Point a, Point b) { - return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); -} -__device__ inline double area(Point* ps, int n) { - ps[n] = ps[0]; - double res = 0; - for (int i = 0; i < n; i++) { - res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; - } - return res / 2.0; -} -__device__ inline double polygon_area_grad(Point* ps, int n, - int* polygon_to_pred_index, - int n_pred, double* grad_C) { - ps[n] = ps[0]; - 
double partion_grad[4 * 30 + 2]; - double res = 0; - for (int i = 0; i < n; i++) { - res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; - partion_grad[i * 4 + 2] = ps[i + 1].y; - partion_grad[i * 4 + 3] = -ps[i + 1].x; - if (i != n - 1) { - partion_grad[i * 4 + 4] = -ps[i].y; - partion_grad[i * 4 + 5] = ps[i].x; - } else { - partion_grad[0] = -ps[i].y; - partion_grad[1] = ps[i].x; - } - } - for (int i = 0; i < n; i++) { - for (int j = 0; j < n_pred; j++) { - if (i == polygon_to_pred_index[j]) { - grad_C[2 * polygon_to_pred_index[j + n_pred]] = - (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2; - break; - } - } - for (int j = 0; j < n_pred; j++) { - if (i == polygon_to_pred_index[j]) { - grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] = - (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2; - break; - } - } - } - - return res / 2.0; -} - -__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p, - double* cut_grad, int m, int n, int i) { - double s1, s2; - double s2_s1_2; - double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd; - double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd; - s1 = cross(a, b, c); - s2 = cross(a, b, d); - - ds1_dxc = -(b.y - a.y); - ds1_dyc = b.x - a.x; - ds2_dxd = ds1_dxc; - ds2_dyd = ds1_dyc; - s2_s1_2 = (s2 - s1) * (s2 - s1); - - if (sig(s1) == 0 && sig(s2) == 0) return 2; - if (sig(s2 - s1) == 0) return 0; - - dxp_dxc = - ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) / - (s2_s1_2); - dxp_dyc = - ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) / - (s2_s1_2); - dxp_dxd = - ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) / - (s2_s1_2); - dxp_dyd = - ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) / - (s2_s1_2); - - dyp_dxc = - ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) / - (s2_s1_2); - dyp_dyc = - ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) / - 
(s2_s1_2); - dyp_dxd = - ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) / - (s2_s1_2); - dyp_dyd = - ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) / - (s2_s1_2); - - p.x = (c.x * s2 - d.x * s1) / (s2 - s1); - p.y = (c.y * s2 - d.y * s1) / (s2 - s1); - if (i == n - 1) { - cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; - cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; - cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; - cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; - cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd; - cut_grad[4 * n * m + 1] = dyp_dxd; - cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd; - cut_grad[4 * n * m + 3] = dyp_dyd; - } else { - cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; - cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; - cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; - cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; - cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd; - cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd; - cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd; - cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd; - } - - return 1; -} -__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b, - double* cut_grad) { - Point pp[MAXN]; - double ccur_grad[MAXN] = {}; - int m = 0; - p[n] = p[0]; - int k = n; - for (int i = 0; i < n; i++) { - if (sig(cross(a, b, p[i])) > 0) { - pp[m] = p[i]; - ccur_grad[4 * n * m + 4 * i] = 1.0; - ccur_grad[4 * n * m + 4 * i + 3] = 1.0; - m++; - } - if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { - lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i); - m++; - } - } - - n = 0; - for (int i = 0; i < m; i++) { - if (!i || !(point_same(pp[i], pp[i - 1]))) { - p[n] = pp[i]; - for (int j = 0; j < 4 * k; j++) { - cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j]; - } - n++; - } - } - - while (n > 1 && point_same(p[n - 1], p[0])) n--; -} - -__device__ inline double intersectArea(Point a, Point b, Point c, 
Point d, - double* grad_AB, int order, - int convex_n) { - Point o(0, 0); - int res_flag = 0; - int s1 = sig(cross(o, a, b)); - int s2 = sig(cross(o, c, d)); - if (s1 == 0 || s2 == 0) return 0.0; - if (s1 == -1) { - Point* i = &a; - Point* j = &b; - swap1(i, j); - res_flag = 1; - } - if (s2 == -1) { - Point* i = &c; - Point* j = &d; - swap1(i, j); - } - Point p[10] = {o, a, b}; - int n = 3, n0 = 3, n1, n2, n3; - double cut_grad1[MAXN] = {}; - double cut_grad2[MAXN] = {}; - double cut_grad3[MAXN] = {}; - double p1_p_grad[10][10] = {}; - double p2_p1_grad[10][10] = {}; - double p3_p2_grad[10][10] = {}; - - double p3_p1_grad[10][10] = {}; - double p3_p_grad[10][10] = {}; - - // 1 - polygon_cut(p, n, o, c, cut_grad1); - n1 = n; - for (int i = 0; i < n; i++) { - for (int j = 0; j < 4 * n0; j++) { - if (!(j % 2)) { - p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j]; - } else { - p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j]; - } - } - } - - // 2 - polygon_cut(p, n, c, d, cut_grad2); - n2 = n; - for (int i = 0; i < n; i++) { - for (int j = 0; j < 4 * n1; j++) { - if (!(j % 2)) { - p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j]; - } else { - p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j]; - } - } - } - // 3 - polygon_cut(p, n, d, o, cut_grad3); - n3 = n; - for (int i = 0; i < n; i++) { - for (int j = 0; j < 4 * n2; j++) { - if (!(j % 2)) { - p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j]; - } else { - p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j]; - } - } - } - - // mul - // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1) - for (int i = 0; i < 2 * n3; i++) { - for (int j = 0; j < 2 * n1; j++) { - double sum = 0.0; - for (int m = 0; m < 2 * n2; m++) { - sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j]; - } - p3_p1_grad[i][j] = sum; - } - } - - // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0) - for (int i = 0; i < 2 * n3; i++) { - for (int j = 0; j < 2 * n0; j++) { - double sum = 0.0; - for (int m = 0; m < 2 * n1; m++) { - 
sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j]; - } - p3_p_grad[i][j] = sum; - } - } - - // calculate S_grad - int polygon_index_box_index[20]; - double grad_polygon[20]; - double S_grad[6]; - - for (int i = 0; i < n3; i++) { - polygon_index_box_index[i] = i; - polygon_index_box_index[i + n3] = i; - } - - double res = - polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon); - - if (s1 * s2 == -1) { - for (int j = 0; j < 2 * 3; j++) { - double sum = 0.0; - for (int m = 0; m < 2 * n3; m++) { - sum = sum - grad_polygon[m] * p3_p_grad[m][j]; - } - S_grad[j] = sum; - } - - if (order != convex_n - 1) { - if (res_flag) { - grad_AB[2 * order] += S_grad[4]; - grad_AB[2 * order + 1] += S_grad[5]; - grad_AB[2 * order + 2] += S_grad[2]; - grad_AB[2 * order + 3] += S_grad[3]; - - } else { - grad_AB[2 * order] += S_grad[2]; - grad_AB[2 * order + 1] += S_grad[3]; - grad_AB[2 * order + 2] += S_grad[4]; - grad_AB[2 * order + 3] += S_grad[5]; - } - } else { - if (res_flag) { - grad_AB[2 * order] += S_grad[4]; - grad_AB[2 * order + 1] += S_grad[5]; - grad_AB[0] += S_grad[2]; - grad_AB[1] += S_grad[3]; - - } else { - grad_AB[2 * order] += S_grad[2]; - grad_AB[2 * order + 1] += S_grad[3]; - grad_AB[0] += S_grad[4]; - grad_AB[1] += S_grad[5]; - } - } - res = -res; - } else { - for (int j = 0; j < 2 * 3; j++) { - double sum = 0.0; - for (int m = 0; m < 2 * n3; m++) { - sum = sum + grad_polygon[m] * p3_p_grad[m][j]; - } - S_grad[j] = sum; - } - - if (order != convex_n - 1) { - if (res_flag) { - grad_AB[2 * order] += S_grad[4]; - grad_AB[2 * order + 1] += S_grad[5]; - grad_AB[2 * order + 2] += S_grad[2]; - grad_AB[2 * order + 3] += S_grad[3]; - } else { - grad_AB[2 * order] += S_grad[2]; - grad_AB[2 * order + 1] += S_grad[3]; - grad_AB[2 * order + 2] += S_grad[4]; - grad_AB[2 * order + 3] += S_grad[5]; - } - } else { - if (res_flag) { - grad_AB[2 * order] += S_grad[4]; - grad_AB[2 * order + 1] += S_grad[5]; - grad_AB[0] += S_grad[2]; - grad_AB[1] += S_grad[3]; - } else { - 
grad_AB[2 * order] += S_grad[2]; - grad_AB[2 * order + 1] += S_grad[3]; - grad_AB[0] += S_grad[4]; - grad_AB[1] += S_grad[5]; - } - } - } - return res; -} - -__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2, - double* grad_AB) { - if (area(ps1, n1) < 0) reverse1(ps1, n1); - if (area(ps2, n2) < 0) reverse1(ps2, n2); - ps1[n1] = ps1[0]; - ps2[n2] = ps2[0]; - double res = 0; - for (int i = 0; i < n1; i++) { - for (int j = 0; j < n2; j++) { - res += - intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1); - } - } - return res; -} - -__device__ inline void Jarvis(Point* in_poly, int& n_poly) { - Point p_max, p_k; - int max_index, k_index; - int Stack[NMAX] = {}, top1, top2; - double sign; - Point right_point[10], left_point[10]; - - for (int i = 0; i < n_poly; i++) { - if (in_poly[i].y < in_poly[0].y || - in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { - Point* j = &(in_poly[0]); - Point* k = &(in_poly[i]); - swap1(j, k); - } - if (i == 0) { - p_max = in_poly[0]; - max_index = 0; - } - if (in_poly[i].y > p_max.y || - in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { - p_max = in_poly[i]; - max_index = i; - } - } - - if (max_index == 0) { - max_index = 1; - p_max = in_poly[max_index]; - } - - k_index = 0, Stack[0] = 0, top1 = 0; - while (k_index != max_index) { - p_k = p_max; - k_index = max_index; - for (int i = 1; i < n_poly; i++) { - sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); - if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > - dis(in_poly[Stack[top1]], p_k)))) { - p_k = in_poly[i]; - k_index = i; - } - } - top1++; - Stack[top1] = k_index; - } - for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]]; - - k_index = 0, Stack[0] = 0, top2 = 0; - - while (k_index != max_index) { - p_k = p_max; - k_index = max_index; - for (int i = 1; i < n_poly; i++) { - sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); - if ((sign < 0) || (sign == 0) && 
(dis(in_poly[Stack[top2]], in_poly[i]) > - dis(in_poly[Stack[top2]], p_k))) { - p_k = in_poly[i]; - k_index = i; - } - } - top2++; - Stack[top2] = k_index; - } - for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]]; - - for (int i = 0; i < top1 + top2; i++) { - if (i <= top1) { - in_poly[i] = right_point[i]; - } else { - in_poly[i] = left_point[top2 - (i - top1)]; - } - } - n_poly = top1 + top2; -} - -__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2, - int n2, double* grad_C) { - Point polygon[MAXN]; - int n = n1 + n2, n_poly = 0; - for (int i = 0; i < n1; i++) { - for (int j = 0; j < n - n1; j++) { - if (point_same(ps1[i], ps2[j])) { - for (int k = j; k < n - n1 - 1; k++) { - ps2[k] = ps2[k + 1]; - } - n2--; - break; - } - } - } - n_poly = n1 + n2; - for (int i = 0; i < n_poly; i++) { - if (i < n1) { - polygon[i] = ps1[i]; - } else { - polygon[i] = ps2[i - n1]; - } - } - - Jarvis(polygon, n_poly); - - int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1}; - int n_pred = 0; - for (int i = 0; i < n_poly; i++) { - for (int j = 0; j < n1; j++) { - if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) { - polygon_to_pred_index[n_pred] = i; - polygon_to_pred_index[n_pred + n1] = j; - n_pred += 1; - break; - } - } - } - if (n_pred == 0) { - double polygon_area = fabs(area(polygon, n_poly)); - for (int i = 0; i < 18; i++) { - grad_C[i] = 0.0; - } - return polygon_area; - } else { - double polygon_area = - polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C); - if (polygon_area < 0) { - for (int i = 0; i < 18; i++) { - grad_C[i] = -grad_C[i]; - } - } - return fabs(polygon_area); - } -} - -// convex_find and get the polygon_index_box_index -__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly, - int* points_to_convex_ind) { - int n_input = n_poly; - Point input_poly[20]; - for (int i = 0; i < n_input; i++) { - input_poly[i].x = in_poly[i].x; - 
input_poly[i].y = in_poly[i].y; - } - Point p_max, p_k; - int max_index, k_index; - int Stack[20], top1, top2; - double sign; - Point right_point[10], left_point[10]; - - for (int i = 0; i < n_poly; i++) { - if (in_poly[i].y < in_poly[0].y || - in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { - Point* j = &(in_poly[0]); - Point* k = &(in_poly[i]); - swap1(j, k); - } - if (i == 0) { - p_max = in_poly[0]; - max_index = 0; - } - if (in_poly[i].y > p_max.y || - in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { - p_max = in_poly[i]; - max_index = i; - } - } - if (max_index == 0) { - max_index = 1; - p_max = in_poly[max_index]; - } - - k_index = 0, Stack[0] = 0, top1 = 0; - while (k_index != max_index) { - p_k = p_max; - k_index = max_index; - for (int i = 1; i < n_poly; i++) { - sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); - if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > - dis(in_poly[Stack[top1]], p_k)))) { - p_k = in_poly[i]; - k_index = i; - } - } - top1++; - Stack[top1] = k_index; - } - for (int i = 0; i <= top1; i++) { - right_point[i] = in_poly[Stack[i]]; - } - - k_index = 0, Stack[0] = 0, top2 = 0; - - while (k_index != max_index) { - p_k = p_max; - k_index = max_index; - for (int i = 1; i < n_poly; i++) { - sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); - if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > - dis(in_poly[Stack[top2]], p_k))) { - p_k = in_poly[i]; - k_index = i; - } - } - top2++; - Stack[top2] = k_index; - } - - for (int i = top2 - 1; i >= 0; i--) { - left_point[i] = in_poly[Stack[i]]; - } - - for (int i = 0; i < top1 + top2; i++) { - if (i <= top1) { - in_poly[i] = right_point[i]; - } else { - in_poly[i] = left_point[top2 - (i - top1)]; - } - } - n_poly = top1 + top2; - for (int i = 0; i < n_poly; i++) { - for (int j = 0; j < n_input; j++) { - if (point_same(in_poly[i], input_poly[j])) { - points_to_convex_ind[i] = j; - break; - } - } - } -} - -template -__device__ 
inline float devrIoU(T const* const p, T const* const q, - T* point_grad, const int idx) { - Point ps1[MAXN], ps2[MAXN]; - - Point convex[MAXN]; - for (int i = 0; i < 9; i++) { - convex[i].x = (double)p[i * 2]; - convex[i].y = (double)p[i * 2 + 1]; - } - int n_convex = 9; - int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; - Jarvis_and_index(convex, n_convex, points_to_convex_ind); - - int n1 = n_convex; - int n2 = 4; - - for (int i = 0; i < n1; i++) { - ps1[i].x = (double)convex[i].x; - ps1[i].y = (double)convex[i].y; - } - - for (int i = 0; i < n2; i++) { - ps2[i].x = (double)q[i * 2]; - ps2[i].y = (double)q[i * 2 + 1]; - } - - int polygon_index_box_index[18]; - for (int i = 0; i < n1; i++) { - polygon_index_box_index[i] = i; - polygon_index_box_index[i + n1] = i; - } - - double grad_A[18] = {}; - double grad_AB[18] = {}; - double grad_C[18] = {}; - - double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB); - double S_pred = - polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A); - if (S_pred < 0) { - for (int i = 0; i < n_convex * 2; i++) { - grad_A[i] = -grad_A[i]; - } - } - double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area; - - double iou = inter_area / union_area; - double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C); - - // printf("%d:live\n", idx); - double rot_giou = iou - (polygon_area - union_area) / polygon_area; - - float grad_point_temp[18] = {}; - - for (int i = 0; i < n_convex; i++) { - int grad_point = points_to_convex_ind[i]; - grad_point_temp[2 * grad_point] = - (float)((union_area + inter_area) / (union_area * union_area) * - grad_AB[2 * i] - - iou / union_area * grad_A[2 * i] - - 1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) - - (union_area) / polygon_area / polygon_area * grad_C[2 * i]); - grad_point_temp[2 * grad_point + 1] = - (float)((union_area + inter_area) / (union_area * union_area) * - grad_AB[2 * i + 1] - - iou / union_area * grad_A[2 * i + 1] - - 1 / 
polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) - - (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]); - } - - for (int i = 0; i < 9; i++) { - point_grad[2 * i] = grad_point_temp[2 * i]; - point_grad[2 * i + 1] = grad_point_temp[2 * i + 1]; - } - return (float)rot_giou; -} - -template -__global__ void convex_giou_cuda_kernel(const int ex_n_boxes, - const int gt_n_boxes, const T* ex_boxes, - const T* gt_boxes, T* point_grad) { - CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { - const T* cur_box = ex_boxes + index * 18; - const T* cur_gt_box = gt_boxes + index * 8; - T* cur_grad = point_grad + index * 19; - T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x); - cur_grad[18] = giou; - } -} - -__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) { - double s1, s2; - s1 = cross(a, b, c); - s2 = cross(a, b, d); - if (sig(s1) == 0 && sig(s2) == 0) return 2; - if (sig(s2 - s1) == 0) return 0; - p.x = (c.x * s2 - d.x * s1) / (s2 - s1); - p.y = (c.y * s2 - d.y * s1) / (s2 - s1); - return 1; -} - -__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) { - Point pp[MAXN]; - int m = 0; - p[n] = p[0]; - for (int i = 0; i < n; i++) { - if (sig(cross(a, b, p[i])) > 0) { - pp[m] = p[i]; - m++; - } - if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { - lineCross(a, b, p[i], p[i + 1], pp[m]); - m++; - } - } - n = 0; - for (int i = 0; i < m; i++) { - if (!i || !(point_same(pp[i], pp[i - 1]))) { - p[n] = pp[i]; - n++; - } - } - - while (n > 1 && point_same(p[n - 1], p[0])) n--; -} - -__device__ inline double intersectArea(Point a, Point b, Point c, Point d) { - Point o(0, 0); - int s1 = sig(cross(o, a, b)); - int s2 = sig(cross(o, c, d)); - if (s1 == 0 || s2 == 0) return 0.0; - if (s1 == -1) { - Point* i = &a; - Point* j = &b; - swap1(i, j); - } - if (s2 == -1) { - Point* i = &c; - Point* j = &d; - swap1(i, j); - } - Point p[10] = {o, a, b}; - int n = 3; - - polygon_cut(p, n, o, c); - polygon_cut(p, n, c, d); - 
polygon_cut(p, n, d, o); - double res = area(p, n); - if (s1 * s2 == -1) res = -res; - return res; -} -__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, - int n2) { - if (area(ps1, n1) < 0) reverse1(ps1, n1); - if (area(ps2, n2) < 0) reverse1(ps2, n2); - ps1[n1] = ps1[0]; - ps2[n2] = ps2[0]; - double res = 0; - for (int i = 0; i < n1; i++) { - for (int j = 0; j < n2; j++) { - res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]); - } - } - return res; -} - -template -__device__ inline float devrIoU(T const* const p, T const* const q) { - Point ps1[MAXN], ps2[MAXN]; - Point convex[MAXN]; - for (int i = 0; i < 9; i++) { - convex[i].x = (double)p[i * 2]; - convex[i].y = (double)p[i * 2 + 1]; - } - int n_convex = 9; - int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; - Jarvis_and_index(convex, n_convex, points_to_convex_ind); - int n1 = n_convex; - for (int i = 0; i < n1; i++) { - ps1[i].x = (double)convex[i].x; - ps1[i].y = (double)convex[i].y; - } - int n2 = 4; - for (int i = 0; i < n2; i++) { - ps2[i].x = (double)q[i * 2]; - ps2[i].y = (double)q[i * 2 + 1]; - } - double inter_area = intersectAreaO(ps1, n1, ps2, n2); - double S_pred = area(ps1, n1); - double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area; - double iou = inter_area / union_area; - return (float)iou; -} - -template -__global__ void convex_iou_cuda_kernel(const int ex_n_boxes, - const int gt_n_boxes, const T* ex_boxes, - const T* gt_boxes, T* iou) { - CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { - const T* cur_box = ex_boxes + index * 18; - for (int i = 0; i < gt_n_boxes; i++) { - iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8); - } - } -} -#endif // CONVEX_IOU_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh b/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh index f910561..75ea4ad 100644 --- a/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh +++ b/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh @@ -29,25 
+29,21 @@ using namespace torch; #define TensorAcc5R PackedTensorAccessor32 #define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W) -#define WARP_SIZE 32 -#define FULL_MASK 0xffffffff +#define THREADS_FORWARD 32 +#define THREADS_BACKWARD 16 template __global__ void correlation_forward_cuda_kernel( const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output, int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH, - int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW, - int oH, int oW) { + int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW) { const int iH = rInput1.size(1); const int iW = rInput1.size(2); const int C = rInput1.size(3); const int n = blockIdx.x; - const int h = blockIdx.y * blockDim.y + threadIdx.y; - const int w = blockIdx.z * blockDim.z + threadIdx.z; - - if (h >= oH || w >= oW) return; - + const int h = blockIdx.y; + const int w = blockIdx.z; const int thread = threadIdx.x; const int start_i = -padH + h * dH; @@ -56,37 +52,40 @@ __global__ void correlation_forward_cuda_kernel( const int patchRadH = dilation_patchH * (patchH - 1) / 2; const int patchRadW = dilation_patchW * (patchW - 1) / 2; + __shared__ scalar_t prod_sum[THREADS_FORWARD]; + for (int ph = 0; ph < patchH; ++ph) { int ph_dilated = ph * dilation_patchH - patchRadH; for (int pw = 0; pw < patchW; ++pw) { int pw_dilated = pw * dilation_patchW - patchRadW; - scalar_t prod_sum = 0.0f; + prod_sum[thread] = 0; for (int i = 0; i < kH; ++i) { int i1 = start_i + i * dilationH; int i2 = i1 + ph_dilated; - if (WITHIN_BOUNDS(i1, i2, iH, iH)) { - for (int j = 0; j < kW; ++j) { - int j1 = start_j + j * dilationW; - int j2 = j1 + pw_dilated; - if (WITHIN_BOUNDS(j1, j2, iW, iW)) { - for (int c = thread; c < C; c += WARP_SIZE) { - scalar_t v1 = rInput1[n][i1][j1][c]; - scalar_t v2 = rInput2[n][i2][j2][c]; - prod_sum += v1 * v2; - } + if + WITHIN_BOUNDS(i1, i2, iH, iH) { + for (int j = 0; j < kW; ++j) { + int j1 = 
start_j + j * dilationW; + int j2 = j1 + pw_dilated; + if + WITHIN_BOUNDS(j1, j2, iW, iW) { + for (int c = thread; c < C; c += THREADS_FORWARD) { + scalar_t v1 = rInput1[n][i1][j1][c]; + scalar_t v2 = rInput2[n][i2][j2][c]; + prod_sum[thread] += v1 * v2; + } + } } } - } } // accumulate - for (int offset = 16; offset > 0; offset /= 2) -#ifdef MMCV_WITH_HIP - prod_sum += __shfl_down(float(prod_sum), offset); -#else - prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset); -#endif + __syncthreads(); if (thread == 0) { - output[n][ph][pw][h][w] = prod_sum; + scalar_t reduce_sum = 0; + for (int index = 0; index < THREADS_FORWARD; ++index) { + reduce_sum += prod_sum[index]; + } + output[n][ph][pw][h][w] = reduce_sum; } } } @@ -98,10 +97,9 @@ __global__ void correlation_backward_cuda_kernel_input1( TensorAcc4R grad_input1, const int kH, const int kW, const int patchH, const int patchW, const int padH, const int padW, const int dilationH, const int dilationW, const int dilation_patchH, const int dilation_patchW, - const int dH, const int dW) { - const int iH = input2.size(1); - const int iW = input2.size(2); - const int C = input2.size(3); + const int dH, const int dW, const int batch) { + const int iH = input2.size(2); + const int iW = input2.size(3); const int H = grad_output.size(3); const int W = grad_output.size(4); @@ -109,53 +107,54 @@ __global__ void correlation_backward_cuda_kernel_input1( const int patchRadH = (patchH - 1) / 2; const int patchRadW = (patchW - 1) / 2; - const int n = blockIdx.x; + const int n = batch; + const int c = blockIdx.x; const int h = blockIdx.y; const int w = blockIdx.z; + const int ph_off = threadIdx.x; + const int pw_off = threadIdx.y; const int h_2 = h + padH; const int w_2 = w + padW; const int min_h = h_2 - kH * dilationH; const int min_w = w_2 - kW * dilationW; - extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[]; - scalar_t *grad_cache = reinterpret_cast(grad_cache_char); - for (int i = threadIdx.x; i 
< patchH * patchW; i += blockDim.x) { - const int ph = i / patchW; - const int pw = i % patchW; + __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD]; + prod_sum[ph_off][pw_off] = 0; + + for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) { int i1 = h + dilation_patchH * (ph - patchRadH); - int j1 = w + dilation_patchW * (pw - patchRadW); - - if (WITHIN_BOUNDS(i1, j1, iH, iW)) { - scalar_t grad_val = 0.0f; - for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { - int i2 = (h_3) / dH; - if (i2 * dH != h_3) continue; - for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { - int j2 = (w_3) / dW; - if (j2 * dW != w_3) continue; - if (WITHIN_BOUNDS(i2, j2, H, W)) { - grad_val += grad_output[n][ph][pw][i2][j2]; + for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) { + int j1 = w + dilation_patchW * (pw - patchRadW); + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + scalar_t val = input2[n][c][i1][j1]; + for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { + int i2 = (h_3) / dH; + if (i2 * dH != h_3) continue; + for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { + int j2 = (w_3) / dW; + if (j2 * dW != w_3) continue; + if + WITHIN_BOUNDS(i2, j2, H, W) { + prod_sum[ph_off][pw_off] += + grad_output[n][ph][pw][i2][j2] * val; + } } } } - grad_cache[i] = grad_val; } } + __syncthreads(); - for (int c = threadIdx.x; c < C; c += blockDim.x) { - scalar_t grad_input_val = 0.0f; - for (int ph = 0; ph < patchH; ++ph) { - int i1 = h + dilation_patchH * (ph - patchRadH); - for (int pw = 0; pw < patchW; ++pw) { - int j1 = w + dilation_patchW * (pw - patchRadW); - if (WITHIN_BOUNDS(i1, j1, iH, iW)) { - grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw]; - } + if (ph_off == 0 && pw_off == 0) { + scalar_t reduce_sum = 0; + for (int ph = 0; ph < THREADS_BACKWARD; ++ph) { + for (int pw = 0; pw < THREADS_BACKWARD; ++pw) { + reduce_sum += prod_sum[ph][pw]; } } - grad_input1[n][c][h][w] = grad_input_val; + grad_input1[n][c][h][w] = reduce_sum; } } @@ -164,10 
+163,9 @@ __global__ void correlation_backward_cuda_kernel_input2( const TensorAcc5R grad_output, const TensorAcc4R input1, TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH, int dilationW, int dilation_patchH, - int dilation_patchW, int dH, int dW) { - const int iH = input1.size(1); - const int iW = input1.size(2); - const int C = input1.size(3); + int dilation_patchW, int dH, int dW, int batch) { + const int iH = input1.size(2); + const int iW = input1.size(3); const int patchRadH = (patchH - 1) / 2; const int patchRadW = (patchW - 1) / 2; @@ -178,54 +176,56 @@ __global__ void correlation_backward_cuda_kernel_input2( const int dilatedKH = kH * dilationH; const int dilatedKW = kW * dilationW; - const int n = blockIdx.x; + const int n = batch; + const int c = blockIdx.x; const int h = blockIdx.y; const int w = blockIdx.z; + const int ph_off = threadIdx.x; + const int pw_off = threadIdx.y; - extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[]; - scalar_t *grad_cache = reinterpret_cast(grad_cache_char); - for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) { - const int ph = i / patchW; - const int pw = i % patchW; + __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD]; + prod_sum[ph_off][pw_off] = 0; + + for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) { int i1 = h - dilation_patchH * (ph - patchRadH); - int j1 = w - dilation_patchW * (pw - patchRadW); - - if (WITHIN_BOUNDS(i1, j1, iH, iW)) { - scalar_t grad_val = 0.0f; - - const int h_2 = i1 + padH; - const int w_2 = j1 + padW; - const int min_h = h_2 - dilatedKH; - const int min_w = w_2 - dilatedKW; - - for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { - int i2 = (h_3) / dH; - if (i2 * dH != h_3) continue; - for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { - int j2 = (w_3) / dW; - if (j2 * dW != w_3) continue; - if (WITHIN_BOUNDS(i2, j2, H, W)) { - grad_val += grad_output[n][ph][pw][i2][j2]; + for (int pw = 
pw_off; pw < patchW; pw += THREADS_BACKWARD) { + int j1 = w - dilation_patchW * (pw - patchRadW); + if + WITHIN_BOUNDS(i1, j1, iH, iW) { + scalar_t val = input1[n][c][i1][j1]; + + const int h_2 = i1 + padH; + const int w_2 = j1 + padW; + const int min_h = h_2 - dilatedKH; + const int min_w = w_2 - dilatedKW; + + for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { + int i2 = (h_3) / dH; + if (i2 * dH != h_3) continue; + for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { + int j2 = (w_3) / dW; + if (j2 * dW != w_3) continue; + if + WITHIN_BOUNDS(i2, j2, H, W) { + prod_sum[ph_off][pw_off] += + grad_output[n][ph][pw][i2][j2] * val; + } + } } } - } - grad_cache[i] = grad_val; } } + __syncthreads(); - for (int c = threadIdx.x; c < C; c += blockDim.x) { - scalar_t grad_input_val = 0.0f; - for (int ph = 0; ph < patchH; ++ph) { - int i1 = h - dilation_patchH * (ph - patchRadH); - for (int pw = 0; pw < patchW; ++pw) { - int j1 = w - dilation_patchW * (pw - patchRadW); - if (WITHIN_BOUNDS(i1, j1, iH, iW)) { - grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw]; - } + if (ph_off == 0 && pw_off == 0) { + scalar_t reduce_sum = 0; + for (int ph = 0; ph < THREADS_BACKWARD; ++ph) { + for (int pw = 0; pw < THREADS_BACKWARD; ++pw) { + reduce_sum += prod_sum[ph][pw]; } } - grad_input2[n][c][h][w] = grad_input_val; + grad_input2[n][c][h][w] = reduce_sum; } } #endif diff --git a/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh deleted file mode 100644 index 053977a..0000000 --- a/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) OpenMMLab. 
All rights reserved -// Adapted from -// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif - -#define MAX_NUM_VERT_IDX 9 -#define INTERSECTION_OFFSET 8 -#define EPSILON 1e-8 - -inline int opt_n_thread(int work_size) { - const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); - return max(min(1 << pow_2, THREADS_PER_BLOCK), 1); -} - -/* -compare normalized vertices (vertices around (0,0)) -if vertex1 < vertex2 return true. -order: minimum at x-aixs, become larger in anti-clockwise direction -*/ -__device__ bool compare_vertices(float x1, float y1, float x2, float y2) { - if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON) - return false; // if equal, return false - - if (y1 > 0 && y2 < 0) return true; - if (y1 < 0 && y2 > 0) return false; - - float n1 = x1 * x1 + y1 * y1 + EPSILON; - float n2 = x2 * x2 + y2 * y2 + EPSILON; - float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2; - - if (y1 > 0 && y2 > 0) { - if (diff > EPSILON) - return true; - else - return false; - } - if (y1 < 0 && y2 < 0) { - if (diff < EPSILON) - return true; - else - return false; - } - return false; -} - -__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel( - int b, int n, int m, const float *__restrict__ vertices, - const bool *__restrict__ mask, const int *__restrict__ num_valid, - int *__restrict__ idx) { - int batch_idx = blockIdx.x; - vertices += batch_idx * n * m * 2; - mask += batch_idx * n * m; - num_valid += batch_idx * n; - idx += batch_idx * n * MAX_NUM_VERT_IDX; - - int index = threadIdx.x; // index of polygon - int stride = blockDim.x; - for (int i = index; i < n; i += stride) { - int pad; // index of arbitrary invalid intersection point (not box corner!) 
- for (int j = INTERSECTION_OFFSET; j < m; ++j) { - if (!mask[i * m + j]) { - pad = j; - break; - } - } - if (num_valid[i] < 3) { - // not enough vertices, take an invalid intersection point - // (zero padding) - for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) { - idx[i * MAX_NUM_VERT_IDX + j] = pad; - } - } else { - // sort the valid vertices - // note the number of valid vertices is known - // note: check that num_valid[i] < MAX_NUM_VERT_IDX - for (int j = 0; j < num_valid[i]; ++j) { - // initialize with a "big" value - float x_min = 1; - float y_min = -EPSILON; - int i_take = 0; - int i2; - float x2, y2; - if (j != 0) { - i2 = idx[i * MAX_NUM_VERT_IDX + j - 1]; - x2 = vertices[i * m * 2 + i2 * 2 + 0]; - y2 = vertices[i * m * 2 + i2 * 2 + 1]; - } - for (int k = 0; k < m; ++k) { - float x = vertices[i * m * 2 + k * 2 + 0]; - float y = vertices[i * m * 2 + k * 2 + 1]; - if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) { - if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) { - x_min = x; - y_min = y; - i_take = k; - } - } - } - idx[i * MAX_NUM_VERT_IDX + j] = i_take; - } - // duplicate the first idx - idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0]; - - // pad zeros - for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) { - idx[i * MAX_NUM_VERT_IDX + j] = pad; - } - - // for corner case: the two boxes are exactly the same. 
- // in this case, idx would have duplicate elements, which makes the - // shoelace formula broken because of the definition, the duplicate - // elements only appear in the first 8 positions (they are "corners in - // box", not "intersection of edges") - if (num_valid[i] == 8) { - int counter = 0; - for (int j = 0; j < 4; ++j) { - int check = idx[i * MAX_NUM_VERT_IDX + j]; - for (int k = 4; k < INTERSECTION_OFFSET; ++k) { - if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++; - } - } - if (counter == 4) { - idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0]; - for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) { - idx[i * MAX_NUM_VERT_IDX + j] = pad; - } - } - } - - // TODO: still might need to cover some other corner cases :( - } - } -} diff --git a/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh index 6d93243..c8fc615 100644 --- a/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh @@ -22,14 +22,13 @@ __global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m, int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(pt_idx, m) { - if (bs_idx >= b || c_idx >= c) return; - - out += bs_idx * c * m + c_idx * m + pt_idx; - idx += bs_idx * m + pt_idx; - points += bs_idx * c * n + c_idx * n; - out[0] = points[idx[0]]; - } + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; + + out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + points += bs_idx * c * n + c_idx * n; + out[0] = points[idx[0]]; } template @@ -44,15 +43,14 @@ __global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m, int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(pt_idx, m) { - if (bs_idx >= b || c_idx >= c) return; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || c_idx >= c || pt_idx >= m) 
return; - grad_out += bs_idx * c * m + c_idx * m + pt_idx; - idx += bs_idx * m + pt_idx; - grad_points += bs_idx * c * n + c_idx * n; + grad_out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + grad_points += bs_idx * c * n + c_idx * n; - atomicAdd(grad_points + idx[0], grad_out[0]); - } + atomicAdd(grad_points + idx[0], grad_out[0]); } #endif // GATHER_POINTS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh index dfad66f..9cfc2dc 100644 --- a/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh @@ -22,19 +22,18 @@ __global__ void group_points_forward_cuda_kernel(int b, int c, int n, // out: (B, C, npoints, nsample) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(index, npoints * nsample) { - if (bs_idx >= b || c_idx >= c) return; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int pt_idx = index / nsample; + if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; - int pt_idx = index / nsample; - int sample_idx = index % nsample; + int sample_idx = index % nsample; - idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; - int in_idx = bs_idx * c * n + c_idx * n + idx[0]; - int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + - pt_idx * nsample + sample_idx; + idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + int in_idx = bs_idx * c * n + c_idx * n + idx[0]; + int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + + pt_idx * nsample + sample_idx; - out[out_idx] = points[in_idx]; - } + out[out_idx] = points[in_idx]; } template @@ -49,17 +48,16 @@ __global__ void group_points_backward_cuda_kernel(int b, int c, int n, // grad_points: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(index, npoints * nsample) { - int pt_idx = index / nsample; - if (bs_idx >= 
b || c_idx >= c) return; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int pt_idx = index / nsample; + if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; - int sample_idx = index % nsample; - grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + - pt_idx * nsample + sample_idx; - idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + int sample_idx = index % nsample; + grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + + pt_idx * nsample + sample_idx; + idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; - atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); - } + atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); } #endif // GROUP_POINTS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh index 9ebdcad..4e261cb 100644 --- a/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh @@ -50,17 +50,21 @@ __device__ int check_rect_cross(const Point &p1, const Point &p2, } __device__ inline int check_in_box2d(const float *box, const Point &p) { - // params: box (7) [x, y, z, dx, dy, dz, heading] - const float MARGIN = 1e-2; - - float center_x = box[0], center_y = box[1]; - // rotate the point in the opposite direction of box - float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]); - float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); - float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; - - return (fabs(rot_x) < box[3] / 2 + MARGIN && - fabs(rot_y) < box[4] / 2 + MARGIN); + // params: box (5) [x1, y1, x2, y2, angle] + const float MARGIN = 1e-5; + + float center_x = (box[0] + box[2]) / 2; + float center_y = (box[1] + box[3]) / 2; + float angle_cos = cos(-box[4]), + angle_sin = + sin(-box[4]); // rotate the point in the opposite direction of box + float rot_x = + (p.x - 
center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x; + float rot_y = + (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y; + + return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN && + rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN); } __device__ inline int intersection(const Point &p1, const Point &p0, @@ -112,19 +116,16 @@ __device__ inline int point_cmp(const Point &a, const Point &b, } __device__ inline float box_overlap(const float *box_a, const float *box_b) { - // params box_a: [x, y, z, dx, dy, dz, heading] - // params box_b: [x, y, z, dx, dy, dz, heading] + // params: box_a (5) [x1, y1, x2, y2, angle] + // params: box_b (5) [x1, y1, x2, y2, angle] - float a_angle = box_a[6], b_angle = box_b[6]; - float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, - a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2; - float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; - float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; - float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; - float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; + float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], + a_angle = box_a[4]; + float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], + b_angle = box_b[4]; - Point center_a(box_a[0], box_a[1]); - Point center_b(box_b[0], box_b[1]); + Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2); + Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2); Point box_a_corners[5]; box_a_corners[0].set(a_x1, a_y1); @@ -208,10 +209,10 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) { } __device__ inline float iou_bev(const float *box_a, const float *box_b) { - // params box_a: [x, y, z, dx, dy, dz, heading] - // params box_b: [x, y, z, dx, dy, dz, heading] - float sa = box_a[3] * box_a[4]; - float sb = box_b[3] * box_b[4]; + // params: box_a (5) [x1, y1, x2, y2, angle] + // params: box_b (5) [x1, y1, 
x2, y2, angle] + float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]); + float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]); float s_overlap = box_overlap(box_a, box_b); return s_overlap / fmaxf(sa + sb - s_overlap, EPS); } @@ -219,148 +220,149 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) { __global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel( const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap) { - // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] - // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] - CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) { - if (a_idx >= num_a || b_idx >= num_b) { - return; - } + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - const float *cur_box_a = boxes_a + a_idx * 7; - const float *cur_box_b = boxes_b + b_idx * 7; - float cur_overlap = box_overlap(cur_box_a, cur_box_b); - ans_overlap[a_idx * num_b + b_idx] = cur_overlap; + if (a_idx >= num_a || b_idx >= num_b) { + return; } + const float *cur_box_a = boxes_a + a_idx * 5; + const float *cur_box_b = boxes_b + b_idx * 5; + float s_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = s_overlap; } -__global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num, - const float nms_overlap_thresh, - const float *boxes, - unsigned long long *mask) { - // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] +__global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a, + const float *boxes_a, + const int num_b, + const float *boxes_b, + float *ans_iou) { + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b) { + return; + } + + const float *cur_box_a = boxes_a + a_idx * 5; + const float *cur_box_b = boxes_b + b_idx * 5; + float cur_iou_bev = 
iou_bev(cur_box_a, cur_box_b); + ans_iou[a_idx * num_b + b_idx] = cur_iou_bev; +} + +__global__ void nms_forward_cuda_kernel(const int boxes_num, + const float nms_overlap_thresh, + const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 5) [x1, y1, x2, y2, ry] // params: mask (N, N/THREADS_PER_BLOCK_NMS) - const int blocks = - (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; - CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { - // if (row_start > col_start) return; - - const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - - __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; - - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 7 + 0] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; - block_boxes[threadIdx.x * 7 + 1] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; - block_boxes[threadIdx.x * 7 + 2] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; - block_boxes[threadIdx.x * 7 + 3] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; - block_boxes[threadIdx.x * 7 + 4] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; - block_boxes[threadIdx.x * 7 + 5] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; - block_boxes[threadIdx.x * 7 + 6] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; - } - __syncthreads(); - if (threadIdx.x < row_size) { - const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; - const float *cur_box = boxes + cur_box_idx * 7; + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - 
col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { - t |= 1ULL << i; - } + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 5; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; } - const int col_blocks = - (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; - mask[cur_box_idx * col_blocks + col_start] = t; } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; } } __device__ inline float iou_normal(float const *const a, float const *const b) { - // params: a: [x, y, z, dx, dy, dz, heading] - // params: b: [x, y, z, dx, dy, dz, heading] - - float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), - right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2); - float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), - bottom = fminf(a[1] + a[4] / 2, 
b[1] + b[4] / 2); + float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); + float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); float interS = width * height; - float Sa = a[3] * a[4]; - float Sb = b[3] * b[4]; + float Sa = (a[2] - a[0]) * (a[3] - a[1]); + float Sb = (b[2] - b[0]) * (b[3] - b[1]); return interS / fmaxf(Sa + Sb - interS, EPS); } -__global__ void iou3d_nms3d_normal_forward_cuda_kernel( - const int boxes_num, const float nms_overlap_thresh, const float *boxes, - unsigned long long *mask) { - // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] +__global__ void nms_normal_forward_cuda_kernel(const int boxes_num, + const float nms_overlap_thresh, + const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 5) [x1, y1, x2, y2, ry] // params: mask (N, N/THREADS_PER_BLOCK_NMS) - const int blocks = - (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; - CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { - // if (row_start > col_start) return; - - const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - - __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; - - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 7 + 0] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; - block_boxes[threadIdx.x * 7 + 1] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; - block_boxes[threadIdx.x * 7 + 2] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; - block_boxes[threadIdx.x * 7 + 3] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; - block_boxes[threadIdx.x * 7 + 4] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; - block_boxes[threadIdx.x * 7 + 5] = - boxes[(THREADS_PER_BLOCK_NMS * 
col_start + threadIdx.x) * 7 + 5]; - block_boxes[threadIdx.x * 7 + 6] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; - } - __syncthreads(); + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); - if (threadIdx.x < row_size) { - const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; - const float *cur_box = boxes + cur_box_idx * 7; + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 5; - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { - t |= 1ULL << i; - } + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; } - const int col_blocks = - (boxes_num + 
THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; - mask[cur_box_idx * col_blocks + col_start] = t; } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; } } diff --git a/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh index 3cf52bb..3181aa6 100644 --- a/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh @@ -51,42 +51,41 @@ __global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample, const T *xyz, const T *new_xyz, int *__restrict__ idx, T *dist2) { int bs_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(pt_idx, m) { - if (bs_idx >= b) return; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || pt_idx >= m) return; - new_xyz += bs_idx * m * 3 + pt_idx * 3; - xyz += bs_idx * n * 3; - idx += bs_idx * m * nsample + pt_idx * nsample; - dist2 += bs_idx * m * nsample + pt_idx * nsample; + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; + dist2 += bs_idx * m * nsample + pt_idx * nsample; - T new_x = new_xyz[0]; - T new_y = new_xyz[1]; - T new_z = new_xyz[2]; + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; - float best_dist[100]; - int best_idx[100]; - for (int i = 0; i < nsample; i++) { - best_dist[i] = 1e10; - best_idx[i] = 0; - } - for (int i = 0; i < n; i++) { - T x = xyz[i * 3 + 0]; - T y = xyz[i * 3 + 1]; - T z = xyz[i * 3 + 2]; - T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + - (new_z - z) * (new_z - z); - if (d2 < best_dist[0]) { - best_dist[0] = d2; - best_idx[0] = i; - reheap(best_dist, best_idx, nsample); - } - } - heap_sort(best_dist, best_idx, nsample); - for (int i = 0; i < nsample; i++) { - idx[i] = best_idx[i]; - dist2[i] = best_dist[i]; + float best_dist[100]; + int best_idx[100]; + for (int i = 0; i < nsample; i++) { + best_dist[i] = 1e10; + best_idx[i] = 0; + } + for 
(int i = 0; i < n; i++) { + T x = xyz[i * 3 + 0]; + T y = xyz[i * 3 + 1]; + T z = xyz[i * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < best_dist[0]) { + best_dist[0] = d2; + best_idx[0] = i; + reheap(best_dist, best_idx, nsample); } } + heap_sort(best_dist, best_idx, nsample); + for (int i = 0; i < nsample; i++) { + idx[i] = best_idx[i]; + dist2[i] = best_dist[i]; + } } #endif // KNN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh b/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh deleted file mode 100644 index df56e74..0000000 --- a/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh +++ /dev/null @@ -1,300 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved -#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH -#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH - -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif - -#define MAXN 20 -__device__ const float PI = 3.1415926; - -struct Point { - float x, y; - __device__ Point() {} - __device__ Point(float x, float y) : x(x), y(y) {} -}; - -__device__ inline void swap1(Point *a, Point *b) { - Point temp; - temp.x = a->x; - temp.y = a->y; - - a->x = b->x; - a->y = b->y; - - b->x = temp.x; - b->y = temp.y; -} -__device__ inline float cross(Point o, Point a, Point b) { - return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); -} - -__device__ inline float dis(Point a, Point b) { - return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); -} -__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) { - float convex_points[2][MAXN]; - for (int j = 0; j < n_points; j++) { - convex_points[0][j] = ps[j].x; - } - for (int j = 0; j < n_points; j++) { - convex_points[1][j] = ps[j].y; - } - - Point edges[MAXN]; - float edges_angles[MAXN]; - float unique_angles[MAXN]; - int n_edges = n_points - 1; - int n_unique = 0; - int unique_flag = 0; - - for 
(int i = 0; i < n_edges; i++) { - edges[i].x = ps[i + 1].x - ps[i].x; - edges[i].y = ps[i + 1].y - ps[i].y; - } - for (int i = 0; i < n_edges; i++) { - edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x); - if (edges_angles[i] >= 0) { - edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2); - } else { - edges_angles[i] = - edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2); - } - } - unique_angles[0] = edges_angles[0]; - n_unique += 1; - for (int i = 1; i < n_edges; i++) { - for (int j = 0; j < n_unique; j++) { - if (edges_angles[i] == unique_angles[j]) { - unique_flag += 1; - } - } - if (unique_flag == 0) { - unique_angles[n_unique] = edges_angles[i]; - n_unique += 1; - unique_flag = 0; - } else { - unique_flag = 0; - } - } - - float minarea = 1e12; - for (int i = 0; i < n_unique; i++) { - float R[2][2]; - float rot_points[2][MAXN]; - R[0][0] = cos(unique_angles[i]); - R[0][1] = sin(unique_angles[i]); - R[1][0] = -sin(unique_angles[i]); - R[1][1] = cos(unique_angles[i]); - // R x Points - for (int m = 0; m < 2; m++) { - for (int n = 0; n < n_points; n++) { - float sum = 0.0; - for (int k = 0; k < 2; k++) { - sum = sum + R[m][k] * convex_points[k][n]; - } - rot_points[m][n] = sum; - } - } - - // xmin; - float xmin, ymin, xmax, ymax; - xmin = 1e12; - for (int j = 0; j < n_points; j++) { - if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { - continue; - } else { - if (rot_points[0][j] < xmin) { - xmin = rot_points[0][j]; - } - } - } - // ymin - ymin = 1e12; - for (int j = 0; j < n_points; j++) { - if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { - continue; - } else { - if (rot_points[1][j] < ymin) { - ymin = rot_points[1][j]; - } - } - } - // xmax - xmax = -1e12; - for (int j = 0; j < n_points; j++) { - if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { - continue; - } else { - if (rot_points[0][j] > xmax) { - xmax = rot_points[0][j]; - } - } - } - // ymax - ymax = -1e12; - for (int j = 0; j < n_points; 
j++) { - if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { - continue; - } else { - if (rot_points[1][j] > ymax) { - ymax = rot_points[1][j]; - } - } - } - float area = (xmax - xmin) * (ymax - ymin); - if (area < minarea) { - minarea = area; - minbox[0] = unique_angles[i]; - minbox[1] = xmin; - minbox[2] = ymin; - minbox[3] = xmax; - minbox[4] = ymax; - } - } -} - -// convex_find -__device__ inline void Jarvis(Point *in_poly, int &n_poly) { - int n_input = n_poly; - Point input_poly[20]; - for (int i = 0; i < n_input; i++) { - input_poly[i].x = in_poly[i].x; - input_poly[i].y = in_poly[i].y; - } - Point p_max, p_k; - int max_index, k_index; - int Stack[20], top1, top2; - // float sign; - double sign; - Point right_point[10], left_point[10]; - - for (int i = 0; i < n_poly; i++) { - if (in_poly[i].y < in_poly[0].y || - in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { - Point *j = &(in_poly[0]); - Point *k = &(in_poly[i]); - swap1(j, k); - } - if (i == 0) { - p_max = in_poly[0]; - max_index = 0; - } - if (in_poly[i].y > p_max.y || - in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { - p_max = in_poly[i]; - max_index = i; - } - } - if (max_index == 0) { - max_index = 1; - p_max = in_poly[max_index]; - } - - k_index = 0, Stack[0] = 0, top1 = 0; - while (k_index != max_index) { - p_k = p_max; - k_index = max_index; - for (int i = 1; i < n_poly; i++) { - sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); - if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > - dis(in_poly[Stack[top1]], p_k)))) { - p_k = in_poly[i]; - k_index = i; - } - } - top1++; - Stack[top1] = k_index; - } - - for (int i = 0; i <= top1; i++) { - right_point[i] = in_poly[Stack[i]]; - } - - k_index = 0, Stack[0] = 0, top2 = 0; - - while (k_index != max_index) { - p_k = p_max; - k_index = max_index; - for (int i = 1; i < n_poly; i++) { - sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); - if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], 
in_poly[i]) > - dis(in_poly[Stack[top2]], p_k))) { - p_k = in_poly[i]; - k_index = i; - } - } - top2++; - Stack[top2] = k_index; - } - - for (int i = top2 - 1; i >= 0; i--) { - left_point[i] = in_poly[Stack[i]]; - } - - for (int i = 0; i < top1 + top2; i++) { - if (i <= top1) { - in_poly[i] = right_point[i]; - } else { - in_poly[i] = left_point[top2 - (i - top1)]; - } - } - n_poly = top1 + top2; -} - -template -__device__ inline void Findminbox(T const *const p, T *minpoints) { - Point ps1[MAXN]; - Point convex[MAXN]; - for (int i = 0; i < 9; i++) { - convex[i].x = p[i * 2]; - convex[i].y = p[i * 2 + 1]; - } - int n_convex = 9; - Jarvis(convex, n_convex); - int n1 = n_convex; - for (int i = 0; i < n1; i++) { - ps1[i].x = convex[i].x; - ps1[i].y = convex[i].y; - } - ps1[n1].x = convex[0].x; - ps1[n1].y = convex[0].y; - - float minbbox[5] = {0}; - minBoundingRect(ps1, n1 + 1, minbbox); - float angle = minbbox[0]; - float xmin = minbbox[1]; - float ymin = minbbox[2]; - float xmax = minbbox[3]; - float ymax = minbbox[4]; - float R[2][2]; - - R[0][0] = cos(angle); - R[0][1] = sin(angle); - R[1][0] = -sin(angle); - R[1][1] = cos(angle); - - minpoints[0] = xmax * R[0][0] + ymin * R[1][0]; - minpoints[1] = xmax * R[0][1] + ymin * R[1][1]; - minpoints[2] = xmin * R[0][0] + ymin * R[1][0]; - minpoints[3] = xmin * R[0][1] + ymin * R[1][1]; - minpoints[4] = xmin * R[0][0] + ymax * R[1][0]; - minpoints[5] = xmin * R[0][1] + ymax * R[1][1]; - minpoints[6] = xmax * R[0][0] + ymax * R[1][0]; - minpoints[7] = xmax * R[0][1] + ymax * R[1][1]; -} - -template -__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes, - const T *ex_boxes, T *minbox) { - CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { - const T *cur_box = ex_boxes + index * 18; - T *cur_min_box = minbox + index * 8; - Findminbox(cur_box, cur_min_box); - } -} - -#endif // MIN_AREA_POLYGONS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh 
b/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh index 12225ff..aff1ea2 100644 --- a/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh @@ -14,6 +14,11 @@ #include "common_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp" +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) { + return (N + num_threads - 1) / num_threads; +} + template __device__ scalar_t ms_deform_attn_im2col_bilinear( const scalar_t *&bottom_data, const int &height, const int &width, @@ -262,11 +267,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { - __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; - __shared__ scalar_t cache_grad_attn_weight[blockSize]; - unsigned int tid = threadIdx.x; - const int qid_stride = num_heads * channels; CUDA_1D_KERNEL_LOOP(index, n) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -281,11 +285,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - scalar_t *grad_sampling_loc_out = - grad_sampling_loc + (grad_sampling_ptr << 1); - scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int 
l_col = 0; l_col < num_levels; ++l_col) { @@ -322,23 +326,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; - for (unsigned int _tid = 1; _tid < blockSize; ++_tid) { + for (unsigned int tid = 1; tid < blockSize; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; - _grad_a += cache_grad_attn_weight[_tid]; + _grad_a += cache_grad_attn_weight[tid]; sid += 2; } - *grad_sampling_loc_out = _grad_w; - *(grad_sampling_loc_out + 1) = _grad_h; - *grad_attn_weight_out = _grad_a; + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight_out += grad_weight_stride; - grad_sampling_loc_out += grad_loc_stride; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; } } } @@ -353,10 +357,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { - __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; - __shared__ scalar_t cache_grad_attn_weight[blockSize]; - unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -371,9 +375,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - scalar_t *grad_sampling_loc_out = - grad_sampling_loc + (grad_sampling_ptr << 1); - 
scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -422,16 +425,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( } if (tid == 0) { - *grad_sampling_loc_out = cache_grad_sampling_loc[0]; - *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; - *grad_attn_weight_out = cache_grad_attn_weight[0]; + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight_out += grad_weight_stride; - grad_sampling_loc_out += grad_loc_stride; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; } } } @@ -446,11 +449,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { - extern __shared__ int _s[]; - scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); - scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -465,9 +468,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - scalar_t *grad_sampling_loc_out = - 
grad_sampling_loc + (grad_sampling_ptr << 1); - scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -507,23 +509,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; - for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) { + for (unsigned int tid = 1; tid < blockDim.x; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; - _grad_a += cache_grad_attn_weight[_tid]; + _grad_a += cache_grad_attn_weight[tid]; sid += 2; } - *grad_sampling_loc_out = _grad_w; - *(grad_sampling_loc_out + 1) = _grad_h; - *grad_attn_weight_out = _grad_a; + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight_out += grad_weight_stride; - grad_sampling_loc_out += grad_loc_stride; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; } } } @@ -538,11 +540,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { - extern __shared__ int _s[]; - scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); - scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= 
channels; @@ -557,9 +559,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - scalar_t *grad_sampling_loc_out = - grad_sampling_loc + (grad_sampling_ptr << 1); - scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -617,16 +618,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( } if (tid == 0) { - *grad_sampling_loc_out = cache_grad_sampling_loc[0]; - *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; - *grad_attn_weight_out = cache_grad_attn_weight[0]; + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight_out += grad_weight_stride; - grad_sampling_loc_out += grad_loc_stride; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; } } } @@ -641,11 +642,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { - extern __shared__ int _s[]; - scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); - scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; int _temp = index; const int 
c_col = _temp % channels; _temp /= channels; @@ -660,9 +661,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - scalar_t *grad_sampling_loc_out = - grad_sampling_loc + (grad_sampling_ptr << 1); - scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -720,16 +720,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( } if (tid == 0) { - atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]); - atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]); - atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]); + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight_out += grad_weight_stride; - grad_sampling_loc_out += grad_loc_stride; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; } } } @@ -759,9 +759,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - scalar_t *grad_sampling_loc_out = - grad_sampling_loc + (grad_sampling_ptr << 1); - scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -788,12 +787,12 @@ __global__ void 
ms_deformable_col2im_gpu_kernel_gm( ms_deform_attn_col2im_bilinear_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, - grad_sampling_loc_out, grad_attn_weight_out); + grad_sampling_loc, grad_attn_weight); } data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight_out += grad_weight_stride; - grad_sampling_loc_out += grad_loc_stride; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; } } } diff --git a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh index 281d9f0..40a2f46 100644 --- a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh @@ -27,91 +27,48 @@ __device__ inline bool devIoU(float const *const a, float const *const b, return interS > threshold * (Sa + Sb - interS); } -__global__ static void nms_cuda(const int n_boxes, const float iou_threshold, - const int offset, const float *dev_boxes, - unsigned long long *dev_mask) { - int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; - CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { - const int tid = threadIdx.x; - - if (row_start > col_start) return; - - const int row_size = - fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); - const int col_size = - fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); - - __shared__ float block_boxes[threadsPerBlock * 4]; - if (tid < col_size) { - block_boxes[tid * 4 + 0] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; - block_boxes[tid * 4 + 1] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; - block_boxes[tid * 4 + 2] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; - block_boxes[tid * 4 + 3] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; - } - __syncthreads(); - - if (tid < row_size) { - const int cur_box_idx = threadsPerBlock * row_start + tid; - const float *cur_box = dev_boxes + 
cur_box_idx * 4; - int i = 0; - unsigned long long int t = 0; - int start = 0; - if (row_start == col_start) { - start = tid + 1; - } - for (i = start; i < col_size; i++) { - if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { - t |= 1ULL << i; - } - } - dev_mask[cur_box_idx * gridDim.y + col_start] = t; - } - } -} - -__global__ static void gather_keep_from_mask(bool *keep, - const unsigned long long *dev_mask, - const int n_boxes) { - const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; +__global__ void nms_cuda(const int n_boxes, const float iou_threshold, + const int offset, const float *dev_boxes, + unsigned long long *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; const int tid = threadIdx.x; - // mark the bboxes which have been removed. - extern __shared__ unsigned long long removed[]; + if (row_start > col_start) return; - // initialize removed. - for (int i = tid; i < col_blocks; i += blockDim.x) { - removed[i] = 0; + const int row_size = + fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 4]; + if (tid < col_size) { + block_boxes[tid * 4 + 0] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; + block_boxes[tid * 4 + 1] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; + block_boxes[tid * 4 + 2] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; + block_boxes[tid * 4 + 3] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; } __syncthreads(); - for (int nblock = 0; nblock < col_blocks; ++nblock) { - auto removed_val = removed[nblock]; - __syncthreads(); - const int i_offset = nblock * threadsPerBlock; -#pragma unroll - for (int inblock = 0; inblock < threadsPerBlock; ++inblock) { - const int i = i_offset + inblock; - if (i >= n_boxes) break; - // select a candidate, check if it should kept. 
- if (!(removed_val & (1ULL << inblock))) { - if (tid == 0) { - // mark the output. - keep[i] = true; - } - auto p = dev_mask + i * col_blocks; - // remove all bboxes which overlap the candidate. - for (int j = tid; j < col_blocks; j += blockDim.x) { - if (j >= nblock) removed[j] |= p[j]; - } - __syncthreads(); - removed_val = removed[nblock]; + if (tid < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + tid; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + unsigned long long int t = 0; + int start = 0; + if (row_start == col_start) { + start = tid + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { + t |= 1ULL << i; } } + dev_mask[cur_box_idx * gridDim.y + col_start] = t; } } - #endif // NMS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh b/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh deleted file mode 100644 index bba3b82..0000000 --- a/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved -#ifndef NMS_QUADRI_CUDA_CUH -#define NMS_QUADRI_CUDA_CUH - -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif -#include "box_iou_rotated_utils.hpp" - -__host__ __device__ inline int divideUP(const int x, const int y) { - return (((x) + (y)-1) / (y)); -} - -namespace { -int const threadsPerBlock = sizeof(unsigned long long) * 8; -} - -template -__global__ void nms_quadri_cuda_kernel(const int n_boxes, - const float iou_threshold, - const T* dev_boxes, - unsigned long long* dev_mask, - const int multi_label) { - if (multi_label == 1) { - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - // if (row_start > col_start) return; - - const int row_size = - min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); - const int col_size = - min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); - - // Compared to nms_cuda_kernel, where each box is represented with 4 values - // (x1, y1, x2, y2), each rotated box is represented with 8 values - // (x1, y1, ..., x4, y4) here. 
- __shared__ T block_boxes[threadsPerBlock * 8]; - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 8 + 0] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0]; - block_boxes[threadIdx.x * 8 + 1] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1]; - block_boxes[threadIdx.x * 8 + 2] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2]; - block_boxes[threadIdx.x * 8 + 3] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3]; - block_boxes[threadIdx.x * 8 + 4] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4]; - block_boxes[threadIdx.x * 8 + 5] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5]; - block_boxes[threadIdx.x * 8 + 6] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6]; - block_boxes[threadIdx.x * 8 + 7] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; - const T* cur_box = dev_boxes + cur_box_idx * 9; - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - // Instead of devIoU used by original horizontal nms, here - // we use the single_box_iou_quadri function from - // box_iou_rotated_utils.h - if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) > - iou_threshold) { - t |= 1ULL << i; - } - } - const int col_blocks = divideUP(n_boxes, threadsPerBlock); - dev_mask[cur_box_idx * col_blocks + col_start] = t; - } - } else { - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - // if (row_start > col_start) return; - - const int row_size = - min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); - const int col_size = - min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); - - // Compared to nms_cuda_kernel, where each box is represented with 4 values - // (x1, 
y1, x2, y2), each rotated box is represented with 8 values - // (x1, y1, , ..., x4, y4) here. - __shared__ T block_boxes[threadsPerBlock * 8]; - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 8 + 0] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 0]; - block_boxes[threadIdx.x * 8 + 1] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 1]; - block_boxes[threadIdx.x * 8 + 2] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 2]; - block_boxes[threadIdx.x * 8 + 3] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 3]; - block_boxes[threadIdx.x * 8 + 4] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 4]; - block_boxes[threadIdx.x * 8 + 5] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 5]; - block_boxes[threadIdx.x * 8 + 6] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 6]; - block_boxes[threadIdx.x * 8 + 7] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 7]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; - const T* cur_box = dev_boxes + cur_box_idx * 8; - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - // Instead of devIoU used by original horizontal nms, here - // we use the single_box_iou_quadri function from - // box_iou_rotated_utils.h - if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) > - iou_threshold) { - t |= 1ULL << i; - } - } - const int col_blocks = divideUP(n_boxes, threadsPerBlock); - dev_mask[cur_box_idx * col_blocks + col_start] = t; - } - } -} - -#endif diff --git a/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh b/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh index 747327a..80bed96 100644 --- a/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh +++ b/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh @@ -43,16 +43,18 @@ __global__ 
void nms_rotated_cuda_kernel(const int n_boxes, // (x_center, y_center, width, height, angle_degrees) here. __shared__ T block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 5 + 0] = + block_boxes[threadIdx.x * 6 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0]; - block_boxes[threadIdx.x * 5 + 1] = + block_boxes[threadIdx.x * 6 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1]; - block_boxes[threadIdx.x * 5 + 2] = + block_boxes[threadIdx.x * 6 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2]; - block_boxes[threadIdx.x * 5 + 3] = + block_boxes[threadIdx.x * 6 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3]; - block_boxes[threadIdx.x * 5 + 4] = + block_boxes[threadIdx.x * 6 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4]; + block_boxes[threadIdx.x * 6 + 5] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5]; } __syncthreads(); @@ -69,7 +71,7 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes, // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_rotated function from // box_iou_rotated_utils.h - if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > + if (single_box_iou_rotated(cur_box, block_boxes + i * 6, 0) > iou_threshold) { t |= 1ULL << i; } diff --git a/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh index 3423620..12182cc 100644 --- a/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh @@ -45,21 +45,20 @@ __global__ void points_in_boxes_part_forward_cuda_kernel( // (B, npoints), default -1 int bs_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { - if (bs_idx >= batch_size) return; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= batch_size || pt_idx >= pts_num) return; - boxes += bs_idx 
* boxes_num * 7; - pts += bs_idx * pts_num * 3 + pt_idx * 3; - box_idx_of_points += bs_idx * pts_num + pt_idx; + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num + pt_idx; - T local_x = 0, local_y = 0; - int cur_in_flag = 0; - for (int k = 0; k < boxes_num; k++) { - cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); - if (cur_in_flag) { - box_idx_of_points[0] = k; - break; - } + T local_x = 0, local_y = 0; + int cur_in_flag = 0; + for (int k = 0; k < boxes_num; k++) { + cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[0] = k; + break; } } } @@ -74,20 +73,19 @@ __global__ void points_in_boxes_all_forward_cuda_kernel( // (B, npoints), default -1 int bs_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { - if (bs_idx >= batch_size) return; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= batch_size || pt_idx >= pts_num) return; - boxes += bs_idx * boxes_num * 7; - pts += bs_idx * pts_num * 3 + pt_idx * 3; - box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; - T local_x = 0, local_y = 0; - for (int k = 0; k < boxes_num; k++) { - const int cur_in_flag = - check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); - if (cur_in_flag) { - box_idx_of_points[k] = 1; - } + T local_x = 0, local_y = 0; + for (int k = 0; k < boxes_num; k++) { + const int cur_in_flag = + check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[k] = 1; } } } diff --git a/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh deleted file mode 100644 index a0769d7..0000000 --- a/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh +++ 
/dev/null @@ -1,79 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved -#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH -#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH - -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif - -struct point { - float x, y; -}; - -template -__global__ void points_in_polygons_forward_cuda_kernel( - const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2, - const int rows, const int cols, scalar_t *inside_flag) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - int row = index / cols; - int col = index % cols; - - const scalar_t *offset_vertex1 = vertex1 + row * 2; - const scalar_t *offset_vertex2 = vertex2 + col * 8; - - point point_[1]; - point polygon[4]; - - point_[0].x = offset_vertex1[0]; - point_[0].y = offset_vertex1[1]; - - polygon[0].x = offset_vertex2[0]; - polygon[0].y = offset_vertex2[1]; - polygon[1].x = offset_vertex2[2]; - polygon[1].y = offset_vertex2[3]; - polygon[2].x = offset_vertex2[4]; - polygon[2].y = offset_vertex2[5]; - polygon[3].x = offset_vertex2[6]; - polygon[3].y = offset_vertex2[7]; - - int nCross = 0; - int i, j; - float sx, sy, tx, ty, px, py, x; - for (i = 0, j = 3; i < 4; j = i, i++) { - sx = polygon[i].x; - sy = polygon[i].y; - tx = polygon[j].x; - ty = polygon[j].y; - - px = point_[0].x; - py = point_[0].y; - - if (py < min(sy, ty)) continue; - if (py > max(sy, ty)) continue; - - if ((sx == px && sy == py) || (tx == px && ty == py)) { - break; - } else { - if ((sy < py && ty >= py) || (sy >= py && ty < py)) { - x = sx + (py - sy) * (tx - sx) / (ty - sy); - if (x == px) { - break; - } - if (x > px) { - nCross++; - } - } - } - } - if (nCross % 2 == 1) { - inside_flag[index] = 1.0; - } else { - inside_flag[index] = 0.0; - } - return; - } -} - -#endif // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh deleted file mode 100644 index 
e2f5a11..0000000 --- a/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh +++ /dev/null @@ -1,381 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved -// Modified from -// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu -// Distributed under terms of the MIT license. -#ifndef PRROI_POOL_CUDA_KERNEL_CUH -#define PRROI_POOL_CUDA_KERNEL_CUH - -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif - -template -__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data, - const int h, - const int w, - const int height, - const int width) { - bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); - T retVal = overflow ? 0.0f : data[h * width + w]; - return retVal; -} - -template -__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) { - return (1.0f - abs(dh)) * (1.0f - abs(dw)); -} - -template -__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t, - T c1, T c2) { - return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1; -} - -template -__device__ static T PrRoIPoolingInterpolation(const T *data, const T h, - const T w, const int height, - const int width) { - T retVal = 0.0f; - int h1 = floorf(h); - int w1 = floorf(w); - retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); - h1 = floorf(h) + 1; - w1 = floorf(w); - retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); - h1 = floorf(h); - w1 = floorf(w) + 1; - retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); - h1 = floorf(h) + 1; - w1 = floorf(w) + 1; - retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); - return retVal; -} - -template -__device__ static T PrRoIPoolingMatCalculation(const T *this_data, - const int s_h, const int s_w, - const 
int e_h, const int e_w, - const T y0, const T x0, - const T y1, const T x1, - const int h0, const int w0) { - T alpha, beta, lim_alpha, lim_beta, tmp; - T sum_out = 0; - - alpha = x0 - T(s_w); - beta = y0 - T(s_h); - lim_alpha = x1 - T(s_w); - lim_beta = y1 - T(s_h); - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp; - - alpha = T(e_w) - x1; - lim_alpha = T(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp; - - alpha = x0 - T(s_w); - beta = T(e_h) - y1; - lim_alpha = x1 - T(s_w); - lim_beta = T(e_h) - y0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp; - - alpha = T(e_w) - x1; - lim_alpha = T(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp; - - return sum_out; -} - -template -__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff, - const int h, const int w, - const int height, - const int width, - const T coeff) { - bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); - if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff); -} - -template -__device__ static void PrRoIPoolingMatDistributeDiff( - T *diff, const T top_diff, const int s_h, const int s_w, const int e_h, - const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0, - const int w0) { - T alpha, beta, lim_alpha, lim_beta, tmp; - 
- alpha = x0 - T(s_w); - beta = y0 - T(s_h); - lim_alpha = x1 - T(s_w); - lim_beta = y1 - T(s_h); - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp); - - alpha = T(e_w) - x1; - lim_alpha = T(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp); - - alpha = x0 - T(s_w); - beta = T(e_h) - y1; - lim_alpha = x1 - T(s_w); - lim_beta = T(e_h) - y0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp); - - alpha = T(e_w) - x1; - lim_alpha = T(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); -} - -template -__global__ void prroi_pool_forward_cuda_kernel( - const int nthreads, const T *input, const T *rois, T *output, - const int pooled_height, const int pooled_width, const T spatial_scale, - const int channels, const int height, const int width) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const T *offset_rois = rois + n * 5; - int roi_batch_ind = offset_rois[0]; - - T roi_x1 = offset_rois[1] * spatial_scale; - T roi_y1 = offset_rois[2] * spatial_scale; - T roi_x2 = offset_rois[3] * spatial_scale; - T roi_y2 = 
offset_rois[4] * spatial_scale; - - T roi_width = max(roi_x2 - roi_x1, ((T)0.0)); - T roi_height = max(roi_y2 - roi_y1, ((T)0.0)); - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - const T *this_data = - input + (roi_batch_ind * channels + c) * height * width; - T *this_out = output + index; - - T bin_x1 = roi_x1 + bin_size_w * pw; - T bin_y1 = roi_y1 + bin_size_h * ph; - T bin_x2 = bin_x1 + bin_size_w; - T bin_y2 = bin_y1 + bin_size_h; - - T bin_size = max(T(0.0), bin_size_w * bin_size_h); - if (bin_size == 0) { - *this_out = 0; - continue; - } - - T sum_out = 0; - - int start_x, start_y, end_x, end_y; - - start_x = floorf(bin_x1); - end_x = ceilf(bin_x2); - start_y = floorf(bin_y1); - end_y = ceilf(bin_y2); - - for (int bin_x = start_x; bin_x < end_x; ++bin_x) - for (int bin_y = start_y; bin_y < end_y; ++bin_y) - sum_out += PrRoIPoolingMatCalculation( - this_data, bin_y, bin_x, bin_y + 1, bin_x + 1, - max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), - min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, - width); - *this_out = sum_out / bin_size; - } -} - -template -__global__ void prroi_pool_backward_cuda_kernel( - const int nthreads, const T *grad_output, const T *rois, T *grad_input, - const int pooled_height, const int pooled_width, const T spatial_scale, - const int channels, const int height, const int width) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - auto rois_cur = rois + n * 5; - - int roi_batch_ind = rois_cur[0]; - T roi_x1 = rois_cur[1] * spatial_scale; - T roi_y1 = rois_cur[2] * spatial_scale; - T roi_x2 = rois_cur[3] * spatial_scale; - T roi_y2 = rois_cur[4] * spatial_scale; - - T roi_width = max(roi_x2 - roi_x1, 
(T)0); - T roi_height = max(roi_y2 - roi_y1, (T)0); - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - const T *this_out_grad = grad_output + index; - T *this_data_grad = - grad_input + (roi_batch_ind * channels + c) * height * width; - - T bin_x1 = roi_x1 + bin_size_w * pw; - T bin_y1 = roi_y1 + bin_size_h * ph; - T bin_x2 = bin_x1 + bin_size_w; - T bin_y2 = bin_y1 + bin_size_h; - - T bin_size = max(T(0.0), bin_size_w * bin_size_h); - - T sum_out = bin_size == T(0) ? T(0) : *this_out_grad / bin_size; - - int start_x, start_y, end_x, end_y; - - start_x = floorf(bin_x1); - end_x = ceilf(bin_x2); - start_y = floorf(bin_y1); - end_y = ceilf(bin_y2); - - for (int bin_x = start_x; bin_x < end_x; ++bin_x) - for (int bin_y = start_y; bin_y < end_y; ++bin_y) - PrRoIPoolingMatDistributeDiff( - this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1, - max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), - min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, - width); - } -} - -template -__global__ void prroi_pool_coor_backward_cuda_kernel( - const int nthreads, const T *output, const T *grad_output, const T *input, - const T *rois, T *grad_rois, const int pooled_height, - const int pooled_width, const T spatial_scale, const int channels, - const int height, const int width) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - auto rois_cur = rois + n * 5; - - int roi_batch_ind = rois_cur[0]; - T roi_x1 = rois_cur[1] * spatial_scale; - T roi_y1 = rois_cur[2] * spatial_scale; - T roi_x2 = rois_cur[3] * spatial_scale; - T roi_y2 = rois_cur[4] * spatial_scale; - - T roi_width = max(roi_x2 - roi_x1, (T)0); - T roi_height = max(roi_y2 - roi_y1, (T)0); - T 
bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - const T output_grad_val = grad_output[index]; - const T *this_input_data = - input + (roi_batch_ind * channels + c) * height * width; - const T output_val = output[index]; - T *this_rois_grad = grad_rois + n * 5; - - T bin_x1 = roi_x1 + bin_size_w * pw; - T bin_y1 = roi_y1 + bin_size_h * ph; - T bin_x2 = bin_x1 + bin_size_w; - T bin_y2 = bin_y1 + bin_size_h; - - T bin_size = max(T(0.0), bin_size_w * bin_size_h); - - T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size; - - // WARNING: to be discussed - if (sum_out == 0) continue; - - int start_x, start_y, end_x, end_y; - - start_x = floorf(bin_x1); - end_x = ceilf(bin_x2); - start_y = floorf(bin_y1); - end_y = ceilf(bin_y2); - - T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0; - for (int bin_y = start_y; bin_y < end_y; ++bin_y) { - grad_x1_y += PrRoIPoolingSingleCoorIntegral( - max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, - PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1, - height, width), - PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1, - height, width)); - - grad_x2_y += PrRoIPoolingSingleCoorIntegral( - max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, - PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2, - height, width), - PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2, - height, width)); - } - - for (int bin_x = start_x; bin_x < end_x; ++bin_x) { - grad_x_y1 += PrRoIPoolingSingleCoorIntegral( - max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, - PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x), - height, width), - PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1), - height, width)); - - grad_x_y2 += PrRoIPoolingSingleCoorIntegral( - max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, - 
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x), - height, width), - PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1), - height, width)); - } - - T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val; - T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val; - T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val; - T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val; - - partial_x1 = partial_x1 / bin_size * spatial_scale; - partial_x2 = partial_x2 / bin_size * spatial_scale; - partial_y1 = partial_y1 / bin_size * spatial_scale; - partial_y2 = partial_y2 / bin_size * spatial_scale; - - // (index, x1, y1, x2, y2) - this_rois_grad[0] = 0; - atomicAdd(this_rois_grad + 1, - (partial_x1 * (1.0f - T(pw) / pooled_width) + - partial_x2 * (1.0f - T(pw + 1) / pooled_width)) * - output_grad_val); - atomicAdd(this_rois_grad + 2, - (partial_y1 * (1.0f - T(ph) / pooled_height) + - partial_y2 * (1.0f - T(ph + 1) / pooled_height)) * - output_grad_val); - atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width + - partial_x1 * T(pw) / pooled_width) * - output_grad_val); - atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height + - partial_y1 * T(ph) / pooled_height) * - output_grad_val); - } -} - -#endif // ROI_POOL_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh deleted file mode 100644 index 4383d9e..0000000 --- a/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh +++ /dev/null @@ -1,242 +0,0 @@ -// Modified from -// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu -#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH -#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH - -#include -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else // MMCV_USE_PARROTS -#include "pytorch_cuda_helper.hpp" -#endif // MMCV_USE_PARROTS - -/*** Forward ***/ 
-template -__global__ void riroi_align_rotated_forward_cuda_kernel( - const int nthreads, const scalar_t *bottom_data, - const scalar_t *bottom_rois, const scalar_t spatial_scale, - const int num_samples, const bool clockwise, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int num_orientations, scalar_t *top_data) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int o = (index / pooled_width / pooled_height) % num_orientations; - int c = - (index / pooled_width / pooled_height / num_orientations) % channels; - int n = index / pooled_width / pooled_height / num_orientations / channels; - - const scalar_t *offset_bottom_rois = bottom_rois + n * 6; - int roi_batch_ind = offset_bottom_rois[0]; - - // Do not using rounding; this implementation detail is critical - scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; - scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; - scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; - scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; - // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; - scalar_t theta = offset_bottom_rois[5]; - // Force malformed ROIs to be 1x1 - roi_width = max(roi_width, (scalar_t)1.); - roi_height = max(roi_height, (scalar_t)1.); - scalar_t bin_size_h = static_cast(roi_height) / - static_cast(pooled_height); - scalar_t bin_size_w = - static_cast(roi_width) / static_cast(pooled_width); - - // find aligned index - scalar_t ind_float = theta * num_orientations / (2 * M_PI); - int ind = floorf(ind_float); - scalar_t l_var = ind_float - (scalar_t)ind; - scalar_t r_var = 1.0 - l_var; - // correct start channel - ind = (ind + num_orientations) % num_orientations; - // rotated channel - int ind_rot = (o - ind + num_orientations) % num_orientations; - int ind_rot_plus = 
(ind_rot + 1 + num_orientations) % num_orientations; - const scalar_t *offset_bottom_data = - bottom_data + (roi_batch_ind * channels * num_orientations + - c * num_orientations + ind_rot) * - height * width; - - const scalar_t *offset_bottom_data_plus = - bottom_data + (roi_batch_ind * channels * num_orientations + - c * num_orientations + ind_rot_plus) * - height * width; - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (num_samples > 0) - ? num_samples - : ceilf(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = - (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); - - // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). - // Appropriate translation needs to be applied after. - if (clockwise) { - theta = -theta; // If clockwise, the angle needs to be reversed. - } - scalar_t roi_start_h = -roi_height / 2.0; - scalar_t roi_start_w = -roi_width / 2.0; - scalar_t cosscalar_theta = cos(theta); - scalar_t sinscalar_theta = sin(theta); - - // We do average (integral) pooling inside a bin - const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 - - scalar_t output_val = 0.; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 - const scalar_t yy = - roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const scalar_t xx = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - // Rotate by theta (counterclockwise) around the center and translate - scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; - scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; - - scalar_t val = bilinear_interpolate( - offset_bottom_data, height, width, y, x, index); - scalar_t val_plus = bilinear_interpolate( - offset_bottom_data_plus, height, width, y, x, index); - output_val += r_var * val + l_var * val_plus; - } - } - output_val /= count; - - top_data[index] = output_val; - } -} - -/*** Backward ***/ -template -__global__ void riroi_align_rotated_backward_cuda_kernel( - const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, - const scalar_t spatial_scale, const int num_samples, const bool clockwise, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int num_orientations, - scalar_t *bottom_diff) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int o = (index / pooled_width / pooled_height) % num_orientations; - int c = - (index / pooled_width / pooled_height / num_orientations) % channels; - int n = index / pooled_width / pooled_height / num_orientations / channels; - - const scalar_t *offset_bottom_rois = bottom_rois + n * 6; - int roi_batch_ind = offset_bottom_rois[0]; - - // Do not round - scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; - scalar_t roi_center_h = 
offset_bottom_rois[2] * spatial_scale; - scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; - scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; - // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; - scalar_t theta = offset_bottom_rois[5]; - // Force malformed ROIs to be 1x1 - roi_width = max(roi_width, (scalar_t)1.); - roi_height = max(roi_height, (scalar_t)1.); - - scalar_t bin_size_h = static_cast(roi_height) / - static_cast(pooled_height); - scalar_t bin_size_w = - static_cast(roi_width) / static_cast(pooled_width); - - // find aligned index - scalar_t ind_float = theta * num_orientations / (2 * M_PI); - int ind = floorf(ind_float); - scalar_t l_var = ind_float - (scalar_t)ind; - scalar_t r_var = 1.0 - l_var; - // correct start channel - ind = (ind + num_orientations) % num_orientations; - // rotated channel - int ind_rot = (o - ind + num_orientations) % num_orientations; - int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; - scalar_t *offset_bottom_diff = - bottom_diff + (roi_batch_ind * channels * num_orientations + - c * num_orientations + ind_rot) * - height * width; - scalar_t *offset_bottom_diff_plus = - bottom_diff + (roi_batch_ind * channels * num_orientations + - c * num_orientations + ind_rot_plus) * - height * width; - int top_offset = - (n * channels * num_orientations + c * num_orientations + o) * - pooled_height * pooled_width; - const scalar_t *offset_top_diff = top_diff + top_offset; - const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (num_samples > 0) - ? num_samples - : ceilf(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = - (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); - - // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). - // Appropriate translation needs to be applied after. 
- if (clockwise) { - theta = -theta; // If clockwise, the angle needs to be reversed. - } - scalar_t roi_start_h = -roi_height / 2.0; - scalar_t roi_start_w = -roi_width / 2.0; - scalar_t cosTheta = cos(theta); - scalar_t sinTheta = sin(theta); - - // We do average (integral) pooling inside a bin - const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 - - for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 - const scalar_t yy = - roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const scalar_t xx = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - // Rotate by theta around the center and translate - scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; - scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; - - scalar_t w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - - bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, - w4, x_low, x_high, y_low, - y_high, index); - - scalar_t g1 = top_diff_this_bin * w1 / count; - scalar_t g2 = top_diff_this_bin * w2 / count; - scalar_t g3 = top_diff_this_bin * w3 / count; - scalar_t g4 = top_diff_this_bin * w4 / count; - - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var); - atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var); - atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var); - atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var); - - atomicAdd(offset_bottom_diff_plus + y_low * width + x_low, - g1 * l_var); - atomicAdd(offset_bottom_diff_plus + y_low * width + x_high, - g2 * l_var); - atomicAdd(offset_bottom_diff_plus + y_high * width + x_low, - g3 * l_var); - atomicAdd(offset_bottom_diff_plus + y_high * width + x_high, - g4 * l_var); - - } // if - } // ix - } // iy - } // 
CUDA_1D_KERNEL_LOOP -} // RiRoIAlignBackward - -#endif // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh index 8274dc5..33571f2 100644 --- a/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh @@ -20,7 +20,7 @@ template __global__ void roi_align_rotated_forward_cuda_kernel( const int nthreads, const scalar_t *bottom_data, const scalar_t *bottom_rois, const scalar_t spatial_scale, - const int sampling_ratio, const bool aligned, const bool clockwise, + const int sample_num, const bool aligned, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { @@ -58,11 +58,11 @@ __global__ void roi_align_rotated_forward_cuda_kernel( bottom_data + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio + int roi_bin_grid_h = (sample_num > 0) + ? sample_num : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. 
@@ -104,7 +104,7 @@ __global__ void roi_align_rotated_forward_cuda_kernel( template __global__ void roi_align_rotated_backward_cuda_kernel( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, - const scalar_t spatial_scale, const int sampling_ratio, const bool aligned, + const scalar_t spatial_scale, const int sample_num, const bool aligned, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { CUDA_1D_KERNEL_LOOP(index, nthreads) { @@ -146,11 +146,11 @@ __global__ void roi_align_rotated_backward_cuda_kernel( const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio + int roi_bin_grid_h = (sample_num > 0) + ? sample_num : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. 
diff --git a/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh index fc0aacf..3b95dc7 100644 --- a/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh @@ -44,38 +44,37 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N, // npoints): -1 means point does not in this box, otherwise: encode (x_idxs, // y_idxs, z_idxs) by binary bit + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { - if (box_idx >= boxes_num) return; + if (pt_idx >= pts_num || box_idx >= boxes_num) return; - pts += pt_idx * 3; - rois += box_idx * 7; - pts_mask += box_idx * pts_num + pt_idx; + pts += pt_idx * 3; + rois += box_idx * 7; + pts_mask += box_idx * pts_num + pt_idx; - T local_x = 0, local_y = 0; - int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); + T local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); - pts_mask[0] = -1; - if (cur_in_flag > 0) { - T local_z = pts[2] - rois[2]; - T x_size = rois[3], y_size = rois[4], z_size = rois[5]; + pts_mask[0] = -1; + if (cur_in_flag > 0) { + T local_z = pts[2] - rois[2]; + T x_size = rois[3], y_size = rois[4], z_size = rois[5]; - T x_res = x_size / out_x; - T y_res = y_size / out_y; - T z_res = z_size / out_z; + T x_res = x_size / out_x; + T y_res = y_size / out_y; + T z_res = z_size / out_z; - unsigned int x_idx = int((local_x + x_size / 2) / x_res); - unsigned int y_idx = int((local_y + y_size / 2) / y_res); - unsigned int z_idx = int(local_z / z_res); + unsigned int x_idx = int((local_x + x_size / 2) / x_res); + unsigned int y_idx = int((local_y + y_size / 2) / y_res); + unsigned int z_idx = int(local_z / z_res); - x_idx = min(max(x_idx, 0), out_x - 1); - y_idx = min(max(y_idx, 0), out_y - 1); - 
z_idx = min(max(z_idx, 0), out_z - 1); + x_idx = min(max(x_idx, 0), out_x - 1); + y_idx = min(max(y_idx, 0), out_y - 1); + z_idx = min(max(z_idx, 0), out_z - 1); - unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; + unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; - pts_mask[0] = idx_encoding; - } + pts_mask[0] = idx_encoding; } } @@ -87,24 +86,26 @@ __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, T *pts_idx_of_voxels) { // params pts_mask: (N, npoints) 0 or 1 // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) - CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) { - int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; - - for (int k = 0; k < pts_num; k++) { - if (pts_mask[box_idx * pts_num + k] != -1) { - unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; - unsigned int x_idx = (idx_encoding >> 16) & 0xFF; - unsigned int y_idx = (idx_encoding >> 8) & 0xFF; - unsigned int z_idx = idx_encoding & 0xFF; - unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + - y_idx * out_z * max_pts_each_voxel + - z_idx * max_pts_each_voxel; - unsigned int cnt = pts_idx_of_voxels[base_offset]; - if (cnt < max_num_pts) { - pts_idx_of_voxels[base_offset + cnt + 1] = k; - pts_idx_of_voxels[base_offset]++; - } + + int box_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (box_idx >= boxes_num) return; + + int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; + + for (int k = 0; k < pts_num; k++) { + if (pts_mask[box_idx * pts_num + k] != -1) { + unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; + unsigned int x_idx = (idx_encoding >> 16) & 0xFF; + unsigned int y_idx = (idx_encoding >> 8) & 0xFF; + unsigned int z_idx = idx_encoding & 0xFF; + unsigned int base_offset = x_idx * out_y * out_z * 
max_pts_each_voxel + + y_idx * out_z * max_pts_each_voxel + + z_idx * max_pts_each_voxel; + unsigned int cnt = pts_idx_of_voxels[base_offset]; + if (cnt < max_num_pts) { + pts_idx_of_voxels[base_offset + cnt + 1] = k; + pts_idx_of_voxels[base_offset]++; } } } @@ -123,38 +124,39 @@ __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= channels) return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + - offset_base * max_pts_each_voxel; - pooled_features += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - argmax += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - - int argmax_idx = -1; - float max_val = -1e50; - - int total_pts = pts_idx_of_voxels[0]; - - for (int k = 1; k <= total_pts; k++) { - if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > - max_val) { - max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; - argmax_idx = pts_idx_of_voxels[k]; - } + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + 
offset_base * channels + channel_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int argmax_idx = -1; + float max_val = -1e50; + + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) { + max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + argmax_idx = pts_idx_of_voxels[k]; } + } - if (argmax_idx != -1) { - pooled_features[0] = max_val; - } - argmax[0] = argmax_idx; + if (argmax_idx != -1) { + pooled_features[0] = max_val; } + argmax[0] = argmax_idx; } template @@ -170,28 +172,30 @@ __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= channels) return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + - offset_base * max_pts_each_voxel; - pooled_features += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - - float sum_val = 0; - int total_pts = pts_idx_of_voxels[0]; - - for (int k = 1; k <= total_pts; k++) { - sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; - } + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * 
out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + float sum_val = 0; + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + } - if (total_pts > 0) { - pooled_features[0] = sum_val / total_pts; - } + if (total_pts > 0) { + pooled_features[0] = sum_val / total_pts; } } @@ -206,22 +210,24 @@ __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= channels) return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - argmax += box_idx * out_x * out_y * out_z * channels + + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; - grad_out += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - if (argmax[0] == -1) return; + if (argmax[0] == -1) return; - atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); - } + atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); } template @@ -236,24 
+242,26 @@ __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= channels) return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + - offset_base * max_pts_each_voxel; - grad_out += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - - int total_pts = pts_idx_of_voxels[0]; - float cur_grad = 1 / fmaxf(float(total_pts), 1.0); - for (int k = 1; k <= total_pts; k++) { - atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, - grad_out[0] * cur_grad); - } + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int total_pts = pts_idx_of_voxels[0]; + float cur_grad = 1 / fmaxf(float(total_pts), 1.0); + for (int k = 1; k <= total_pts; k++) { + atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, + grad_out[0] * cur_grad); } } diff --git a/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh index 545f6ff..7597719 100644 --- 
a/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh @@ -42,23 +42,23 @@ __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, // params boxes3d: (B, M, 7) // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means // background points + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; int bs_idx = blockIdx.z; - CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { - if (box_idx >= boxes_num || bs_idx >= batch_size) return; - int assign_idx = - bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; - pts_assign[assign_idx] = 0; + if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) { + return; + } + int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; + pts_assign[assign_idx] = 0; - int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; - int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; + int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; + int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; - T local_x = 0, local_y = 0; - int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, - local_x, local_y); - pts_assign[assign_idx] = cur_in_flag; - } + T local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, + local_x, local_y); + pts_assign[assign_idx] = cur_in_flag; } __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, @@ -69,32 +69,35 @@ __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, // params pts_assign: (B, N) // params pts_idx: (B, M, 512) // params pooled_empty_flag: (B, M) - CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) { - int bs_idx = blockIdx.y; - - int cnt = 0; - for (int k = 0; k < pts_num; k++) { - if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + - boxes_idx]) { - if (cnt < sampled_pts_num) { - pts_idx[bs_idx * boxes_num * sampled_pts_num + - boxes_idx * 
sampled_pts_num + cnt] = k; - cnt++; - } else - break; - } + + int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (boxes_idx >= boxes_num) { + return; + } + + int bs_idx = blockIdx.y; + + int cnt = 0; + for (int k = 0; k < pts_num; k++) { + if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) { + if (cnt < sampled_pts_num) { + pts_idx[bs_idx * boxes_num * sampled_pts_num + + boxes_idx * sampled_pts_num + cnt] = k; + cnt++; + } else + break; } + } - if (cnt == 0) { - pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; - } else if (cnt < sampled_pts_num) { - // duplicate same points for sampling - for (int k = cnt; k < sampled_pts_num; k++) { - int duplicate_idx = k % cnt; - int base_offset = - bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; - pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; - } + if (cnt == 0) { + pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; + } else if (cnt < sampled_pts_num) { + // duplicate same points for sampling + for (int k = cnt; k < sampled_pts_num; k++) { + int duplicate_idx = k % cnt; + int base_offset = + bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; + pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; } } } @@ -109,26 +112,33 @@ __global__ void roipoint_pool3d_forward( // params pts_feature: (B, N, C) // params pooled_features: (B, M, 512, 3+C) // params pooled_empty_flag: (B, M) + + int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; int bs_idx = blockIdx.z; - CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) { - if (box_idx >= boxes_num || bs_idx >= batch_size) return; - if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return; - - int temp_idx = bs_idx * boxes_num * sampled_pts_num + - box_idx * sampled_pts_num + sample_pt_idx; - int src_pt_idx = pts_idx[temp_idx]; - int dst_feature_offset = temp_idx * (3 + feature_in_len); - - for (int j = 0; j < 3; j++) - 
pooled_features[dst_feature_offset + j] = - xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; - - int src_feature_offset = - bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; - memcpy(pooled_features + dst_feature_offset + 3, - pts_feature + src_feature_offset, feature_in_len * sizeof(T)); + + if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || + bs_idx >= batch_size) { + return; + } + + if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) { + return; } + + int temp_idx = bs_idx * boxes_num * sampled_pts_num + + box_idx * sampled_pts_num + sample_pt_idx; + int src_pt_idx = pts_idx[temp_idx]; + int dst_feature_offset = temp_idx * (3 + feature_in_len); + + for (int j = 0; j < 3; j++) + pooled_features[dst_feature_offset + j] = + xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; + + int src_feature_offset = + bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; + memcpy(pooled_features + dst_feature_offset + 3, + pts_feature + src_feature_offset, feature_in_len * sizeof(T)); } #endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh deleted file mode 100644 index ffcc658..0000000 --- a/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. 
-// Modified from -// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu -#ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH -#define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH - -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif - -template -__global__ void rotated_feature_align_forward_kernel( - const int nthreads, const int points, const scalar_t* bottom_data, - const scalar_t* best_bboxes, const scalar_t spatial_scale, - const int channels, const int height, const int width, scalar_t* top_data) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - int w = index % width; - int h = (index / width) % height; - int c = (index / width / height) % channels; - int n = index / width / height / channels; - - const scalar_t* bbox_offset = - best_bboxes + ((n * height + h) * width + w) * 5; - scalar_t roi_y = bbox_offset[0] * spatial_scale; - scalar_t roi_x = bbox_offset[1] * spatial_scale; - - scalar_t px[5] = {roi_x, 0, 0, 0, 0}; - scalar_t py[5] = {roi_y, 0, 0, 0, 0}; - - if (points > 1) { - scalar_t roi_w = bbox_offset[2] * spatial_scale; - scalar_t roi_h = bbox_offset[3] * spatial_scale; - scalar_t roi_a = bbox_offset[4]; - - scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; - scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); - scalar_t wx = cosa * w_2, wy = sina * w_2; - scalar_t hx = -sina * h_2, hy = cosa * h_2; - - px[1] = roi_x + wx + hx; - py[1] = roi_y + wy + hy; - px[2] = roi_x - wx + hx; - py[2] = roi_y - wy + hy; - px[3] = roi_x - wx - hx; - py[3] = roi_y - wy - hy; - px[4] = roi_x + wx - hx; - py[4] = roi_y + wy - hy; - } - - const scalar_t* offset_bottom_data = - bottom_data + (n * channels + c) * height * width; - - scalar_t output_val = bottom_data[index]; - for (int i = 0; i < points; i++) { - output_val += bilinear_interpolate(offset_bottom_data, height, - width, py[i], px[i], i); - } - top_data[index] = output_val; - } -} - -template -__global__ void 
rotated_feature_align_backward_kernel( - const int nthreads, const int points, const scalar_t* top_diff, - const scalar_t* best_bboxes, const scalar_t spatial_scale, - const int channels, const int height, const int width, - scalar_t* bottom_diff) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - int w = index % width; - int h = (index / width) % height; - int c = (index / width / height) % channels; - int n = index / width / height / channels; - - const scalar_t* bbox_offset = - best_bboxes + ((n * height + h) * width + w) * 5; - scalar_t roi_y = bbox_offset[0] * spatial_scale; - scalar_t roi_x = bbox_offset[1] * spatial_scale; - - scalar_t px[5] = {roi_x, 0, 0, 0, 0}; - scalar_t py[5] = {roi_y, 0, 0, 0, 0}; - - if (points > 1) { - scalar_t roi_w = bbox_offset[2] * spatial_scale; - scalar_t roi_h = bbox_offset[3] * spatial_scale; - scalar_t roi_a = bbox_offset[4]; - - scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; - scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); - scalar_t wx = cosa * w_2, wy = sina * w_2; - scalar_t hx = -sina * h_2, hy = cosa * h_2; - - px[1] = roi_x + wx + hx; - py[1] = roi_y + wy + hy; - px[2] = roi_x - wx + hx; - py[2] = roi_y - wy + hy; - px[3] = roi_x - wx - hx; - py[3] = roi_y - wy - hy; - px[4] = roi_x + wx - hx; - py[4] = roi_y + wy - hy; - } - - scalar_t* offset_bottom_diff = - bottom_diff + (n * channels + c) * height * width; - scalar_t value_top_diff = top_diff[index]; - - atomicAdd(bottom_diff + index, value_top_diff); - for (int i = 0; i < points; i++) { - scalar_t w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - - bilinear_interpolate_gradient(height, width, py[i], px[i], w1, - w2, w3, w4, x_low, x_high, y_low, - y_high, i); - scalar_t g1 = value_top_diff * w1; - scalar_t g2 = value_top_diff * w2; - scalar_t g3 = value_top_diff * w3; - scalar_t g4 = value_top_diff * w4; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); - atomicAdd(offset_bottom_diff + y_low * 
width + x_high, g2); - atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); - atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); - } - } - } -} -#endif // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh index af5b9f6..7f9c402 100644 --- a/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh @@ -34,7 +34,7 @@ __device__ __forceinline__ static void reduceMax(double *address, double val) { } // get rid of meaningless warnings when compiling host code -#ifdef MMCV_WITH_HIP +#ifdef HIP_DIFF __device__ __forceinline__ static void reduceAdd(float *address, float val) { atomicAdd(address, val); } @@ -86,7 +86,7 @@ __device__ __forceinline__ static void reduceAdd(double *address, double val) { #endif } #endif // __CUDA_ARCH__ -#endif // MMCV_WITH_HIP +#endif // HIP_DIFF template __global__ void feats_reduce_kernel( diff --git a/mmcv/ops/csrc/common/cuda/spconv/indice.cuh b/mmcv/ops/csrc/common/cuda/spconv/indice.cuh deleted file mode 100644 index 5ef0009..0000000 --- a/mmcv/ops/csrc/common/cuda/spconv/indice.cuh +++ /dev/null @@ -1,236 +0,0 @@ -// Copyright 2019 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef INDICE_CU_H_ -#define INDICE_CU_H_ -#include -#include - -#include - -template -__global__ void prepareIndicePairsKernel( - tv::TensorView indicesIn, tv::TensorView indicesOut, - tv::TensorView gridsOut, tv::TensorView indicePairs, - tv::TensorView indiceNum, tv::TensorView indicePairUnique, - const tv::SimpleVector kernelSize, - const tv::SimpleVector stride, - const tv::SimpleVector padding, - const tv::SimpleVector dilation, - const tv::SimpleVector outSpatialShape) { - auto numActIn = indicesIn.dim(0); - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index kernelVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - kernelVolume *= kernelSize[i]; - } - Index numValidPoints = 0; - Index validPoints[KernelMaxVolume * (NDim + 1)]; - Index *pointPtr = nullptr; - auto indicePairsDim2 = indicePairs.dim(2); - Index index; - for (int ix : tv::KernelLoopX(numActIn)) { - numValidPoints = getValidOutPos( - indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), - stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), - validPoints); - for (Index i = 0; i < numValidPoints; ++i) { - pointPtr = validPoints + i * (NDim + 1); - auto offset = pointPtr[NDim]; - auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); - indicePairs(offset, 0, oldNum) = ix; - index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + - spatialVolume * indicesIn(ix, 0); - indicePairs(offset, 1, oldNum) = index; - indicePairUnique[offset * indicePairsDim2 + oldNum] = index; - } - } -} - -template -__global__ void prepareDeConvIndicePairsKernel( - tv::TensorView indicesIn, tv::TensorView indicesOut, - tv::TensorView gridsOut, tv::TensorView indicePairs, - tv::TensorView indiceNum, tv::TensorView indicePairUnique, - const tv::SimpleVector kernelSize, - const tv::SimpleVector stride, - const tv::SimpleVector padding, - const tv::SimpleVector dilation, - const tv::SimpleVector 
outSpatialShape) { - auto numActIn = indicesIn.dim(0); - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index kernelVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - kernelVolume *= kernelSize[i]; - } - Index numValidPoints = 0; - Index validPoints[KernelMaxVolume * (NDim + 1)]; - Index *pointPtr = nullptr; - auto indicePairsDim2 = indicePairs.dim(2); - Index index; - for (int ix : tv::KernelLoopX(numActIn)) { - numValidPoints = getValidOutPosTranspose( - indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), - stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), - validPoints); - for (Index i = 0; i < numValidPoints; ++i) { - pointPtr = validPoints + i * (NDim + 1); - auto offset = pointPtr[NDim]; - auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); - indicePairs(offset, 0, oldNum) = ix; - index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + - spatialVolume * indicesIn(ix, 0); - indicePairs(offset, 1, oldNum) = index; - indicePairUnique[offset * indicePairsDim2 + oldNum] = index; - } - } -} - -template -__global__ void assignGridAndIndiceOutKernel( - tv::TensorView indicesOut, tv::TensorView gridsOut, - int numAct, tv::TensorView indicePairs, - tv::TensorView indicePairUnique, - const tv::SimpleVector outSpatialShape, int batchSize) { - Index index; - auto indicesOutPtr = indicesOut.data(); - for (int ix : tv::KernelLoopX(numAct)) { - index = indicePairUnique[ix]; - gridsOut[index] = ix; - index = tv::rowArrayIdxInv( - index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data()); - indicesOut[ix * (NDim + 1)] = index % batchSize; - } -} - -template -__global__ void assignIndicePairsKernel( - tv::TensorView indicesOut, tv::TensorView gridsOut, - int numActIn, tv::TensorView indicePairs, - tv::TensorView indicePairUnique, - const tv::SimpleVector outSpatialShape) { - Index index; - int kernelVolume = indicePairs.dim(0); - for (int 
ix : tv::KernelLoopX(numActIn)) { - for (int i = 0; i < kernelVolume; ++i) { - index = indicePairs(i, 1, ix); - if (index > -1) { - indicePairs(i, 1, ix) = gridsOut[index]; - } - } - } -} - -template -__global__ void prepareSubMGridKernel( - tv::TensorView indicesIn, tv::TensorView gridsOut, - const tv::SimpleVector outSpatialShape) { - auto numActIn = indicesIn.dim(0); - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index index = 0; - for (int ix : tv::KernelLoopX(numActIn)) { - index = tv::rowArrayIdx(indicesIn.data() + ix * (NDim + 1) + 1, - outSpatialShape.data()) + - spatialVolume * indicesIn(ix, 0); - gridsOut[index] = ix; - } -} - -template -__global__ void getSubMIndicePairsKernel( - tv::TensorView indicesIn, tv::TensorView gridsOut, - tv::TensorView indicePairs, tv::TensorView indiceNum, - const tv::SimpleVector kernelSize, - const tv::SimpleVector stride, - const tv::SimpleVector padding, - const tv::SimpleVector dilation, - const tv::SimpleVector outSpatialShape) { - auto numActIn = indicesIn.dim(0); - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index numValidPoints = 0; - Index validPoints[KernelMaxVolume * (NDim + 1)]; - Index *pointPtr = nullptr; - Index index = 0; - for (int ix : tv::KernelLoopX(numActIn)) { - numValidPoints = getValidOutPos( - indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), - stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), - validPoints); - for (int i = 0; i < numValidPoints; ++i) { - pointPtr = validPoints + i * (NDim + 1); - auto offset = pointPtr[NDim]; - index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + - spatialVolume * indicesIn(ix, 0); - if (gridsOut[index] > -1) { - auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); - indicePairs(offset, 1, oldNum) = gridsOut[index]; - indicePairs(offset, 0, oldNum) = ix; - } - } - } -} 
- -template -__global__ void resetGridKernel(const Index *indicePairUnique, - tv::TensorView gridsOut, - int numAct) { - for (int ix : tv::KernelLoopX(numAct)) { - gridsOut[indicePairUnique[ix]] = -1; - } -} - -template -__global__ void resetGridSubMKernel( - const Index *indices, tv::TensorView gridsOut, - const tv::SimpleVector outSpatialShape, int numAct) { - int outSpatialShapeReg[NDim]; - for (int i = 0; i < NDim; ++i) { - outSpatialShapeReg[i] = outSpatialShape[i]; - } - Index spatialVolume = 1; - auto indsPtr = indices; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index index; - for (int ix : tv::KernelLoopX(numAct)) { - indsPtr = indices + ix * (NDim + 1); - index = tv::rowArrayIdx(indsPtr + 1, outSpatialShapeReg); - gridsOut[index + spatialVolume * indsPtr[0]] = -1; - } -} - -#endif diff --git a/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh b/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh deleted file mode 100644 index e3ec68b..0000000 --- a/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright 2019 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef REORDERING_CU_H_ -#define REORDERING_CU_H_ -#include - -template -__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features, - const Index *indices, int size, - int numPlanes) { - int ILPStrideX[NumILP]; - Index inds[NumILP]; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) { - if (ix + ILPStrideX[ilp] < size) - inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; - } - for (int iy : tv::KernelLoopY(numPlanes)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - if (ix + ILPStrideX[ilp] < size) - buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = - features[inds[ilp] + iy]; - } - } - } -} - -template -__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features, - const Index *indices, int size, int numPlanes) { - int ILPStrideX[NumILP]; - Index inds[NumILP]; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) { - if (ix + ILPStrideX[ilp] < size) - inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; - } - for (int iy : tv::KernelLoopY(numPlanes)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - if (ix + ILPStrideX[ilp] < size) - reinterpret_cast( - buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] = - reinterpret_cast(features)[inds[ilp] + iy]; - } - } - } -} - -template -__global__ void gatherVecBlockKernel(scalar_t *buffer, const scalar_t *features, - const Index *indices, int size, - int numPlanes) { - int ILPStrideY[NumILP]; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y; - features += blockIdx.x * NumTLP; - buffer += blockIdx.x * NumTLP; - - for (int iy : tv::KernelLoopY(size)) { -#pragma unroll - for (int ilp = 0; ilp < 
NumILP; ++ilp) { - reinterpret_cast( - buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] = - reinterpret_cast( - features)[indices[iy + ILPStrideY[ilp]] * numPlanes + - threadIdx.x]; - } - } -} - -template -__global__ void scatterAddGenericKernel(scalar_t *outFeatures, - const scalar_t *buffer, - const Index *indices, int size, - int numPlanes) { - int ILPStrideX[NumILP]; - Index inds[NumILP]; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) { - if (ix + ILPStrideX[ilp] < size) - inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; - } - for (int iy : tv::KernelLoopY(numPlanes)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - if (ix + ILPStrideX[ilp] < size) { - outFeatures[inds[ilp] + iy] += - buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy]; - } - } - } - } -} - -template -__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures, - const scalar_t *buffer, - const Index *indices, int size, - int numPlanes) { - int ILPStrideY[NumILP]; - constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t); -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y; - outFeatures += blockIdx.x * NumTLP; - buffer += blockIdx.x * NumTLP; - scalar_t buf[vecloadFactor]; - scalar_t buf2[vecloadFactor]; - Index idx; - for (int iy : tv::KernelLoopY(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x; - reinterpret_cast(buf)[0] = - reinterpret_cast(outFeatures)[idx]; - reinterpret_cast(buf2)[0] = reinterpret_cast( - buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x]; -#pragma unroll - for (int i = 0; i < vecloadFactor; i++) { - buf[i] += buf2[i]; - } - reinterpret_cast(outFeatures)[idx] = - reinterpret_cast(buf)[0]; - } - } -} - -#endif diff --git 
a/mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh deleted file mode 100644 index 06caefa..0000000 --- a/mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved -// Modified from -// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu -#ifndef STACK_BALL_QUERY_CUDA_KERNEL_CUH -#define STACK_BALL_QUERY_CUDA_KERNEL_CUH - -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif - -template -__global__ void stack_ball_query_forward_cuda_kernel( - int B, int M, float radius, int nsample, const T *new_xyz, - const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt, - int *idx) { - // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features - // :param xyz_batch_cnt: (batch_size), [N1, N2, ...] - // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query - // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...] 
- // output: - // idx: (M, nsample) - const T *cur_xyz = xyz; - int *cur_idx = idx; - CUDA_1D_KERNEL_LOOP(pt_idx, M) { - int bs_idx = 0; - for (int pt_cnt = 0; bs_idx < B; bs_idx++) { - pt_cnt += new_xyz_batch_cnt[bs_idx]; - if (pt_idx < pt_cnt) break; - } - - int xyz_batch_start_idx = 0; - for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k]; - - const T *new_xyz_p = new_xyz + pt_idx * 3; - cur_xyz += xyz_batch_start_idx * 3; - cur_idx += pt_idx * nsample; - - float radius2 = radius * radius; - T new_x = new_xyz_p[0]; - T new_y = new_xyz_p[1]; - T new_z = new_xyz_p[2]; - int n = xyz_batch_cnt[bs_idx]; - - int cnt = 0; - for (int k = 0; k < n; ++k) { - T x = cur_xyz[k * 3 + 0]; - T y = cur_xyz[k * 3 + 1]; - T z = cur_xyz[k * 3 + 2]; - T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + - (new_z - z) * (new_z - z); - if (d2 < radius2) { - if (cnt == 0) { - for (int l = 0; l < nsample; ++l) { - cur_idx[l] = k; - } - } - cur_idx[cnt] = k; - ++cnt; - if (cnt >= nsample) break; - } - } - if (cnt == 0) cur_idx[0] = -1; - } -} - -#endif // STACK_BALL_QUERY_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh deleted file mode 100644 index 4ef3663..0000000 --- a/mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. 
-// Modified from -// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu -#ifndef STACK_GROUP_POINTS_CUDA_KERNEL_CUH -#define STACK_GROUP_POINTS_CUDA_KERNEL_CUH -#ifdef MMCV_USE_PARROTS -#include "parrots_cuda_helper.hpp" -#else -#include "pytorch_cuda_helper.hpp" -#endif -#include -template -__global__ void stack_group_points_forward_cuda_kernel( - int b, int c, int m, int nsample, const T *features, - const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, - T *out) { - // :param features: (N1 + N2 ..., C) tensor of features to group - // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the - // indices of features to group with :param idx: (M1 + M2 ..., nsample) tensor - // containing the indices of features to group with :param idx_batch_cnt: - // (batch_size) [M1 + M2 ...] tensor containing the indices of features to - // group with :return: - // output: (M1 + M2, C, nsample) tensor - CUDA_1D_KERNEL_LOOP(index, m * c * nsample) { - const T *cur_features = features; - const int *cur_idx = idx; - int sample_idx = index % nsample; - int c_idx = (index / nsample) % c; - int pt_idx = (index / nsample / c); - - if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return; - int bs_idx = 0, pt_cnt = idx_batch_cnt[0]; - for (int k = 1; k < b; k++) { - if (pt_idx < pt_cnt) break; - pt_cnt += idx_batch_cnt[k]; - bs_idx = k; - } - - int features_batch_start_idx = 0; - int features_batch_end_idx = features_batch_cnt[0]; - for (int k = 0; k < bs_idx; k++) { - features_batch_start_idx += features_batch_cnt[k]; - features_batch_end_idx = - features_batch_start_idx + features_batch_cnt[k + 1]; - } - cur_features += features_batch_start_idx * c; - - cur_idx += pt_idx * nsample + sample_idx; - int in_idx = cur_idx[0] * c + c_idx; - int out_idx = pt_idx * c * nsample + c_idx * nsample + sample_idx; - if (in_idx < features_batch_end_idx * c) { - out[out_idx] = cur_features[in_idx]; - } - } -} - 
-template -__global__ void stack_group_points_backward_cuda_kernel( - int b, int c, int m, int n, int nsample, const T *grad_out, const int *idx, - const int *idx_batch_cnt, const int *features_batch_cnt, T *grad_features) { - // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the - // output from forward :param idx: (M1 + M2 ..., nsample) tensor containing - // the indices of features to group with :param idx_batch_cnt: (batch_size) - // [M1 + M2 ...] tensor containing the indices of features to group with - // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the - // indices of features to group with :return: - // grad_features: (N1 + N2 ..., C) gradient of the features - CUDA_1D_KERNEL_LOOP(index, m * c * nsample) { - const T *cur_grad_out = grad_out; - const int *cur_idx = idx; - T *cur_grad_features = grad_features; - int sample_idx = index % nsample; - int c_idx = (index / nsample) % c; - int pt_idx = (index / nsample / c); - - if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return; - - int bs_idx = 0, pt_cnt = idx_batch_cnt[0]; - for (int k = 1; k < b; k++) { - if (pt_idx < pt_cnt) break; - pt_cnt += idx_batch_cnt[k]; - bs_idx = k; - } - - int features_batch_start_idx = 0; - for (int k = 0; k < bs_idx; k++) - features_batch_start_idx += features_batch_cnt[k]; - - cur_grad_out += pt_idx * c * nsample + c_idx * nsample + sample_idx; - cur_idx += pt_idx * nsample + sample_idx; - cur_grad_features += (features_batch_start_idx + cur_idx[0]) * c + c_idx; - - atomicAdd(cur_grad_features, cur_grad_out[0]); - } -} - -#endif // GROUP_POINTS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh index 971b496..43aecb3 100644 --- a/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh @@ -20,17 +20,17 @@ __global__ void 
three_interpolate_forward_cuda_kernel( int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(pt_idx, n) { - if (bs_idx >= b || c_idx >= c) return; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - weight += bs_idx * n * 3 + pt_idx * 3; - points += bs_idx * c * m + c_idx * m; - idx += bs_idx * n * 3 + pt_idx * 3; - out += bs_idx * c * n + c_idx * n; + if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; - out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + - weight[2] * points[idx[2]]; - } + weight += bs_idx * n * 3 + pt_idx * 3; + points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + out += bs_idx * c * n + c_idx * n; + + out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + + weight[2] * points[idx[2]]; } template @@ -44,18 +44,18 @@ __global__ void three_interpolate_backward_cuda_kernel( int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(pt_idx, n) { - if (bs_idx >= b || c_idx >= c) return; - - grad_out += bs_idx * c * n + c_idx * n + pt_idx; - weight += bs_idx * n * 3 + pt_idx * 3; - grad_points += bs_idx * c * m + c_idx * m; - idx += bs_idx * n * 3 + pt_idx * 3; - - atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); - atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); - atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); - } + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; + + grad_out += bs_idx * c * n + c_idx * n + pt_idx; + weight += bs_idx * n * 3 + pt_idx * 3; + grad_points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + + atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); + atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); + atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); } #endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh 
b/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh index 1543412..824da4c 100644 --- a/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh @@ -19,49 +19,48 @@ __global__ void three_nn_forward_cuda_kernel(int b, int n, int m, // idx: (B, N, 3) int bs_idx = blockIdx.y; - CUDA_1D_KERNEL_LOOP(pt_idx, n) { - if (bs_idx >= b) return; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || pt_idx >= n) return; - unknown += bs_idx * n * 3 + pt_idx * 3; - known += bs_idx * m * 3; - dist2 += bs_idx * n * 3 + pt_idx * 3; - idx += bs_idx * n * 3 + pt_idx * 3; + unknown += bs_idx * n * 3 + pt_idx * 3; + known += bs_idx * m * 3; + dist2 += bs_idx * n * 3 + pt_idx * 3; + idx += bs_idx * n * 3 + pt_idx * 3; - T ux = unknown[0]; - T uy = unknown[1]; - T uz = unknown[2]; + T ux = unknown[0]; + T uy = unknown[1]; + T uz = unknown[2]; - double best1 = 1e40, best2 = 1e40, best3 = 1e40; - int besti1 = 0, besti2 = 0, besti3 = 0; - for (int k = 0; k < m; ++k) { - T x = known[k * 3 + 0]; - T y = known[k * 3 + 1]; - T z = known[k * 3 + 2]; - T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); - if (d < best1) { - best3 = best2; - besti3 = besti2; - best2 = best1; - besti2 = besti1; - best1 = d; - besti1 = k; - } else if (d < best2) { - best3 = best2; - besti3 = besti2; - best2 = d; - besti2 = k; - } else if (d < best3) { - best3 = d; - besti3 = k; - } + double best1 = 1e40, best2 = 1e40, best3 = 1e40; + int besti1 = 0, besti2 = 0, besti3 = 0; + for (int k = 0; k < m; ++k) { + T x = known[k * 3 + 0]; + T y = known[k * 3 + 1]; + T z = known[k * 3 + 2]; + T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); + if (d < best1) { + best3 = best2; + besti3 = besti2; + best2 = best1; + besti2 = besti1; + best1 = d; + besti1 = k; + } else if (d < best2) { + best3 = best2; + besti3 = besti2; + best2 = d; + besti2 = k; + } else if (d < best3) { + best3 = d; + besti3 = k; } - dist2[0] = 
best1; - dist2[1] = best2; - dist2[2] = best3; - idx[0] = besti1; - idx[1] = besti2; - idx[2] = besti3; } + dist2[0] = best1; + dist2[1] = best2; + dist2[2] = best3; + idx[0] = besti1; + idx[1] = besti2; + idx[2] = besti3; } #endif // THREE_NN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh index 021b488..62e118b 100644 --- a/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh @@ -23,20 +23,20 @@ __global__ void dynamic_voxelize_kernel( // To save some computation auto points_offset = points + index * num_features; auto coors_offset = coors + index * NDim; - int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x); + int c_x = floor((points_offset[0] - coors_x_min) / voxel_x); if (c_x < 0 || c_x >= grid_x) { coors_offset[0] = -1; continue; } - int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y); + int c_y = floor((points_offset[1] - coors_y_min) / voxel_y); if (c_y < 0 || c_y >= grid_y) { coors_offset[0] = -1; coors_offset[1] = -1; continue; } - int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z); + int c_z = floor((points_offset[2] - coors_z_min) / voxel_z); if (c_z < 0 || c_z >= grid_z) { coors_offset[0] = -1; coors_offset[1] = -1; @@ -101,7 +101,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor, CUDA_1D_KERNEL_LOOP(index, num_points) { auto coor_offset = coor + index * NDim; // skip invalid points - if (coor_offset[0] == -1) continue; + if ((index >= num_points) || (coor_offset[0] == -1)) return; int num = 0; int coor_x = coor_offset[0]; @@ -122,7 +122,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor, point_to_pointidx[index] = i; } else if (num >= max_points) { // out of boundary - break; + return; } } } @@ -166,51 +166,4 @@ __global__ void determin_voxel_num( } } -__global__ void nondeterministic_get_assign_pos( - const int nthreads, const int32_t* 
coors_map, int32_t* pts_id, - int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) { - CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { - int coors_idx = coors_map[thread_idx]; - if (coors_idx > -1) { - int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1); - pts_id[thread_idx] = coors_pts_pos; - if (coors_pts_pos == 0) { - coors_order[coors_idx] = atomicAdd(coors_count, 1); - } - } - } -} - -template -__global__ void nondeterministic_assign_point_voxel( - const int nthreads, const T* points, const int32_t* coors_map, - const int32_t* pts_id, const int32_t* coors_in, const int32_t* reduce_count, - const int32_t* coors_order, T* voxels, int32_t* coors, int32_t* pts_count, - const int max_voxels, const int max_points, const int num_features, - const int NDim) { - CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { - int coors_idx = coors_map[thread_idx]; - int coors_pts_pos = pts_id[thread_idx]; - if (coors_idx > -1 && coors_pts_pos < max_points) { - int coors_pos = coors_order[coors_idx]; - if (coors_pos < max_voxels) { - auto voxels_offset = - voxels + (coors_pos * max_points + coors_pts_pos) * num_features; - auto points_offset = points + thread_idx * num_features; - for (int k = 0; k < num_features; k++) { - voxels_offset[k] = points_offset[k]; - } - if (coors_pts_pos == 0) { - pts_count[coors_pos] = min(reduce_count[coors_idx], max_points); - auto coors_offset = coors + coors_pos * NDim; - auto coors_in_offset = coors_in + coors_idx * NDim; - for (int k = 0; k < NDim; k++) { - coors_offset[k] = coors_in_offset[k]; - } - } - } - } - } -} - #endif // VOXELIZATION_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu deleted file mode 100644 index 0f273d2..0000000 --- a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu +++ /dev/null @@ -1,322 +0,0 @@ -/************************************************************************* - * Copyright (C) 2021 Cambricon. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include - -#include "common_mlu_helper.hpp" - -#define COORD_NUM 4 - -__nram__ char nmem_buf[MAX_NRAM_SIZE]; - -template -__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, void *nram_src1, - void *nram_addition, const int32_t deal_num) { - __bang_active_reciphp((T *)nram_dst, (T *)nram_src1, deal_num); - __bang_mul((T *)nram_dst, (T *)nram_src0, (T *)nram_dst, deal_num); -} - -template <> -__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, - void *nram_src1, void *nram_addition, - const int32_t deal_num) { - __bang_half2float((float *)nram_addition, (half *)nram_src1, deal_num); - __bang_active_reciphp((float *)nram_addition, (float *)nram_addition, - deal_num); - __bang_float2half_rd((half *)nram_src1, (float *)nram_addition, deal_num); - __bang_mul((half *)nram_dst, (half *)nram_src0, (half *)nram_src1, deal_num); -} - -template -__mlu_func__ void bboxOverlapsWorkflow( - T *vec_b1_x1, T *vec_b1_y1, T *vec_b1_x2, T *vec_b1_y2, T *vec_b2_x1, - T *vec_b2_y1, T *vec_b2_x2, T *vec_b2_y2, T *vec_left, T *vec_right, - T *vec_top, T *vec_bottom, const T *bbox1, const T *bbox2, void *ious, - const int32_t offset, const int32_t mode, const int32_t batches_stride, - const int32_t num_bbox1, const int32_t num_bbox2, const bool aligned) { - int32_t task_batch_stride = (num_bbox1 + taskDim - 1) / taskDim; - int32_t batch_start = taskId * task_batch_stride; - int32_t batch_per_task = 
batch_start + task_batch_stride < num_bbox1 - ? task_batch_stride - : num_bbox1 - batch_start; - batch_per_task = batch_per_task > 0 ? batch_per_task : (0); - - if (aligned) { - int32_t num_loop_cpy = batch_per_task / batches_stride; - int32_t num_rem_cpy_batches = batch_per_task % batches_stride; - num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy; - for (int32_t i = 0; i < num_loop_cpy; i++) { - int32_t index = batch_start + i * batches_stride; - int32_t handle_batches = index + batches_stride > num_bbox1 - ? num_rem_cpy_batches - : batches_stride; - int32_t b1 = index; - int32_t b2 = index; - - int32_t base1 = b1 * COORD_NUM; - __memcpy(vec_b1_x1, &bbox1[base1], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - __memcpy(vec_b1_y1, &bbox1[base1 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - __memcpy(vec_b1_x2, &bbox1[base1 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - __memcpy(vec_b1_y2, &bbox1[base1 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - - int32_t base2 = b2 * COORD_NUM; - __memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - __memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - __memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - __memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - // get the width and height - __bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride); - __bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride); - __bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride); - __bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride); - - // right - left + offset ---> left - 
__bang_sub(vec_left, vec_right, vec_left, batches_stride); - __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride); - - // bottom - top + offset ---> right - __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); - __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride); - - // zero vector ---> bottom - __bang_write_value(vec_bottom, batches_stride, 0.f); - - // width --> vec_left - __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); - T *width = vec_left; - // height --> vec_right - __bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride); - T *height = vec_right; - - // get the b1_area - // (b1_x2 - b1_x1 + offset) ---> vec_top - __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); - __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride); - - // (b1_y2 - b1_y1 + offset) ---> vec_bottom - __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); - __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride); - - // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) - // ---> vec_top; - __bang_mul(vec_top, vec_top, vec_bottom, batches_stride); - T *b1_area = vec_top; - - // get the b2_area - // (b2_x2 - b2_x1 + offset) ---> b2_x1 - __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); - __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); - - // (b2_y2 - b2_y1 + offset) ---> b2_y1 - __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); - __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); - - // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) - // ---> b2_x1; - __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); - T *b2_area = vec_b2_x1; - - // inter_s = width * height - __bang_mul(height, width, height, batches_stride); - T *inter_s = height; - - // offset vector ---> vec_b2_y1 - __bang_write_value(vec_b2_y1, batches_stride, T(offset)); - T *vec_offset = vec_b2_y1; - - if (mode == 0) { - __bang_add(b1_area, 
b1_area, b2_area, batches_stride); - __bang_sub(b1_area, b1_area, inter_s, batches_stride); - __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); - } else { - __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); - } - T *base_s = b1_area; - - // ious = inter_s / base_s - computeDiv(width, inter_s, base_s, vec_b2_x2, batches_stride); - __memcpy((T *)ious + index, width, handle_batches * sizeof(T), - NRAM2GDRAM); - } - } else { - int32_t num_loop_cpy = num_bbox2 / batches_stride; - int32_t num_rem_cpy_batches = num_bbox2 % batches_stride; - num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy; - for (int32_t i = 0; i < batch_per_task; i++) { - int32_t index1 = batch_start + i; - int32_t b1 = index1; - int32_t base1 = b1 * COORD_NUM; - - // set bbox1 and bbox2 to nram - __bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]); - __bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]); - __bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]); - __bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]); - - for (int32_t j = 0; j < num_loop_cpy; j++) { - int32_t index2 = j * batches_stride; - int32_t handle_batches = index2 + batches_stride > num_bbox2 - ? 
num_rem_cpy_batches - : batches_stride; - int32_t b2 = index2; - int32_t base2 = b2 * COORD_NUM; - - // copy bbox2 to nram - __memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - __memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - __memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - __memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), - COORD_NUM * sizeof(T), handle_batches - 1); - - // get the width and height - __bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride); - __bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride); - __bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride); - __bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride); - - // right - left + offset ---> left - __bang_sub(vec_left, vec_right, vec_left, batches_stride); - __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride); - // bottom - top + offset ---> right - __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); - __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride); - - // zero vector ---> bottom - __bang_write_value(vec_bottom, batches_stride, (T)0); - - // width --> vec_left - __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); - T *width = vec_left; - // height --> vec_right - __bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride); - T *height = vec_right; - - // get the b1_area - // (b1_x2 - b1_x1 + offset) ---> vec_top - __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); - __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride); - // (b1_y2 - b1_y1 + offset) ---> vec_bottom - __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); - __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride); - // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + 
offset) - // ---> vec_top; - __bang_mul(vec_top, vec_top, vec_bottom, batches_stride); - T *b1_area = vec_top; - - // get the b2_area - // (b2_x2 - b2_x1 + offset) ---> b2_x1 - __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); - __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); - // (b2_y2 - b2_y1 + offset) ---> b2_y1 - __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); - __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); - // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) - // ---> b2_x1; - __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); - T *b2_area = vec_b2_x1; - - // inter_s = width * height - __bang_mul(height, width, height, batches_stride); - T *inter_s = height; - - // offset vector ---> vec_b2_y1 - __bang_write_value(vec_b2_y1, batches_stride, T(offset)); - T *vec_offset = vec_b2_y1; - - if (mode == 0) { - __bang_add(b1_area, b1_area, b2_area, batches_stride); - __bang_sub(b1_area, b1_area, inter_s, batches_stride); - __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); - } else { - __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); - } - T *base_s = b1_area; - - // ious = inter_s / base_s - computeDiv(width, inter_s, base_s, vec_b2_x2, batches_stride); - int32_t gdram_offset = index1 * num_bbox2 + index2; - __memcpy((T *)ious + gdram_offset, width, handle_batches * sizeof(T), - NRAM2GDRAM); - } - } - } -} - -template -__mlu_global__ void MLUUnion1KernelBBoxOverlaps( - const void *bbox1, const void *bbox2, void *ious, const int32_t num_bbox1, - const int32_t num_bbox2, const int32_t mode, const bool aligned, - const int32_t offset) { - /* - * NRAM partition - * |-------------------------------------------------------------| - * | vec_b1_x1 | vec_b1_y1 | vec_b1_x2 | vec_b1_y2 | - * |-------------------------------------------------------------| - * | vec_b2_x1 | vec_b2_y1 | vec_b2_x2 | vec_b2_y2 | - * 
|-------------------------------------------------------------| - * | vec_left | vec_right | vec_top | vec_bottom | - * |-------------------------------------------------------------| - * - */ - const int32_t align_bytes = PAD_DOWN(MAX_NRAM_SIZE, NFU_ALIGN_SIZE); - const int32_t split_nram_num = 12; - const int32_t nram_stride = - align_bytes / NFU_ALIGN_SIZE / split_nram_num * NFU_ALIGN_SIZE; - - void *vec_b1_x1 = nmem_buf; - void *vec_b1_y1 = nmem_buf + nram_stride; - void *vec_b1_x2 = nmem_buf + 2 * nram_stride; - void *vec_b1_y2 = nmem_buf + 3 * nram_stride; - - void *vec_b2_x1 = nmem_buf + 4 * nram_stride; - void *vec_b2_y1 = nmem_buf + 5 * nram_stride; - void *vec_b2_x2 = nmem_buf + 6 * nram_stride; - void *vec_b2_y2 = nmem_buf + 7 * nram_stride; - - void *vec_left = nmem_buf + 8 * nram_stride; - void *vec_right = nmem_buf + 9 * nram_stride; - void *vec_top = nmem_buf + 10 * nram_stride; - void *vec_bottom = nmem_buf + 11 * nram_stride; - - const int32_t vec_length = nram_stride / sizeof(T); - bboxOverlapsWorkflow((T *)vec_b1_x1, (T *)vec_b1_y1, (T *)vec_b1_x2, - (T *)vec_b1_y2, (T *)vec_b2_x1, (T *)vec_b2_y1, - (T *)vec_b2_x2, (T *)vec_b2_y2, (T *)vec_left, - (T *)vec_right, (T *)vec_top, (T *)vec_bottom, - (T *)bbox1, (T *)bbox2, (T *)ious, offset, mode, - vec_length, num_bbox1, num_bbox2, aligned); -} - -void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, const cnrtDataType_t d_type, - const void *bbox1, const void *bbox2, void *ious, - const int32_t num_bbox1, const int32_t num_bbox2, - const int32_t mode, const bool aligned, - const int32_t offset) { - if (d_type == CNRT_FLOAT16) { - MLUUnion1KernelBBoxOverlaps<<>>( - bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset); - } else { - MLUUnion1KernelBBoxOverlaps<<>>( - bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset); - } -} diff --git a/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu deleted 
file mode 100644 index 8dd6a8e..0000000 --- a/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu +++ /dev/null @@ -1,552 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "carafe_utils.hpp" -#include "common_mlu_helper.hpp" - -#define INDEX3(n, h, w, c, strN, strH, strW) \ - (strN) * (n) + (strH) * (h) + (strW) * (w) + (c) - -#define NRAM_BLOCK PAD_DOWN(MAX_NRAM_SIZE / 5, NRAM_ALIGN_SIZE) - -__nram__ char nram_buf[MAX_NRAM_SIZE]; - -namespace forward { -struct BlockId { - int Ho; - int Wo; - int G; - int Cg; - int Kh; - int Kw; - int Hi; - int Wi; -}; - -// start indices of block -struct BlockStart { - int Ho; - int Wo; - int G; - int Cg; - int Kh; - int Kw; - int Hi; - int Wi; - int C; -}; - -struct BlockEnd { - int Ho; - int Wo; - int Kh; - int Kw; - int Hi; - int Wi; -}; - -struct BlockSize { - int Ho; - int Wo; - int G; - int Cg; - int Kh; - int Kw; - int Hi; - int Wi; -}; - -template -__mlu_func__ void carafeForwardBLOCK(T *input, T *mask, - const CarafeForwardParam param, - const CarafeForwardBlockDim block_dim, - const CarafeForwardGridDim grid_dim, - T *output) { - // data block info - BlockId blkId; - BlockStart blkStart; - BlockEnd blkEnd; - BlockSize blkSize; - - // set pointers on NRAM arrays - - // input_nram[blkDim_(Hi+Kh)-1, blkDim_(Wi+Kw)-1, blkDim_(G*Cg)] - T *input_nram = (T *)nram_buf; - - // mask_nram[blkDim_Ho, 
blkDim_Wo, blkDim_(G*Kh*Kw)] - T *mask_nram = input_nram + param.input_nram_size; - - // output_nram[blkDim_Ho, blkDim_Wo, blkDim_(G*Cg)] - T *output_nram = mask_nram + param.mask_nram_size; - - // sum_array[blkDim_(G*Cg)] - T *sum_array = output_nram + param.output_nram_size; - - /* ===== loop over N, grid_dim(Ho,Wo,G,Cg) - * iterations are distributed over computing cores - */ - for (int loop_index = taskId; loop_index < param.job_num; - loop_index += taskDim) { - // block idx - blkId.Cg = loop_index; - blkId.G = blkId.Cg / grid_dim.Cg; - blkId.Wo = blkId.G / grid_dim.G; - blkId.Ho = blkId.Wo / grid_dim.Wo; - int sample_idx = blkId.Ho / grid_dim.Ho; - - blkId.Cg %= grid_dim.Cg; - blkId.G %= grid_dim.G; - blkId.Wo %= grid_dim.Wo; - blkId.Ho %= grid_dim.Ho; - - // block starting indices - blkStart.Ho = blkId.Ho * block_dim.Ho; - blkStart.Wo = blkId.Wo * block_dim.Wo; - blkStart.G = blkId.G * block_dim.G; - blkStart.Cg = blkId.Cg * block_dim.Cg; - blkStart.C = blkStart.G * param.Cg + blkStart.Cg; - - // block size - blkSize.Ho = block_dim.Ho; - blkSize.Wo = block_dim.Wo; - blkSize.G = block_dim.G; - blkSize.Cg = block_dim.Cg; - - // take care of blocks near the end of each dimension - if (blkId.Ho == (grid_dim.Ho - 1)) { - blkSize.Ho = param.Ho - (grid_dim.Ho - 1) * block_dim.Ho; - } - if (blkId.Wo == (grid_dim.Wo - 1)) { - blkSize.Wo = param.Wo - (grid_dim.Wo - 1) * block_dim.Wo; - } - if (blkId.G == (grid_dim.G - 1)) { - blkSize.G = param.group_size - (grid_dim.G - 1) * block_dim.G; - } - if (blkId.Cg == (grid_dim.Cg - 1)) { - blkSize.Cg = param.Cg - (grid_dim.Cg - 1) * block_dim.Cg; - } - - // block end indices - blkEnd.Ho = blkStart.Ho + blkSize.Ho - 1; - blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1; - - // set output_nram to zero - __bang_write_value(output_nram, param.output_nram_size, T(0)); - - // loop blocks of kernel window: grid_dim.(Kh, Kw) - for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) { - blkStart.Kh = blkId.Kh * block_dim.Kh; - blkSize.Kh = 
block_dim.Kh; - if (blkId.Kh == (grid_dim.Kh - 1)) { - blkSize.Kh = param.kernel_size - (grid_dim.Kh - 1) * block_dim.Kh; - } - blkEnd.Kh = blkStart.Kh + blkSize.Kh - 1; - - blkStart.Hi = blkStart.Ho / param.scale_factor - param.kernel_size_half + - blkStart.Kh; - blkEnd.Hi = - blkEnd.Ho / param.scale_factor - param.kernel_size_half + blkEnd.Kh; - blkSize.Hi = blkEnd.Hi - blkStart.Hi + 1; - - for (blkId.Kw = 0; blkId.Kw < grid_dim.Kw; ++blkId.Kw) { - blkStart.Kw = blkId.Kw * block_dim.Kw; - blkSize.Kw = block_dim.Kw; - if (blkId.Kw == (grid_dim.Kw - 1)) { - blkSize.Kw = param.kernel_size - (grid_dim.Kw - 1) * block_dim.Kw; - } - blkEnd.Kw = blkStart.Kw + blkSize.Kw - 1; - - blkStart.Wi = blkStart.Wo / param.scale_factor - - param.kernel_size_half + blkStart.Kw; - blkEnd.Wi = - blkEnd.Wo / param.scale_factor - param.kernel_size_half + blkEnd.Kw; - blkSize.Wi = blkEnd.Wi - blkStart.Wi + 1; - - // load input block from gdram2nram - // - // input_nram[ | input[ sample_idx, - // 0:blkSize.Hi-1, | blkStart.Hi + 0:blkSize.Hi-1, - // 0:blkSize.Wi-1, | blkStart.Wi + 0:blkSize.Wi-1, - // 0:blkSize.G-1 | blkStart.G + 0:blkSize.G-1 - // 0:blkSize.Cg-1] | blkStart.Cg + 0:blkSize.Cg-1] - // - // To skip out of bound indices: - // - // input_nram[ - // hi_start_local:hi_end_local, - // wi_start_local:wi_end_local, ...] - // = input[n, - // hi_start_global:hi_end_global, - // wi_start_global:wi_end_global, ...] 
- // - int hi_start_local = 0; - int hi_start_global = blkStart.Hi; - if (blkStart.Hi < 0) { - hi_start_local = -blkStart.Hi; - hi_start_global = 0; - } - int wi_start_local = 0; - int wi_start_global = blkStart.Wi; - if (blkStart.Wi < 0) { - wi_start_local = -blkStart.Wi; - wi_start_global = 0; - } - int hi_end_local = blkSize.Hi - 1; - int hi_end_global = blkEnd.Hi; - if (blkEnd.Hi > param.Hi - 1) { - hi_end_global = param.Hi - 1; - hi_end_local -= blkEnd.Hi - hi_end_global; - } - int wi_end_local = blkSize.Wi - 1; - int wi_end_global = blkEnd.Wi; - if (blkEnd.Wi > param.Wi - 1) { - wi_end_global = param.Wi - 1; - wi_end_local -= blkEnd.Wi - wi_end_global; - } - - int dst_offset = param.input_nram_stride_h * hi_start_local + - param.input_nram_stride_w * wi_start_local; - T *dst = input_nram + dst_offset; - - int src_offset = INDEX3(sample_idx, hi_start_global, wi_start_global, - blkStart.C, param.input_stride_n, - param.input_stride_h, param.input_stride_w); - T *src = input + src_offset; - - int input_seg_num_h = hi_end_local - hi_start_local + 1; - int input_seg_num_w = wi_end_local - wi_start_local + 1; - for (int i = 0; i < input_seg_num_h; ++i) { - loadStr3D(dst, src, blkSize.Cg, blkSize.G, input_seg_num_w, - param.input_nram_stride_g, param.input_nram_stride_w, - param.input_stride_g, param.input_stride_w); - dst += param.input_nram_stride_h; - src += param.input_stride_h; - } - - /* load mask block from gdram2nram - * - * mask_nram[ | mask[sample_idx, - * 0:blkSize.Ho-1 , | blkStart.Ho + 0:blkSize.Ho-1, - * 0:blkSize.Wo-1, | blkStart.Wo + 0:blkSize.Wo-1, - * 0:blkSize.G-1, | blkStart.G + 0:blkSize.G-1, - * 0:blkSize.Kh-1, | blkStart.Kh + 0:blkSize.Kh-1, - * 0:blkSize.Kw-1] | blkStart.Kw + 0:blkSize.Kw-1] - */ - src_offset = INDEX3(blkStart.Wo, blkStart.G, blkStart.Kh, blkStart.Kw, - param.mask_stride_w, param.mask_stride_g, - param.mask_stride_kh); - src_offset += sample_idx * param.mask_stride_n + - blkStart.Ho * param.mask_stride_h; - - for (int ho = 0; 
ho < blkSize.Ho; ++ho) { - dst = mask_nram + ho * param.mask_nram_stride_h; - src = mask + src_offset + ho * param.mask_stride_h; - - for (int wo = 0; wo < blkSize.Wo; ++wo) { - loadStr3D(dst, src, blkSize.Kw, blkSize.Kh, blkSize.G, - param.mask_nram_stride_kh, param.mask_nram_stride_g, - param.mask_stride_kh, param.mask_stride_g); - dst += param.mask_nram_stride_w; - src += param.mask_stride_w; - } - } - - // loop each pixel of the output block - for (int ho = 0; ho < blkSize.Ho; ++ho) { - int kernel_hi_start_global = (blkStart.Ho + ho) / param.scale_factor - - param.kernel_size_half + blkStart.Kh; - int kernel_hi_start_local = kernel_hi_start_global - blkStart.Hi; - - // int kernel_hi_end_global = kernel_hi_start_global + blkSize.Kh - 1; - // int kernel_hi_end_local = kernel_hi_end_global - blkStart.Hi; - - // exclude out of bound indices which should be ignored - int kh_min = hi_start_local - kernel_hi_start_local > 0 - ? hi_start_local - kernel_hi_start_local - : 0; - int kh_max = hi_end_local - kernel_hi_start_local < blkSize.Kh - 1 - ? hi_end_local - kernel_hi_start_local - : blkSize.Kh - 1; - - for (int wo = 0; wo < blkSize.Wo; ++wo) { - int kernel_wi_start_global = - (blkStart.Wo + wo) / param.scale_factor - - param.kernel_size_half + blkStart.Kw; - int kernel_wi_start_local = kernel_wi_start_global - blkStart.Wi; - - // exclude out of bound indices wwich should be ignored - int kw_min = wi_start_local - kernel_wi_start_local > 0 - ? wi_start_local - kernel_wi_start_local - : 0; - int kw_max = wi_end_local - kernel_wi_start_local < blkSize.Kw - 1 - ? 
wi_end_local - kernel_wi_start_local - : blkSize.Kw - 1; - - // output_nram[ho, wo, g, c] = sum(mask_nram[ho, wo, g, kh, kw] - // * input_nram[hi+kh, wi+kw, g, c], - // for (kh,kw) in [0:blkSize.Kw-1] x [0:blkSize.Kh-1]) - // - // sum(mask_nram[ho, wo, g, kh, kw] - // * input_nram[hi+kh, wi+kw, g, c], (kh,kw)) - // - T *mask_array = mask_nram + param.mask_nram_stride_h * ho + - param.mask_nram_stride_w * wo; - - for (int kh = kh_min; kh <= kh_max; ++kh) { - for (int kw = kw_min; kw <= kw_max; ++kw) { - T *src = - input_nram + - param.input_nram_stride_h * (kernel_hi_start_local + kh) + - param.input_nram_stride_w * (kernel_wi_start_local + kw); - - int mask_index = param.mask_nram_stride_kh * kh + kw; - - // mlutiply mask weight with channels for each channel group - T *sum = sum_array; - - for (int g = 0; g < blkSize.G; ++g) { - __bang_mul_scalar(sum, src, mask_array[mask_index], - param.block_Cg_NFU); - // - // NOTE: Since block_Cg_NFU >= block_Cg_stride, - // overlapped writing may occur on sum_array. - // So this loop must be executed in order to - // avoid data contamination, as shown below. - // - // |-----block_Cg_NFU---------| - // xxxxxxxxxxxxxxxxxxxxyyyzzzzz------------ - // |---block_Cg_stride---|^^^^^will be overwritten - // in the next iteration. 
- // - // x: actual data used, y: not used, z: overwritten - // - sum += param.input_nram_stride_g; - src += param.input_nram_stride_g; - mask_index += param.mask_nram_stride_g; - } // loop blk_G - - // add array[blk_G * blk_C] to output_nram - dst = output_nram + param.output_nram_stride_h * ho + - param.output_nram_stride_w * wo; - - __bang_add(dst, dst, sum_array, param.output_nram_stride_w); - } // end loop blk_Kw - } // end loop blk_Kh - } // end loop blk_Wo - } // end loop blk_Ho - } // end loop grid_dim.Kw - } // end loop grid_dim.Kh - - /* write output from nram2gdram - * - * output_nram[ | output[sample_idx, - * 0:blkSize.Ho-1, | blkStart.Ho + 0:blkSize.Ho-1, - * 0:blkSize.Wo-1, | blkStart.Wo + 0:blkSize.Wo-1, - * 0:blkSize.G-1, | blkStart.G + 0:blkSize.G-1, - * 0:blkSize.Cg-1] | blkStart.Cg + 0:blkSize.Cg-1] - */ - int dst_offset = INDEX3(sample_idx, blkStart.Ho, blkStart.Wo, blkStart.C, - param.output_stride_n, param.output_stride_h, - param.output_stride_w); - T *dst = output + dst_offset; - T *src = output_nram; - for (int i = 0; i < blkSize.Ho; ++i) { - storeStr3D(dst, src, blkSize.Cg, blkSize.G, blkSize.Wo, - param.output_stride_g, param.output_stride_w, - param.output_nram_stride_g, param.output_nram_stride_w); - dst += param.output_stride_h; - src += param.output_nram_stride_h; - } - } // end loop N, grid_dim.(Hi,Wi,G,Cg) -} - -template -__mlu_global__ void MLUBLOCKKernelCarafeForward( - const void *input, const void *mask, const CarafeForwardParam param, - const CarafeForwardBlockDim block_dim, const CarafeForwardGridDim grid_dim, - void *output) { - carafeForwardBLOCK((T *)input, (T *)mask, param, block_dim, grid_dim, - (T *)output); -} -} // namespace forward - -namespace backward { -template -__mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output, - T *grad_input, T *grad_mask, const int n, - const int hi, const int wi, const int c, - const int k_up, const int group, - const int scale) { - char *input_buff = nram_buf; - char 
*mask_buff = input_buff + NRAM_BLOCK; - char *grad_input_buff = mask_buff + NRAM_BLOCK; - char *grad_output_buff = grad_input_buff + NRAM_BLOCK; - char *grad_mask_buff = grad_output_buff + NRAM_BLOCK; - - int wo = wi * scale; - int ho = hi * scale; - int out_num = n * ho * wo * group; - int group_size = c / group; - int repeat = out_num / taskDim + (int)(taskId < out_num % taskDim); - int num_align = PAD_DOWN(NRAM_BLOCK / sizeof(T), NFU_ALIGN_SIZE / sizeof(T)); - int num_per_loop = group_size / num_align; - int rem_for_loop = group_size % num_align; - int rem_for_loop_align = PAD_UP(rem_for_loop, NFU_ALIGN_SIZE / sizeof(T)); - for (int k = 0; k < repeat; k++) { - int iter = k * taskDim + taskId; - int group_k = iter % group; - int w_k = (iter / group) % wo; - int h_k = (iter / wo / group) % ho; - int n_k = (iter / ho / wo / group) % n; - int h_i = h_k / scale; - int w_i = w_k / scale; - int start_h = h_i - ((k_up - 1) / 2); - int end_h = h_i + ((k_up - 1) / 2) + 1; - int start_w = w_i - ((k_up - 1) / 2); - int end_w = w_i + ((k_up - 1) / 2) + 1; - T *base_mask = (T *)mask + n_k * ho * wo * group * k_up * k_up + - h_k * wo * group * k_up * k_up + w_k * group * k_up * k_up + - group_k * k_up * k_up; - T *base_grad_mask = (T *)grad_mask + n_k * ho * wo * group * k_up * k_up + - h_k * wo * group * k_up * k_up + - w_k * group * k_up * k_up + group_k * k_up * k_up; - - __bang_write_zero((T *)grad_input_buff, NRAM_BLOCK / sizeof(T)); - __bang_write_zero((T *)grad_mask_buff, NRAM_BLOCK / sizeof(T)); - __bang_write_zero((T *)grad_output_buff, NRAM_BLOCK / sizeof(T)); - - __memcpy((T *)mask_buff, (T *)base_mask, k_up * k_up * sizeof(T), - GDRAM2NRAM); - for (int i = 0; i < num_per_loop; i++) { - __bang_write_zero((T *)input_buff, NRAM_BLOCK / sizeof(T)); - T *base_grad_output = (T *)grad_output + n_k * ho * wo * c + - h_k * wo * c + w_k * c + group_k * group_size + - i * num_align; - __memcpy((T *)grad_output_buff, (T *)base_grad_output, - num_align * sizeof(T), GDRAM2NRAM); 
- for (int ih = start_h; ih < end_h; ih++) { - for (int iw = start_w; iw < end_w; iw++) { - if (ih < 0 || ih > hi - 1 || iw < 0 || iw > wi - 1) { - continue; - } - int mask_ih = ih - h_i + (k_up - 1) / 2; - int mask_iw = iw - w_i + (k_up - 1) / 2; - int mask_index = mask_ih * k_up + mask_iw; - int input_index = n_k * hi * wi * c + ih * wi * c + iw * c + - group_k * group_size + i * num_align; - T *base_input = (T *)input + input_index; - T *base_grad_input = (T *)grad_input + input_index; - __memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T), - GDRAM2NRAM); - __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff, - ((T *)mask_buff)[mask_index], num_align); - __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input, - (T *)grad_input_buff, num_align); - __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff, - num_align); - - __bang_sumpool((T *)input_buff, (T *)input_buff, - NFU_ALIGN_SIZE / sizeof(T), - num_align / (NFU_ALIGN_SIZE / sizeof(T)), 1, - num_align / (NFU_ALIGN_SIZE / sizeof(T)), 1, 1, 1); - - __bang_reduce_sum((T *)input_buff, (T *)input_buff, - NFU_ALIGN_SIZE / sizeof(T)); - ((T *)grad_mask_buff)[mask_index] += ((T *)input_buff)[0]; - } - } - } - if (rem_for_loop) { - __bang_write_zero((T *)input_buff, NRAM_BLOCK / sizeof(T)); - T *base_grad_output = (T *)grad_output + n_k * ho * wo * c + - h_k * wo * c + w_k * c + group_k * group_size + - num_per_loop * num_align; - __memcpy((T *)grad_output_buff, (T *)base_grad_output, - rem_for_loop * sizeof(T), GDRAM2NRAM); - for (int ih = start_h; ih < end_h; ih++) { - for (int iw = start_w; iw < end_w; iw++) { - if (ih < 0 || ih > hi - 1 || iw < 0 || iw > wi - 1) { - continue; - } - int mask_ih = ih - h_i + (k_up - 1) / 2; - int mask_iw = iw - w_i + (k_up - 1) / 2; - int mask_index = mask_ih * k_up + mask_iw; - int input_index = n_k * hi * wi * c + ih * wi * c + iw * c + - group_k * group_size + num_per_loop * num_align; - T *base_input = (T *)input + input_index; - T 
*base_grad_input = (T *)grad_input + input_index; - __memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T), - GDRAM2NRAM); - __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff, - ((T *)mask_buff)[mask_index], rem_for_loop_align); - __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input, - (T *)grad_input_buff, rem_for_loop); - __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff, - rem_for_loop_align); - - __bang_sumpool( - (T *)input_buff, (T *)input_buff, NFU_ALIGN_SIZE / sizeof(T), - rem_for_loop_align / (NFU_ALIGN_SIZE / sizeof(T)), 1, - rem_for_loop_align / (NFU_ALIGN_SIZE / sizeof(T)), 1, 1, 1); - __bang_reduce_sum((T *)input_buff, (T *)input_buff, - NFU_ALIGN_SIZE / sizeof(T)); - - ((T *)grad_mask_buff)[mask_index] += ((T *)input_buff)[0]; - } - } - } - __memcpy((T *)base_grad_mask, (T *)grad_mask_buff, k_up * k_up * sizeof(T), - NRAM2GDRAM); - } -} - -template -__mlu_global__ void MLUUnion1KernelCarafeBackward( - const void *input, const void *mask, const void *grad_output, - void *grad_input, void *grad_mask, const int n, const int hi, const int wi, - const int c, const int k_up, const int group, const int scale) { - CarafeCompute((T *)input, (T *)mask, (T *)grad_output, (T *)grad_input, - (T *)grad_mask, n, hi, wi, c, k_up, group, scale); -} -} // namespace backward - -void KernelCarafeForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, const cnrtDataType_t d_type, - const void *input, const void *mask, - const CarafeForwardParam ¶m, - const CarafeForwardBlockDim &block_dim, - const CarafeForwardGridDim &grid_dim, void *output) { - if (d_type == CNRT_FLOAT16) { - forward::MLUBLOCKKernelCarafeForward<<>>( - input, mask, param, block_dim, grid_dim, output); - } else { - forward::MLUBLOCKKernelCarafeForward<<>>( - input, mask, param, block_dim, grid_dim, output); - } -} - -void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, cnrtDataType_t dtype, - const 
void *input, const void *mask, - const void *grad_output, void *grad_input, - void *grad_mask, const int n, const int hi, - const int wi, const int c, const int k_up, - const int group, const int scale) { - if (dtype == CNRT_FLOAT16) { - backward::MLUUnion1KernelCarafeBackward<<>>( - input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up, - group, scale); - } else { - backward::MLUUnion1KernelCarafeBackward<<>>( - input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up, - group, scale); - } -} diff --git a/mmcv/ops/csrc/common/mlu/carafe_utils.hpp b/mmcv/ops/csrc/common/mlu/carafe_utils.hpp deleted file mode 100644 index 09ca60a..0000000 --- a/mmcv/ops/csrc/common/mlu/carafe_utils.hpp +++ /dev/null @@ -1,95 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef CARAFE_UTILS_HPP_ -#define CARAFE_UTILS_HPP_ - -#define NRAM_ALIGN_SIZE 64 - -struct CarafeForwardParam { - int N; // batch size - int Hi; // input height - int Wi; // input width - int Ci; // input channels - int Ho; // output height - int Wo; // output width - int Cg; // channels per group - - int kernel_size; // kernel_size - int group_size; // group_size - int scale_factor; // scale_factor - int kernel_size_half; // kernel half size (K-1)/2 - int kernel_size_sq; // square of kernel size - - int dtype_size; // size of tensor data type - - // Host arrays' geometry - int input_stride_g; - int input_stride_w; - int input_stride_h; - int input_stride_n; - int input_size; - int mask_stride_kh; - int mask_stride_g; - int mask_stride_w; - int mask_stride_h; - int mask_stride_n; - int mask_size; - int output_stride_g; - int output_stride_w; - int output_stride_h; - int output_stride_n; - int output_size; - - // NRAM arrays' geometry - int input_nram_stride_g; - int input_nram_stride_w; - int input_nram_stride_h; - int input_nram_size; - int mask_nram_stride_kh; - int mask_nram_stride_g; - int mask_nram_stride_w; - int mask_nram_stride_h; - int mask_nram_size; - int output_nram_stride_g; - int output_nram_stride_w; - int output_nram_stride_h; - int output_nram_size; - - // for address/compute alignment - int align_size_NRAM; // for addressing on NRAM - int align_size_NFU; // for NFU operation length - int block_Cg_NFU; // for bang_mul_const - - int job_num; // total job number -}; - -struct CarafeForwardBlockDim { - int Ho; // block size of output height - int Wo; // block size of output width - int Kh; // block size of kernel height - int Kw; // block size of kernel width - int G; // block size of groups - int Cg; // block size of channels within a group - int Hi; // block size of input height - int Wi; // block size of input width -}; - -struct CarafeForwardGridDim { - int Ho; // 
number of blocks of output height - int Wo; - int Kh; - int Kw; - int G; - int Cg; -}; - -#endif // CARAFE_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp deleted file mode 100644 index 88805ba..0000000 --- a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp +++ /dev/null @@ -1,398 +0,0 @@ -/************************************************************************* - * Copyright (C) 2021 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef COMMON_MLU_HELPER_HPP_ -#define COMMON_MLU_HELPER_HPP_ - -#define NFU_ALIGN_SIZE 128 // Byte -#define REM_FOR_STACK (128 * 1024) // 128KB reserved for cncc - -#ifdef __BANG_ARCH__ -#define MAX_NRAM_SIZE \ - (__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc -#define MAX_SRAM_SIZE \ - (__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc -#else -#define MAX_NRAM_SIZE (384 * 1024) // 384KB, initialization value -#define MAX_SRAM_SIZE (1920 * 1024) // 1920KB, initialization value -#endif - -#ifndef PAD_UP -#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) -#endif - -#ifndef PAD_DOWN -#define PAD_DOWN(x, y) (((x) / (y)) * (y)) -#endif - -#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y)) - -template -__mlu_func__ inline scalar_t min(scalar_t a, scalar_t b) { - return a < b ? a : b; -} - -template -__mlu_func__ inline scalar_t max(scalar_t a, scalar_t b) { - return a > b ? a : b; -} - -/*! 
- * @brief loads data from global DRAM to NRAM with 2D pattern. - * - * @param[out] dst - * Pointer to NRAM that stores dst data. - * @param[in] src - * Pointer to global DRAM that stores src data. - * @param[in] size - * The byte size of segment in the lower dimension. - * @param[in] dst_str - * The data stride in bytes between segments in the lower dimension of dst. - * @param[in] src_str - * The data stride in bytes between segments in the lower dimension of src. - * @param[in] seg_num - * The total count of data segments in the lower dimension. - */ -template -__mlu_func__ void loadStr2D(T *dst, T *src, const int size, const int dst_str, - const int src_str, const int seg_num) { - if (dst_str == src_str && size == src_str) { - __memcpy(dst, src, src_str * seg_num * sizeof(T), GDRAM2NRAM); - } else if ((size == src_str || src_str <= dst_str) && - src_str * sizeof(T) <= 512) { - // gather data less than 512Bytes to improve IO efficiency - T *tmp = (T *)dst + (dst_str - src_str) * seg_num; - __memcpy(tmp, src, (src_str * (seg_num - 1) + size) * sizeof(T), - GDRAM2NRAM); - if (dst_str != src_str) { - __memcpy(dst, tmp, size * sizeof(T), NRAM2NRAM, dst_str * sizeof(T), - src_str * sizeof(T), seg_num - 1); - } - } else { - __memcpy(dst, src, size * sizeof(T), GDRAM2NRAM, dst_str * sizeof(T), - src_str * sizeof(T), seg_num - 1); - } -} - -/*! - * @brief loads data from global DRAM to NRAM with 3D pattern. - * - * @param[out] dst - * Pointer to NRAM that stores dst data. - * @param[in] src - * Pointer to global DRAM that stores src data. - * @param[in] size - * The byte size of segment in the lowest dimension. - * @param[in] seg_num_in - * The total count of data segments in the lowest dimension. - * @param[in] seg_num_out - * The total count of data segments in the middle dimension. - * @param[in] dst_str_in - * The data stride in bytes between segments in the lowest dimension of dst. 
- * @param[in] dst_str_out - * The data stride in bytes between segments in the middle dimension of dst. - * @param[in] src_str_in - * The data stride in bytes between segments in the lowest dimension of src. - * @param[in] src_str_out - * The data stride in bytes between segments in the middle dimension of src. - */ -template -__mlu_func__ void loadStr3D(T *dst, T *src, const int size, - const int seg_num_in, const int seg_num_out, - const int dst_str_in, const int dst_str_out, - const int src_str_in, const int src_str_out) { - T *tmp_dst = dst; - T *tmp_src = src; - - for (int i = 0; i < seg_num_out; ++i) { - loadStr2D(tmp_dst, tmp_src, size, dst_str_in, src_str_in, seg_num_in); - tmp_src += src_str_out; - tmp_dst += dst_str_out; - } -} - -/*! - * @brief stores data from NRAM to global DRAM with 2D pattern. - * - * @param[out] dst - * Pointer to global DRAM that stores dst data. - * @param[in] src - * Pointer to NRAM that stores src data. - * @param[in] size - * The byte size of segment in the lower dimension. - * @param[in] dst_str - * The data stride in bytes between segments in the lower dimension of dst. - * @param[in] src_str - * The data stride in bytes between segments in the lower dimension of src. - * @param[in] seg_num - * The total count of data segments in the lower dimension. - */ -template -__mlu_func__ void storeStr2D(T *dst, T *src, const int size, const int seg_num, - const int dst_str, const int src_str) { - if ((size == dst_str && dst_str <= src_str) && dst_str * sizeof(T) <= 512) { - // gather data less than 512Bytes to improve IO efficiency - if (dst_str != src_str) { - __memcpy(src, src, size * sizeof(T), NRAM2NRAM, dst_str * sizeof(T), - src_str * sizeof(T), seg_num - 1); - } - __memcpy(dst, src, size * seg_num * sizeof(T), NRAM2GDRAM); - } else { - __memcpy(dst, src, size * sizeof(T), NRAM2GDRAM, dst_str * sizeof(T), - src_str * sizeof(T), seg_num - 1); - } -} - -/*! - * @brief stores data from NRAM to global DRAM with 3D pattern. 
- * - * @param[out] dst - * Pointer to global DRAM that stores dst data. - * @param[in] src - * Pointer to NRAM that stores src data. - * @param[in] size - * The byte size of segment in the lowest dimension. - * @param[in] seg_num_in - * The total count of data segments in the lowest dimension. - * @param[in] seg_num_out - * The total count of data segments in the middle dimension. - * @param[in] dst_str_in - * The data stride in bytes between segments in the lowest dimension of dst. - * @param[in] dst_str_out - * The data stride in bytes between segments in the middle dimension of dst. - * @param[in] src_str_in - * The data stride in bytes between segments in the lowest dimension of src. - * @param[in] src_str_out - * The data stride in bytes between segments in the middle dimension of src. - */ -template -__mlu_func__ void storeStr3D(T *dst, T *src, const int size, - const int seg_num_in, const int seg_num_out, - const int dst_str_in, const int dst_str_out, - const int src_str_in, const int src_str_out) { - T *tmp_dst = dst; - T *tmp_src = src; - for (int i = 0; i < seg_num_out; ++i) { - storeStr2D(tmp_dst, tmp_src, size, seg_num_in, dst_str_in, src_str_in); - tmp_src += src_str_out; - tmp_dst += dst_str_out; - } -} - -/*! - * @brief Converts int32 to float32 data type. - * - * @param[out] dst - * Pointer to NRAM that stores int32 type data. - * @param[in,out] dst_addition - * Pointer to NRAM as the workspace of dst, which has the same size as dst. - * It allows empty pointer on MLU300 series. - * @param[in] src - * Pointer to NRAM that stores float32 type data. - * @param[in,out] src_addition - * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. - * It allows empty pointer on MLU300 series. - * @param[in] src_count - * The count of elements in src. 
- */ -__mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src, - float *src_addition, const int src_count) { -#if __BANG_ARCH__ >= 300 - __bang_int2float((float *)dst, (int32_t *)src, src_count, 0); -#else - // get sign bit - const float move_23bit = 8388608.0; - // 0x80000000 = 1,000000000,0000000000000000000000000000 - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x80000000); - __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, - src_count * sizeof(float), NFU_ALIGN_SIZE); - // get 1 or 0 from sign bit - // judg is Odd - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x00000001); - __bang_cycle_bor((char *)dst_addition, (char *)dst_addition, - (char *)src_addition, src_count * sizeof(float), - NFU_ALIGN_SIZE); - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x80000001); - __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); - // minus xor, positive num invariant - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0xffffffff); - __bang_cycle_mul(dst, dst_addition, src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); - __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float)); - // convert int32 to float32 - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x7fffff); - __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition, - src_count * sizeof(float), NFU_ALIGN_SIZE); - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x4b000000); - __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, - src_count * sizeof(float), NFU_ALIGN_SIZE); - __bang_sub_scalar(dst, dst, move_23bit, src_count); - // add one - __bang_add(dst, dst, dst_addition, src_count); - // set sign for float32 - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / 
sizeof(float), - 0xffffffff); - __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); - - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x00000001); - __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); - - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x80000000); - __bang_cycle_band((char *)dst_addition, (char *)dst_addition, - (char *)src_addition, src_count * 4, 128); - __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4); -#endif // __BANG_ARCH__ >= 300 -} - -/*! - * @brief Converts float32 to int32 data type with to_zero round mode. - * - * @param[out] dst - * Pointer to NRAM that stores float32 type data. - * @param[in,out] dst_addition - * Pointer to NRAM as the workspace of dst, which has the same size as dst. - * It allows empty pointer on MLU300 series. - * @param[in] src - * Pointer to NRAM that stores int32 type data. - * @param[in,out] src_addition - * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. - * It allows empty pointer on MLU300 series. - * @param[in] src_count - * The count of elements in src. - */ -__mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, - float *src_addition, const int src_count) { -#if __BANG_ARCH__ >= 300 - __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0); -#else - // sign ===> src_addition - // dst=-1.0 : when src[i] is a negative number - // dst=+1.0 : when src[i] is a positive number - const int floatDchar = sizeof(float) / sizeof(char); - __bang_active_sign((float *)dst, src, src_count); - // dst_addition = abs(src) - __bang_mul(dst_addition, src, (float *)dst, src_count); - // if dst_addition < 1.0 , then src_addition + 1, to fix add error. 
- __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 1.0f); - __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); - __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0xbf800000); - // set negative flag -1.0 = 0xbf80000 - __bang_cycle_eq( - (float *)dst, (float *)dst, (float *)src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0] - __bang_active_abs(dst_addition, src, src_count); - __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 8388608.0f); - // mask shift move 23 - __bang_cycle_add_tz( - dst_addition, dst_addition, src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); // right shift move 23bit - // two`s complement for negatibe - // dst=1.0 , when src <-1.0 - // dst=0.0 , when src >=-1.0 - __bang_sub(dst_addition, dst_addition, (float *)dst, src_count); - // to fix max value - // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0, - // means max value. 
- __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count); - __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, - src_count * floatDchar); - // get low 23bit - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - (unsigned)0x007fffff); - // mask low 23bit is 1 - __bang_cycle_band((char *)dst_addition, (char *)dst_addition, - (char *)src_addition, src_count * floatDchar, - NFU_ALIGN_SIZE / sizeof(char)); - // set 9 high bit ===> dst - // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000 - // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000 - __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); - __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); - // src or dst_addition - __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, - src_count * floatDchar); - __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count); - __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, - src_count * floatDchar); -#endif // __BANG_ARCH__ >= 300 -} - -/*! - * @brief Converts float32 to half data type, - * the rounding mode on MLU200 is rd, on MLU300 is rn. - * - * @param[out] dst - * Pointer to NRAM that stores half type data. - * @param[in] src - * Pointer to NRAM that stores float32 type data. - * @param[in] src_count - * The count of elements in src. - */ -__mlu_func__ inline void convertFloat2half(half *dst, float *src, - int src_count) { -#if __BANG_ARCH__ >= 300 - __bang_float2half_rn(dst, src, src_count); -#else - __bang_float2half_rd(dst, src, src_count); -#endif -} - -/*! - * @brief recursiveSumPool. - * @param[in,out] dst - * Pointer to NRAM that stores the input and output data. - * @param[in] low_dim - * Which is the number of low dim. - * @param[in] high_dim - * Which is the number of high dim. - * @param[in] kernel_limit - * Which is the high_dim of sumpool per time. 
- ******************************************************************************/ -template -__mlu_func__ void recursiveSumPool(T *dst, int low_dim, int high_dim, - int kernel_limit) { - for (; high_dim > 1;) { - int repeat_s = high_dim / kernel_limit; - int remain_s = high_dim % kernel_limit; - - if (remain_s) { - __bang_sumpool((T *)dst, (T *)dst, low_dim, 1, remain_s, 1, remain_s, 1, - 1); - } - if (repeat_s) { - __bang_sumpool((T *)dst + (remain_s > 0 ? low_dim : 0), - (T *)dst + remain_s * low_dim, low_dim, - kernel_limit * repeat_s, 1, kernel_limit, 1, 1, - kernel_limit); - } - high_dim = repeat_s + (bool)remain_s; - } - return; -} - -#endif // COMMON_MLU_HELPER_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/deform_roi_pool_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/deform_roi_pool_mlu_kernel.mlu deleted file mode 100644 index 6c765e3..0000000 --- a/mmcv/ops/csrc/common/mlu/deform_roi_pool_mlu_kernel.mlu +++ /dev/null @@ -1,712 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include - -#include "common_mlu_helper.hpp" - -#define ROI_OFFSET 5 -#define FOURSPLIT 4 -#define FIVESPLIT 5 -#define NINESPLIT 9 -#define THIRTEENSPLIT 13 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template -static __mlu_func__ void bilinearInterpolate(const int input_width, T y, T x, - T *w1, T *w2, T *w3, T *w4, - int *x_low, int *x_high, - const int y_low, bool *is_empty) { - if (x < -1.0 || x > input_width) { - *is_empty = true; - return; - } - - if (x <= 0) x = 0; - - *x_low = int(x); - - if (*x_low >= input_width - 1) { - *x_high = *x_low = input_width - 1; - x = T(*x_low); - } else { - *x_high = *x_low + 1; - } - - T ly = y - y_low; - T lx = x - *x_low; - T hy = 1.0 - ly; - T hx = 1.0 - lx; - *w1 = hy * hx; - *w2 = hy * lx; - *w3 = ly * hx; - *w4 = ly * lx; -} - -template -__mlu_func__ void MLUUnion1DeformRoIPoolForward( - const T *input, const T *rois, const T *offset, T *output, - const int channels, const int height, const int width, const int num_rois, - const int pooled_height, const int pooled_width, const T spatial_scale, - const int sampling_ratio, const T gamma) { - for (int bin_index = taskId; - bin_index < num_rois * pooled_width * pooled_height; - bin_index += taskDim) { - int out_batch = bin_index / pooled_width / pooled_height; - int out_height = bin_index / pooled_width % pooled_height; - int out_width = bin_index % pooled_width; - const T *cur_roi = rois + out_batch * ROI_OFFSET; - T *nram_rois = (T *)nram_buffer; - __memcpy((void *)nram_rois, (void *)cur_roi, ROI_OFFSET * sizeof(T), - GDRAM2NRAM); - const int roi_batch = nram_rois[0]; - T roi_x_min = nram_rois[1] * spatial_scale - 0.5; - T roi_y_min = nram_rois[2] * spatial_scale - 0.5; - const T roi_x_max = nram_rois[3] * spatial_scale - 0.5; - const T roi_y_max = nram_rois[4] * spatial_scale - 0.5; - const T roi_width = roi_x_max - roi_x_min; - const T roi_height = roi_y_max - roi_y_min; - const T 
bin_width = roi_width / static_cast(pooled_width); - const T bin_height = roi_height / static_cast(pooled_height); - const T *offset_input = input + roi_batch * height * width * channels; - int roi_bin_grid_height = - (sampling_ratio > 0) - ? sampling_ratio - : static_cast(ceilf(roi_height / pooled_height)); - int roi_bin_grid_width = - (sampling_ratio > 0) - ? sampling_ratio - : static_cast(ceilf(roi_width / pooled_width)); - if (offset != NULL) { - const T *offset_cur = offset + - out_batch * pooled_width * pooled_height * 2 + - out_height * pooled_width + out_width; - roi_x_min += gamma * roi_width * offset_cur[0]; - roi_y_min += - gamma * roi_height * offset_cur[pooled_width * pooled_height]; - } - int type_align = NFU_ALIGN_SIZE / sizeof(T); - int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T); - int channels_nram_split = - channels_max_num_nram / NINESPLIT / type_align * type_align; - int channel_rem = channels % channels_nram_split; - int channel_loops = - channels / channels_nram_split + (channel_rem != 0 ? 1 : 0); - for (int channel_loop_index = 0; channel_loop_index < channel_loops; - ++channel_loop_index) { - int channels_num = - channels_nram_split >= channels ? 
channels : channels_nram_split; - const int channel_offset = channel_loop_index * channels_num; - if (channel_loop_index + 1 == channel_loops && channel_rem != 0) { - channels_num = channel_rem; - } - int channels_align = CEIL_ALIGN(channels_num, type_align); - int nram_limit = (MAX_NRAM_SIZE / sizeof(T) - channels_align) >> 1; - int c_slice = nram_limit / FOURSPLIT / type_align * type_align; - int c_slice_align = 0; - - /* NRAM partition - * - * | | ping | pong | - * |----------|-------------------|-------------------| - * | nram_out | p1 | p2 | p3 | p4 | p1 | p2 | p3 | p4 | - * - */ - - T *nram_out = (T *)nram_buffer; - T *nram_ping = nram_out + channels_align; - T *nram_pong = nram_ping + nram_limit; - __bang_write_value((T *)nram_out, channels_align, (T)0); - __bang_write_value((T *)nram_ping, FOURSPLIT * c_slice, (T)0); - __bang_write_value((T *)nram_pong, FOURSPLIT * c_slice, (T)0); - const T num_bins = - static_cast(max(roi_bin_grid_height * roi_bin_grid_width, 1)); - const T value_div = 1.0f / num_bins; - bool is_ping_empty = true; - for (int iy = 0; iy < roi_bin_grid_height; ++iy) { - T y = roi_y_min + out_height * bin_height + - static_cast(iy + .5f) * bin_height / - static_cast(roi_bin_grid_height); - if (y < -1.0 || y > height) { - is_ping_empty = true; - continue; - } - if (y <= 0) { - y = 0; - } - int y_low = 0, y_high = 0; - y_low = int(y); - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = T(y_low); - } else { - y_high = y_low + 1; - } - for (int ix = 0; ix < roi_bin_grid_width; ++ix) { - T x = roi_x_min + out_width * bin_width + - static_cast(ix + .5f) * bin_width / - static_cast(roi_bin_grid_width); - const int sample_index = iy * roi_bin_grid_width + ix; - int c_rem = channels_num; - c_slice = nram_limit / FOURSPLIT / type_align * type_align; - c_slice_align = 0; - bool is_empty = false; - T w1, w2, w3, w4; - int x_low = 0, x_high = 0; - bilinearInterpolate(width, y, x, &w1, &w2, &w3, &w4, &x_low, &x_high, - y_low, &is_empty); - if 
(is_empty) { - is_ping_empty = true; - continue; - } - if (is_ping_empty) { - c_slice = c_slice > c_rem ? c_rem : c_slice; - c_slice_align = CEIL_ALIGN(c_slice, type_align); - __bang_write_value(nram_ping, FOURSPLIT * c_slice_align, (T)0); - __asm__ volatile("sync;"); - __memcpy(nram_ping, - offset_input + y_low * width * channels + - x_low * channels + channel_offset, - c_slice * sizeof(T), GDRAM2NRAM); - __memcpy(nram_ping + c_slice_align, - offset_input + y_low * width * channels + - x_high * channels + channel_offset, - c_slice * sizeof(T), GDRAM2NRAM); - __memcpy(nram_ping + 2 * c_slice_align, - offset_input + y_high * width * channels + - x_low * channels + channel_offset, - c_slice * sizeof(T), GDRAM2NRAM); - __memcpy(nram_ping + 3 * c_slice_align, - offset_input + y_high * width * channels + - x_high * channels + channel_offset, - c_slice * sizeof(T), GDRAM2NRAM); - is_ping_empty = false; - } - int c_offset = 0; - int pongc_slice = 0; - int pongc_slice_align = 0; - while (c_rem > 0) { - c_slice = c_slice > c_rem ? 
c_rem : c_slice; - c_slice_align = CEIL_ALIGN(c_slice, type_align); - if (sample_index + 1 < roi_bin_grid_height * roi_bin_grid_width) { - int iy_tmp = (sample_index + 1) / roi_bin_grid_width; - int ix_tmp = (sample_index + 1) % roi_bin_grid_width; - y = roi_y_min + out_height * bin_height + - static_cast(iy_tmp + .5f) * bin_height / - static_cast(roi_bin_grid_height); - x = roi_x_min + out_width * bin_width + - static_cast(ix_tmp + .5f) * bin_width / - static_cast(roi_bin_grid_width); - if (y < -1.0 || y > height) { - is_empty = true; - } else { - T w1_tmp, w2_tmp, w3_tmp, w4_tmp; - if (y <= 0) { - y = 0; - } - y_low = int(y); - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = T(y_low); - } else { - y_high = y_low + 1; - } - bilinearInterpolate(width, y, x, &w1_tmp, &w2_tmp, &w3_tmp, - &w4_tmp, &x_low, &x_high, y_low, &is_empty); - } - pongc_slice = nram_limit / FOURSPLIT / type_align * type_align; - pongc_slice = - pongc_slice > channels_num ? channels_num : pongc_slice; - pongc_slice_align = CEIL_ALIGN(pongc_slice, type_align); - __bang_write_value(nram_pong, FOURSPLIT * pongc_slice_align, - (T)0); - __asm__ volatile("sync;"); - if (!is_empty) { - __memcpy_async(nram_pong, - offset_input + y_low * width * channels + - x_low * channels + channel_offset, - pongc_slice * sizeof(T), GDRAM2NRAM); - __memcpy_async(nram_pong + pongc_slice_align, - offset_input + y_low * width * channels + - x_high * channels + channel_offset, - pongc_slice * sizeof(T), GDRAM2NRAM); - __memcpy_async(nram_pong + 2 * pongc_slice_align, - offset_input + y_high * width * channels + - x_low * channels + channel_offset, - pongc_slice * sizeof(T), GDRAM2NRAM); - __memcpy_async(nram_pong + 3 * pongc_slice_align, - offset_input + y_high * width * channels + - x_high * channels + channel_offset, - pongc_slice * sizeof(T), GDRAM2NRAM); - } - } - __bang_mul_scalar(nram_ping, nram_ping, w1, c_slice_align); - __bang_mul_scalar(nram_ping + c_slice_align, - nram_ping + c_slice_align, w2, 
c_slice_align); - __bang_add(nram_ping, nram_ping, nram_ping + c_slice_align, - c_slice_align); - __bang_mul_scalar(nram_ping + 2 * c_slice_align, - nram_ping + 2 * c_slice_align, w3, c_slice_align); - __bang_add(nram_ping, nram_ping, nram_ping + 2 * c_slice_align, - c_slice_align); - __bang_mul_scalar(nram_ping + 3 * c_slice_align, - nram_ping + 3 * c_slice_align, w4, c_slice_align); - __bang_add(nram_ping, nram_ping, nram_ping + 3 * c_slice_align, - c_slice_align); - __bang_add(nram_out + c_offset, nram_out + c_offset, nram_ping, - c_slice_align); - T *nram_tmp = nram_ping; - nram_ping = nram_pong; - nram_pong = nram_tmp; - c_rem -= c_slice; - c_offset += c_slice; - __asm__ volatile("sync;"); - } - } - } - __bang_mul_scalar(nram_out, nram_out, value_div, channels_align); - __memcpy(output + channels * bin_index + channel_offset, nram_out, - channels_num * sizeof(T), NRAM2GDRAM); - } - } -} - -__mlu_global__ void MLUKernelDeformRoIPoolForward( - cnrtDataType_t data_type, const void *input, const void *rois, - const void *offset, void *output, const int channels, const int height, - const int width, const int num_rois, const int pooled_height, - const int pooled_width, const float spatial_scale, const int sampling_ratio, - const float gamma) { - switch (data_type) { - case CNRT_FLOAT16: { - MLUUnion1DeformRoIPoolForward((half *)input, (half *)rois, (half *)offset, - (half *)output, channels, height, width, - num_rois, pooled_height, pooled_width, - static_cast(spatial_scale), - sampling_ratio, static_cast(gamma)); - }; break; - case CNRT_FLOAT32: { - MLUUnion1DeformRoIPoolForward( - (float *)input, (float *)rois, (float *)offset, (float *)output, - channels, height, width, num_rois, pooled_height, pooled_width, - static_cast(spatial_scale), sampling_ratio, - static_cast(gamma)); - }; break; - default: { - break; - } - } -} - -void KernelDeformRoIPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, cnrtDataType_t data_type, - const void 
*input, const void *rois, - const void *offset, void *output, - const int channels, const int height, - const int width, const int num_rois, - const int pooled_height, const int pooled_width, - const float spatial_scale, - const int sampling_ratio, const float gamma) { - MLUKernelDeformRoIPoolForward<<>>( - data_type, input, rois, offset, output, channels, height, width, num_rois, - pooled_height, pooled_width, spatial_scale, sampling_ratio, gamma); -} - -template -__mlu_func__ void MLUUnion1DeformRoIPoolBackward( - const T *grad_output, const T *input, const T *rois, const T *offset, - T *grad_input, T *grad_offset, const int channels, const int height, - const int width, const int num_rois, const int pooled_height, - const int pooled_width, const T spatial_scale, const int sampling_ratio, - const T gamma) { - for (int bin_index = taskId; - bin_index < num_rois * pooled_width * pooled_height; - bin_index += taskDim) { - int out_batch = bin_index / pooled_width / pooled_height; - int out_height = bin_index / pooled_width % pooled_height; - int out_width = bin_index % pooled_width; - const T *cur_roi = rois + out_batch * ROI_OFFSET; - T *nram_rois = (T *)nram_buffer; - __memcpy((void *)nram_rois, (void *)cur_roi, ROI_OFFSET * sizeof(T), - GDRAM2NRAM); - const int roi_batch = nram_rois[0]; - T roi_x_min = nram_rois[1] * spatial_scale - 0.5; - T roi_y_min = nram_rois[2] * spatial_scale - 0.5; - const T roi_x_max = nram_rois[3] * spatial_scale - 0.5; - const T roi_y_max = nram_rois[4] * spatial_scale - 0.5; - const T roi_width = roi_x_max - roi_x_min; - const T roi_height = roi_y_max - roi_y_min; - const T bin_width = roi_width / static_cast(pooled_width); - const T bin_height = roi_height / static_cast(pooled_height); - const T *offset_input = input + roi_batch * height * width * channels; - T *offset_grad_input = grad_input + roi_batch * height * width * channels; - int roi_bin_grid_height = - (sampling_ratio > 0) - ? 
sampling_ratio - : static_cast(ceilf(roi_height / pooled_height)); - int roi_bin_grid_width = - (sampling_ratio > 0) - ? sampling_ratio - : static_cast(ceilf(roi_width / pooled_width)); - if (offset != NULL) { - const T *offset_cur = offset + - out_batch * pooled_width * pooled_height * 2 + - out_height * pooled_width + out_width; - roi_x_min += gamma * roi_width * offset_cur[0]; - roi_y_min += - gamma * roi_height * offset_cur[pooled_width * pooled_height]; - } - - /* NRAM partition - * - * If offset != NULL, NRAM partition belows. - * | | - * ping | pong | - * |---------------------------------------------------------------------|-----------|-----------| - * |nram_tmp1|nram_tmp2|nram_tmp3|nram_tmp4|nram_grad_output|nram_sum_tmp|p1|p2|p3|p4|p1|p2|p3|p4| - * - * If offset == NULL, ping and pang will not be needed. - * | | - * |----------------------------------------------------------------------------------| - * | nram_tmp1 | nram_tmp2 | nram_tmp3 | nram_tmp4 | nram_grad_output | - * - */ - - int type_align = NFU_ALIGN_SIZE / sizeof(T); - int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T); - int channels_nram_split = - channels_max_num_nram / FIVESPLIT / type_align * type_align; - int channel_rem = channels % channels_nram_split; - int channel_loops = - channels / channels_nram_split + (channel_rem != 0 ? 1 : 0); - if (offset != NULL) { - channels_nram_split = - channels_max_num_nram / THIRTEENSPLIT / type_align * type_align; - channel_rem = channels % channels_nram_split; - channel_loops = - channels / channels_nram_split + (channel_rem != 0 ? 1 : 0); - } - - for (int channel_loop_index = 0; channel_loop_index < channel_loops; - ++channel_loop_index) { - int channels_num = - channels_nram_split >= channels ? 
channels : channels_nram_split; - const int channel_offset = channel_loop_index * channels_num; - if (channel_loop_index + 1 == channel_loops && channel_rem != 0) { - channels_num = channel_rem; - } - int channels_align = CEIL_ALIGN(channels_num, type_align); - const int32_t nram_sum_tmp_channel = NFU_ALIGN_SIZE / sizeof(T); - int nram_limit = (MAX_NRAM_SIZE / sizeof(T) - 5 * channels_align - - nram_sum_tmp_channel) >> - 1; - int c_slice = 0; - int c_slice_align = 0; - T *nram_tmp1 = (T *)nram_buffer; - T *nram_tmp2 = (T *)nram_buffer + channels_align; - T *nram_tmp3 = (T *)nram_buffer + 2 * channels_align; - T *nram_tmp4 = (T *)nram_buffer + 3 * channels_align; - T *nram_grad_output = nram_tmp4 + channels_align; - T *nram_sum_tmp = NULL; - T *nram_ping_input = NULL; - T *nram_pong_input = NULL; - __bang_write_value((T *)nram_grad_output, channels_align, (T)0); - __asm__ volatile("sync;"); - - if (offset != NULL) { - c_slice = nram_limit / FOURSPLIT / type_align * type_align; - nram_sum_tmp = nram_grad_output + channels_align; - nram_ping_input = nram_sum_tmp + nram_sum_tmp_channel; - nram_pong_input = nram_ping_input + FOURSPLIT * c_slice; - __bang_write_value((T *)nram_sum_tmp, nram_sum_tmp_channel, (T)0); - __bang_write_value((T *)nram_ping_input, FOURSPLIT * c_slice, (T)0); - __bang_write_value((T *)nram_pong_input, FOURSPLIT * c_slice, (T)0); - __asm__ volatile("sync;"); - } - const T num_bins = - static_cast(max(roi_bin_grid_height * roi_bin_grid_width, 1)); - const T value_div = 1.0f / num_bins; - bool is_ping_empty = true; - __memcpy(nram_grad_output, - grad_output + channels * bin_index + channel_offset, - channels_num * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar(nram_grad_output, nram_grad_output, value_div, - channels_align); - for (int iy = 0; iy < roi_bin_grid_height; ++iy) { - T y = roi_y_min + out_height * bin_height + - static_cast(iy + .5f) * bin_height / - static_cast(roi_bin_grid_height); - T y_tmp = y; - if (y_tmp < -1.0 || y_tmp > height) { - 
is_ping_empty = true; - continue; - } - if (y_tmp <= 0) { - y_tmp = 0; - } - int y_low = 0, y_high = 0; - y_low = int(y_tmp); - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y_tmp = T(y_low); - } else { - y_high = y_low + 1; - } - for (int ix = 0; ix < roi_bin_grid_width; ++ix) { - T x = roi_x_min + out_width * bin_width + - static_cast(ix + .5f) * bin_width / - static_cast(roi_bin_grid_width); - const int sample_index = iy * roi_bin_grid_width + ix; - int c_rem = channels_num; - bool is_empty = false; - T w1, w2, w3, w4; - int x_low = 0, x_high = 0; - bilinearInterpolate(width, y_tmp, x, &w1, &w2, &w3, &w4, &x_low, - &x_high, y_low, &is_empty); - if (is_empty) { - is_ping_empty = true; - continue; - } - __bang_mul_scalar((T *)nram_tmp1, (T *)nram_grad_output, w1, - channels_align); - __bang_mul_scalar((T *)nram_tmp2, (T *)nram_grad_output, w2, - channels_align); - __bang_mul_scalar((T *)nram_tmp3, (T *)nram_grad_output, w3, - channels_align); - __bang_mul_scalar((T *)nram_tmp4, (T *)nram_grad_output, w4, - channels_align); - __asm__ volatile("sync;"); - __bang_atomic_add( - (T *)nram_tmp1, - (T *)(offset_grad_input + (y_low * width + x_low) * channels + - channel_offset), - (T *)nram_tmp1, channels_num); - __bang_atomic_add( - (T *)nram_tmp2, - (T *)(offset_grad_input + (y_low * width + x_high) * channels + - channel_offset), - (T *)nram_tmp2, channels_num); - __bang_atomic_add( - (T *)nram_tmp3, - (T *)(offset_grad_input + (y_high * width + x_low) * channels + - channel_offset), - (T *)nram_tmp3, channels_num); - __bang_atomic_add( - (T *)nram_tmp4, - (T *)(offset_grad_input + (y_high * width + x_high) * channels + - channel_offset), - (T *)nram_tmp4, channels_num); - if (offset != NULL) { - c_slice = nram_limit / FOURSPLIT / type_align * type_align; - c_slice_align = 0; - if (is_ping_empty) { - c_slice = c_slice > c_rem ? 
c_rem : c_slice; - c_slice_align = CEIL_ALIGN(c_slice, type_align); - __bang_write_value(nram_ping_input, FOURSPLIT * c_slice_align, - (T)0); - __asm__ volatile("sync;"); - const T *src_offset1 = offset_input + y_low * width * channels + - x_low * channels + channel_offset; - const T *src_offset2 = offset_input + y_low * width * channels + - x_high * channels + channel_offset; - const T *src_offset3 = offset_input + y_high * width * channels + - x_low * channels + channel_offset; - const T *src_offset4 = offset_input + y_high * width * channels + - x_high * channels + channel_offset; - __memcpy(nram_ping_input, src_offset1, c_slice * sizeof(T), - GDRAM2NRAM); - __memcpy(nram_ping_input + c_slice_align, src_offset2, - c_slice * sizeof(T), GDRAM2NRAM); - __memcpy(nram_ping_input + 2 * c_slice_align, src_offset3, - c_slice * sizeof(T), GDRAM2NRAM); - __memcpy(nram_ping_input + 3 * c_slice_align, src_offset4, - c_slice * sizeof(T), GDRAM2NRAM); - is_ping_empty = false; - } - int c_offset = 0; - int pongc_slice = 0; - int pongc_slice_align = 0; - while (c_rem > 0) { - c_slice = c_slice > c_rem ? 
c_rem : c_slice; - c_slice_align = CEIL_ALIGN(c_slice, type_align); - if (sample_index + 1 < roi_bin_grid_height * roi_bin_grid_width) { - int iy_tmp = (sample_index + 1) / roi_bin_grid_width; - int ix_tmp = (sample_index + 1) % roi_bin_grid_width; - T y_tmp = roi_y_min + out_height * bin_height + - static_cast(iy_tmp + .5f) * bin_height / - static_cast(roi_bin_grid_height); - T x_tmp = roi_x_min + out_width * bin_width + - static_cast(ix_tmp + .5f) * bin_width / - static_cast(roi_bin_grid_width); - int x_low_tmp = 0, x_high_tmp = 0, y_low_tmp = 0, - y_high_tmp = 0; - if (y_tmp < -1.0 || y_tmp > height) { - is_empty = true; - } else { - T w1_tmp, w2_tmp, w3_tmp, w4_tmp; - if (y_tmp <= 0) { - y_tmp = 0; - } - y_low_tmp = int(y_tmp); - if (y_low_tmp >= height - 1) { - y_high_tmp = y_low_tmp = height - 1; - y_tmp = T(y_low_tmp); - } else { - y_high_tmp = y_low_tmp + 1; - } - bilinearInterpolate(width, y_tmp, x_tmp, &w1_tmp, &w2_tmp, - &w3_tmp, &w4_tmp, &x_low_tmp, &x_high_tmp, - y_low_tmp, &is_empty); - } - pongc_slice = nram_limit / FOURSPLIT / type_align * type_align; - pongc_slice = - pongc_slice > channels_num ? 
channels_num : pongc_slice; - pongc_slice_align = CEIL_ALIGN(pongc_slice, type_align); - __bang_write_value(nram_pong_input, - FOURSPLIT * pongc_slice_align, (T)0); - __asm__ volatile("sync;"); - if (!is_empty) { - const T *src_offset1 = offset_input + - y_low_tmp * width * channels + - x_low_tmp * channels + channel_offset; - const T *src_offset2 = offset_input + - y_low_tmp * width * channels + - x_high_tmp * channels + channel_offset; - const T *src_offset3 = offset_input + - y_high_tmp * width * channels + - x_low_tmp * channels + channel_offset; - const T *src_offset4 = offset_input + - y_high_tmp * width * channels + - x_high_tmp * channels + channel_offset; - __memcpy_async(nram_pong_input, src_offset1, - pongc_slice * sizeof(T), GDRAM2NRAM); - __memcpy_async(nram_pong_input + pongc_slice_align, - src_offset2, pongc_slice * sizeof(T), - GDRAM2NRAM); - __memcpy_async(nram_pong_input + 2 * pongc_slice_align, - src_offset3, pongc_slice * sizeof(T), - GDRAM2NRAM); - __memcpy_async(nram_pong_input + 3 * pongc_slice_align, - src_offset4, pongc_slice * sizeof(T), - GDRAM2NRAM); - } - } - - __bang_mul_scalar(nram_tmp1, nram_ping_input + 3 * c_slice_align, - y - y_low, c_slice_align); - __bang_mul_scalar(nram_tmp2, nram_ping_input + c_slice_align, - y_high - y, c_slice_align); - __bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align); - __bang_mul_scalar(nram_tmp2, nram_ping_input + 2 * c_slice_align, - y_low - y, c_slice_align); - __bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align); - __bang_mul_scalar(nram_tmp2, nram_ping_input, y - y_high, - c_slice_align); - __bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align); - __bang_mul_scalar(nram_tmp1, nram_tmp1, gamma * roi_width, - c_slice_align); - __bang_mul(nram_tmp1, nram_grad_output, nram_tmp1, c_slice_align); - const int32_t kernel_width = - c_slice_align / nram_sum_tmp_channel + - (int32_t)(c_slice_align % nram_sum_tmp_channel > 0); - __bang_sumpool(nram_sum_tmp, nram_tmp1, nram_sum_tmp_channel, 1, 
- kernel_width, 1, kernel_width, kernel_width, 1); - __bang_reduce_sum(nram_sum_tmp, nram_sum_tmp, - nram_sum_tmp_channel); - __bang_atomic_add( - (T *)nram_sum_tmp, - (T *)(grad_offset + - out_batch * pooled_width * pooled_height * 2 + - out_height * pooled_width + out_width), - (T *)nram_sum_tmp, 1); - __bang_write_value((T *)nram_sum_tmp, nram_sum_tmp_channel, (T)0); - __bang_mul_scalar(nram_tmp1, nram_ping_input + 3 * c_slice_align, - x - x_low, c_slice_align); - __bang_mul_scalar(nram_tmp2, nram_ping_input + 2 * c_slice_align, - x_high - x, c_slice_align); - __bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align); - __bang_mul_scalar(nram_tmp2, nram_ping_input + c_slice_align, - x_low - x, c_slice_align); - __bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align); - __bang_mul_scalar(nram_tmp2, nram_ping_input, x - x_high, - c_slice_align); - __bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align); - __bang_mul_scalar(nram_tmp1, nram_tmp1, gamma * roi_height, - c_slice_align); - __bang_mul(nram_tmp1, nram_grad_output, nram_tmp1, c_slice_align); - __bang_sumpool(nram_sum_tmp, nram_tmp1, nram_sum_tmp_channel, 1, - kernel_width, 1, kernel_width, kernel_width, 1); - __bang_reduce_sum(nram_sum_tmp, nram_sum_tmp, - NFU_ALIGN_SIZE / sizeof(T)); - __bang_atomic_add( - (T *)nram_sum_tmp, - (T *)(grad_offset + - out_batch * pooled_width * pooled_height * 2 + - pooled_width * pooled_height + - out_height * pooled_width + out_width), - (T *)nram_sum_tmp, 1); - - T *nram_tmp = nram_ping_input; - nram_ping_input = nram_pong_input; - nram_pong_input = nram_tmp; - c_rem -= c_slice; - c_offset += c_slice; - __asm__ volatile("sync;"); - } - } - } - } - } - } -} - -__mlu_global__ void MLUKernelDeformRoIPoolBackward( - cnrtDataType_t data_type, const void *grad_output, const void *input, - const void *rois, const void *offset, void *grad_input, void *grad_offset, - const int channels, const int height, const int width, const int num_rois, - const int pooled_height, const 
int pooled_width, const float spatial_scale, - const int sampling_ratio, const float gamma) { - switch (data_type) { - case CNRT_FLOAT16: { - MLUUnion1DeformRoIPoolBackward( - (half *)grad_output, (half *)input, (half *)rois, (half *)offset, - (half *)grad_input, (half *)grad_offset, channels, height, width, - num_rois, pooled_height, pooled_width, - static_cast(spatial_scale), sampling_ratio, - static_cast(gamma)); - }; break; - case CNRT_FLOAT32: { - MLUUnion1DeformRoIPoolBackward( - (float *)grad_output, (float *)input, (float *)rois, (float *)offset, - (float *)grad_input, (float *)grad_offset, channels, height, width, - num_rois, pooled_height, pooled_width, - static_cast(spatial_scale), sampling_ratio, - static_cast(gamma)); - }; break; - default: { - break; - } - } -} - -void KernelDeformRoIPoolBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - cnrtDataType_t data_type, const void *grad_output, const void *input, - const void *rois, const void *offset, void *grad_input, void *grad_offset, - const int channels, const int height, const int width, const int num_rois, - const int pooled_height, const int pooled_width, const float spatial_scale, - const int sampling_ratio, const float gamma) { - MLUKernelDeformRoIPoolBackward<<>>( - data_type, grad_output, input, rois, offset, grad_input, grad_offset, - channels, height, width, num_rois, pooled_height, pooled_width, - spatial_scale, sampling_ratio, gamma); -} diff --git a/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu deleted file mode 100644 index 7624379..0000000 --- a/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu +++ /dev/null @@ -1,888 +0,0 @@ -/************************************************************************* - * Copyright (C) 2021 Cambricon. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include - -#include "common_mlu_helper.hpp" - -#define PING 0 -#define PONG 1 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -namespace forward { -template -__mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size, - const int32_t dst_stride = 0, - const int32_t src_stride = 0, - const int32_t count = 1) { - if (dst_stride == src_stride) { - __memcpy_async(nram_input, dram_input, size * count, GDRAM2NRAM); - } else { - __memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride, - src_stride, count - 1); - } -} - -template -__mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t, - const int32_t c, const int32_t has_weight, - const int32_t partition_nc) { - if (has_weight && partition_nc && t >= 0 && t < c) { - __memcpy_async(nram_input, (T *)dram_input + t, sizeof(T), GDRAM2NRAM); - } -} - -template -__mlu_func__ void storeOutput(T *dram_output, char *nram_output, - const int32_t size, const int32_t dst_stride = 0, - const int32_t src_stride = 0, - const int32_t count = 1) { - if (dst_stride == src_stride) { - __memcpy_async(dram_output, nram_output, size * count, NRAM2GDRAM); - } else { - __memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride, - src_stride, count - 1); - } -} - -template -__mlu_func__ void compute(T *input, const int32_t *target, const T *weight, - const int32_t has_weight, const int32_t partition_nc, - const int32_t 
deal_num, const int32_t n_seg, - const int32_t c, const int32_t c_seg, - const int32_t c_start_index, const float alpha, - const float gamma, T *compute_a, T *compute_b, - T *output) { - // set params - const int32_t c_num = - has_weight ? PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)) : c_seg; - const int32_t c_end_index = c_start_index + c_seg; - const int32_t half_epsilon = 0x0400; - const T epsilon_f = - sizeof(T) == sizeof(float) ? FLT_MIN : *((half *)&half_epsilon); - - // 0. alpha_t * p_t^r = alpha * (1 - p) ^ gamma if t == c_i - // = (1 - alpha) * p ^ gamma if t != c_i - __nramset((T *)output, deal_num, (T)(1 - alpha)); - __bang_active_sigmoid((T *)compute_b, (T *)input, deal_num); - for (int32_t i = 0; i < n_seg; ++i) { - const int32_t t = *((uint32_t *)target + i); - if (t >= c_start_index && t < c_end_index) { - const uint32_t index = i * c_num + t - c_start_index; - *((T *)input + index) = -1.0 * (*((T *)input + index)); - *((T *)compute_b + index) = 1.0 - (*((T *)compute_b + index)) + epsilon_f; - *((T *)output + index) = alpha; - } - } - if (sizeof(T) == sizeof(half)) { - __bang_half2float((float *)compute_a, (half *)compute_b, deal_num); - __bang_active_loghp((float *)compute_a, (float *)compute_a, deal_num); - __bang_mul_const((float *)compute_a, (float *)compute_a, (float)gamma, - deal_num); - __bang_active_exphp((float *)compute_a, (float *)compute_a, deal_num); - __bang_float2half_rd((half *)compute_a, (float *)compute_a, deal_num); - } else { - __bang_active_loghp((T *)compute_a, (T *)compute_b, deal_num); - __bang_mul_const((T *)compute_a, (T *)compute_a, (T)gamma, deal_num); - __bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num); - } - __bang_mul((T *)output, (T *)compute_a, (T *)output, deal_num); - - // 1. max = max(0, -x) if t == c_i - // = max(0, x) if t != c_i - __nramset((T *)compute_b, deal_num, (T)0); - __bang_maxequal((T *)compute_b, (T *)compute_b, (T *)input, deal_num); - - // 2. 
-log(p_t) = ln(e^(-max)+ e^(-max-x) + max if t == c_i - // = ln(e^(-max)+ e^(-max+x) + max if t != c_i - __bang_mul_const((T *)compute_a, (T *)compute_b, (T)-1.0, deal_num); - __bang_add((T *)input, (T *)compute_a, (T *)input, deal_num); - - __bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num); - __bang_active_exphp((T *)input, (T *)input, deal_num); - __bang_add((T *)compute_a, (T *)compute_a, (T *)input, deal_num); - __bang_active_loghp((T *)compute_a, (T *)compute_a, deal_num); - __bang_add((T *)input, (T *)compute_a, (T *)compute_b, deal_num); - - // 3. output = alpha_t * p_t^r * [-log(p_t)] - __bang_mul((T *)output, (T *)output, (T *)input, deal_num); - - // 4. with weight - if (has_weight) { - for (int32_t i = 0; i < n_seg; ++i) { - int32_t t = *((int32_t *)target + i); - if (t >= 0 && t < c) { - t = partition_nc ? 0 : t; - __bang_mul_const((T *)output + i * c_num, (T *)output + i * c_num, - *((T *)weight + t), c_num); - } - } - } -} - -template -__mlu_func__ void startPipeline( - const T *input, const int32_t *target, const T *weight, - char *nram_compute_a, char *nram_compute_b, char *nram_input, - char *nram_target, char *nram_weight, char *nram_output, - const int32_t has_weight, const int32_t partition_nc, - const int32_t pingpong_offset, const int32_t pingpong_weight_offset, - const int32_t c_offset_num, const int32_t n, const int32_t n_seg, - const int32_t c, const int32_t c_seg, const float alpha, const float gamma, - T *output) { - // with offset - input = (T *)((char *)input + c_offset_num * sizeof(T)); - output = (T *)((char *)output + c_offset_num * sizeof(T)); - - const int32_t c_seg_align_num = PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)); - const int32_t c_num = has_weight ? 
c_seg_align_num : c_seg; - const int32_t deal_num = PAD_UP(n_seg * c_num, NFU_ALIGN_SIZE / sizeof(T)); - const int32_t load_size = c_seg * sizeof(T); - const int32_t dram_stride = c * sizeof(T); - const int32_t nram_stride = c_num * sizeof(T); - - if (has_weight && !partition_nc) { - loadInput(nram_weight, (T *)weight, load_size, nram_stride, dram_stride, - 1); - __asm__ volatile("sync;\n\t"); - } - const int32_t repeat = n / n_seg; - const int32_t remain = n % n_seg; - - /* - * Pipeline: The pipeline is processed in three stages: Load, Compute, Store. - * The allocated memory space of NRAM is divided into two parts: - * PING and Pong. In a single time slice, PING is used to process - * IO stream and PONG is used for computation. Both of them are - * processed synchronously until finished. - * - * diagram of PINGPONG: - * |------|-----------------------------------------------------------------| - * | | space | - * |------|-----------------------------------------------------------------| - * | time | Ping | Pong | Ping | Pong | Ping | Pong | - * |------|-----------------------------------------------------------------| - * | 0 | L0 | | | | | | - * | 1 | C0 | L1 | | | | | - * | 2 | S0 | C1 | L2 | | | | - * | 3 | | S1 | C2 | L3 | | | - * | 4 | | | S2 | C3 | L4 | | - * | 5 | | | | S3 | C4 | L5 | - * | 6 | | | | | S4 | C5 | - * | 7 | | | | | | S5 | - * |------|-----------------------------------------------------------------| - */ - - // diagram of PINGPONG: L0 - if (repeat > 0) { - loadInput(nram_input, (T *)input, load_size, nram_stride, dram_stride, - n_seg); - loadInput(nram_target, (int32_t *)target, n_seg * sizeof(int32_t)); - loadWeight(nram_weight, (T *)weight, *((int32_t *)target), c, has_weight, - partition_nc); - __asm__ volatile("sync;\n\t"); - } - - // diagram of PINGPONG: C0 and L1 - if (repeat > 1) { - compute((T *)nram_input, (int32_t *)nram_target, (T *)nram_weight, - has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, - alpha, gamma, 
(T *)nram_compute_a, (T *)nram_compute_b, - (T *)nram_output); - loadInput((char *)nram_input + pingpong_offset, (T *)input + c * n_seg, - load_size, nram_stride, dram_stride, n_seg); - loadInput((char *)nram_target + pingpong_offset, - (int32_t *)target + n_seg, n_seg * sizeof(int32_t)); - loadWeight((char *)nram_weight + pingpong_weight_offset, (T *)weight, - *((int32_t *)target + n_seg), c, has_weight, partition_nc); - __asm__ volatile("sync;\n\t"); - } - - for (int32_t i = 0; i < repeat - 2; ++i) { - storeOutput((T *)output + i * c * n_seg, - nram_output + (i % 2) * pingpong_offset, load_size, - dram_stride, nram_stride, n_seg); - loadInput((char *)nram_input + (i % 2) * pingpong_offset, - (T *)(input) + (i + 2) * c * n_seg, load_size, nram_stride, - dram_stride, n_seg); - loadInput((char *)nram_target + (i % 2) * pingpong_offset, - (int32_t *)target + (i + 2) * n_seg, - n_seg * sizeof(int32_t)); - loadWeight((char *)nram_weight + (i % 2) * pingpong_weight_offset, - (T *)weight, *((int32_t *)target + (i + 2) * n_seg), c, - has_weight, partition_nc); - compute((T *)(nram_input + ((i + 1) % 2) * pingpong_offset), - (int32_t *)(nram_target + ((i + 1) % 2) * pingpong_offset), - (T *)(nram_weight + - partition_nc * ((i + 1) % 2) * pingpong_weight_offset), - has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, - alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, - (T *)(nram_output + ((i + 1) % 2) * pingpong_offset)); - __asm__ volatile("sync;\n\t"); - } - - if (repeat > 1) { - storeOutput((T *)output + (repeat - 2) * c * n_seg, - (char *)nram_output + (repeat % 2) * pingpong_offset, - load_size, dram_stride, nram_stride, n_seg); - } - - if (remain > 0) { - loadInput((char *)nram_input + (repeat % 2) * pingpong_offset, - (T *)input + repeat * c * n_seg, load_size, nram_stride, - dram_stride, remain); - loadInput((char *)nram_target + (repeat % 2) * pingpong_offset, - (int32_t *)target + repeat * n_seg, - remain * sizeof(int32_t)); - 
loadWeight((char *)nram_weight + (repeat % 2) * pingpong_weight_offset, - (T *)weight, *((int32_t *)target + repeat * n_seg), c, - has_weight, partition_nc); - } - - if (repeat > 0) { - compute((T *)(nram_input + ((repeat - 1) % 2) * pingpong_offset), - (int32_t *)(nram_target + ((repeat - 1) % 2) * pingpong_offset), - (T *)(nram_weight + - partition_nc * ((repeat - 1) % 2) * pingpong_weight_offset), - has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, - alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, - (T *)(nram_output + ((repeat - 1) % 2) * pingpong_offset)); - } - __asm__ volatile("sync;\n\t"); - - if (repeat > 0) { - storeOutput((T *)output + (repeat - 1) * c * n_seg, - (char *)nram_output + ((repeat - 1) % 2) * pingpong_offset, - load_size, dram_stride, nram_stride, n_seg); - } - - if (remain > 0) { - int32_t rem_num = PAD_UP(remain * c_num, NFU_ALIGN_SIZE / sizeof(T)); - compute((T *)(nram_input + (repeat % 2) * pingpong_offset), - (int32_t *)(nram_target + (repeat % 2) * pingpong_offset), - (T *)(nram_weight + - partition_nc * (repeat % 2) * pingpong_weight_offset), - has_weight, partition_nc, rem_num, remain, c, c_seg, c_offset_num, - alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, - (T *)(nram_output + (repeat % 2) * pingpong_offset)); - __asm__ volatile("sync;\n\t"); - - storeOutput((T *)output + repeat * c * n_seg, - (char *)nram_output + (repeat % 2) * pingpong_offset, - load_size, dram_stride, nram_stride, remain); - } - __asm__ volatile("sync;\n\t"); -} - -template -__mlu_func__ void focalLossSigmoidForwardBlock( - const T *input, const int32_t *target, const T *weight, const int32_t n, - const int32_t c, const float alpha, const float gamma, T *output) { - /* - * NRAM partition - * |-----------------------------------------------------------------------| - * | weight | - * |------------------------------- COMPUTE -------------------------------| - * | | | - * | computeA | computeB | - * | | | - * |------------- PING 
------------------------------- PONG ---------------| - * | | | - * | input | input | - * | | | - * |-----------------------------------|-----------------------------------| - * | | | - * | output | output | - * | | | - * |-----------------------------------|-----------------------------------| - * | target | target | - * |-----------------------------------|-----------------------------------| - * - * split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output), - * PONG(input,output). - * split_target_num is 2: PING(target), PONG(target). - * weight is not NULL: - * The nram-size of weight is equal to c_align_size when partition input-N. - * The nram-size of weight is equal to NFU_ALIGN_SIZE when partition - * input-NC. - */ - - // calculate threshold of c - const int32_t split_pipeline_num = 6; - const int32_t split_target_num = 2; - const int32_t has_weight = weight != NULL; - const int32_t threshold_c = - PAD_DOWN((MAX_NRAM_SIZE - split_target_num * sizeof(int32_t)) / - (split_pipeline_num + has_weight), - NFU_ALIGN_SIZE) / - sizeof(T); - const int32_t c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T)); - const int32_t c_align_size = c_align * sizeof(T); - - if (c <= threshold_c) { - // partition inputN - int32_t c_num = c; - int32_t reservered_align_size = - (split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE; - int32_t weight_size = 0; - if (has_weight) { - c_num = c_align; - reservered_align_size = split_target_num * NFU_ALIGN_SIZE; - weight_size = c_align_size; - } - - const int32_t remain_size = - MAX_NRAM_SIZE - weight_size - reservered_align_size; - const int32_t n_seg = - remain_size / (split_pipeline_num * c_num * sizeof(T) + - split_target_num * sizeof(int32_t)); - const int32_t split_pipeline_size = - PAD_UP(c_num * n_seg * sizeof(T), NFU_ALIGN_SIZE); - const int32_t compute_size = 2 * split_pipeline_size; - const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2; - - char *nram_weight = (char *)nram_buffer; - 
char *nram_compute_a = nram_weight + has_weight * c_align_size; - char *nram_compute_b = nram_compute_a + split_pipeline_size; - char *nram_input = nram_compute_b + split_pipeline_size; - char *nram_output = nram_input + split_pipeline_size; - char *nram_target = nram_output + split_pipeline_size; - - startPipeline(input, target, weight, nram_compute_a, nram_compute_b, - nram_input, nram_target, nram_weight, nram_output, - has_weight, 0, pingpong_offset, 0, 0, n, n_seg, c, c, - alpha, gamma, output); - } else { - // partition inputNC - const int32_t weight_size = has_weight * NFU_ALIGN_SIZE; - const int32_t remain_size = MAX_NRAM_SIZE - weight_size; - const int32_t split_pipeline_size = PAD_DOWN( - (remain_size - split_target_num * NFU_ALIGN_SIZE) / split_pipeline_num, - NFU_ALIGN_SIZE); - const int32_t c_seg = split_pipeline_size / sizeof(T); - const int32_t n_seg = 1; - const int32_t compute_size = 2 * split_pipeline_size; - const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2; - const int32_t pingpong_weight_offset = weight_size / 2; - - char *nram_weight = (char *)nram_buffer; - char *nram_compute_a = nram_weight + weight_size; - char *nram_compute_b = nram_compute_a + split_pipeline_size; - char *nram_input = nram_compute_b + split_pipeline_size; - char *nram_output = nram_input + split_pipeline_size; - char *nram_target = nram_output + split_pipeline_size; - - const int32_t loop_num = (c + c_seg - 1) / c_seg; - const int32_t partition_nc = 1; - for (int32_t i = 0; i < loop_num; ++i) { - const int32_t c_index = i * c_seg; - const int32_t c_seg_curr = i == (loop_num - 1) ? 
c - c_index : c_seg; - startPipeline(input, target, weight, nram_compute_a, nram_compute_b, - nram_input, nram_target, nram_weight, nram_output, - has_weight, partition_nc, pingpong_offset, - pingpong_weight_offset, c_index, n, n_seg, c, c_seg_curr, - alpha, gamma, output); - } - } -} - -template -__mlu_global__ void MLUUnion1KernelFocalLossSigmoidForward( - const void *input, const void *target, const void *weight, const int32_t N, - const int32_t C, const float alpha, const float gamma, void *output) { - const int32_t n_seg = N / taskDim + (taskId == taskDim - 1) * (N % taskDim); - const T *input_offset = (T *)input + N / taskDim * taskId * C; - const int32_t *target_offset = (int32_t *)target + N / taskDim * taskId; - T *output_offset = (T *)output + N / taskDim * taskId * C; - - focalLossSigmoidForwardBlock((T *)input_offset, (int32_t *)target_offset, - (T *)weight, n_seg, C, alpha, gamma, - (T *)output_offset); -} -} // namespace forward - -namespace backward { -template -__mlu_func__ void loadInput(char *nram_input, char *nram_target, - const T *gdram_input, const int32_t *gdram_target, - const int32_t deal_n, const int32_t total_c, - const bool pingping_flag, const bool has_weight, - const int32_t nram_offset, - const int32_t gdram_offset) { - if (pingping_flag == PONG) { - nram_input += nram_offset; - nram_target += nram_offset; - } - - __memcpy_async(nram_target, gdram_target + gdram_offset / total_c, - deal_n * sizeof(int32_t), GDRAM2NRAM); - - char *nram_input_load = nram_input; - int32_t compute_align_size = 2 * NFU_ALIGN_SIZE; - if (has_weight) { - if (sizeof(T) == sizeof(half)) { - int32_t compute_align_num = compute_align_size / sizeof(float); - int32_t align_c = PAD_UP(total_c, compute_align_num); - int32_t compute_size = deal_n * align_c * sizeof(float); - nram_input_load += compute_size / 2; - } - int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T)); - int32_t total_c_size = total_c * sizeof(T); - int32_t align_c_size = align_c * 
sizeof(T); - __memcpy_async(nram_input_load, gdram_input + gdram_offset, total_c_size, - GDRAM2NRAM, align_c_size, total_c_size, deal_n - 1); - } else { - if (sizeof(T) == sizeof(half)) { - int32_t compute_size = - PAD_UP(deal_n * total_c * sizeof(float), compute_align_size); - nram_input_load += compute_size / 2; - } - int32_t load_size = deal_n * total_c * sizeof(T); - __memcpy_async(nram_input_load, gdram_input + gdram_offset, load_size, - GDRAM2NRAM); - } -} - -template -__mlu_func__ void sigmoid(T *dst_data, const T *src_data, - const int32_t elem_count) { - __bang_mul_const(dst_data, (T *)src_data, T(-1), elem_count); - __bang_active_exphp(dst_data, dst_data, elem_count); - __bang_add_const(dst_data, dst_data, T(1), elem_count); - __bang_active_reciphp(dst_data, dst_data, elem_count); -} - -template -__mlu_func__ void coreCompute(char *nram_input, const T *nram_weight, - const float *nram_flt_min, char *nram_pt, - char *nram_alpha_t, char *nram_temp, - char *nram_target, const float *nram_gamma, - char *nram_output, const float alpha, - const int32_t compute_num, const int32_t deal_n, - const int32_t total_c, const bool pingpong_flag, - const int32_t nram_offset, - const bool has_weight) { - if (pingpong_flag == PONG) { - nram_input += nram_offset; - nram_pt += nram_offset; - nram_alpha_t += nram_offset; - nram_temp += nram_offset; - nram_output += nram_offset; - nram_target += nram_offset; - } - - if (sizeof(T) == sizeof(half)) { - const int32_t compute_size = compute_num * sizeof(float); - char *nram_input_load = nram_input + compute_size / 2; - __bang_half2float((float *)nram_input, (half *)nram_input_load, - compute_num); - } - - // 0. alpha_t = alpha - 1 - __nramset((float *)nram_alpha_t, compute_num, (float)(alpha - 1.0)); - - // 1. 
pt = 1 - sigmoid(x) - sigmoid((float *)nram_pt, (float *)nram_input, compute_num); - __bang_mul_const((float *)nram_pt, (float *)nram_pt, (float)(-1), - compute_num); - __bang_add_const((float *)nram_pt, (float *)nram_pt, (float)1, compute_num); - - // 2. pt = target[n] == c ? sigmoid(x) : 1 - sigmoid(x) - // alpha_t = target[n] == c ? alpha : alpha - 1 - const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(float); - for (int n = 0; n < deal_n; n++) { - const int32_t target_value = ((int32_t *)nram_target)[n]; - if (target_value >= total_c || target_value < 0) continue; - int32_t c_offset = 0; - if (has_weight) { - int32_t c_align_num = nfu_align_num; - if (sizeof(T) == sizeof(half)) { - c_align_num += nfu_align_num; - } - c_offset = PAD_UP(total_c, c_align_num); - } else { - c_offset = total_c; - } - int32_t idx = n * c_offset + target_value; - *((float *)nram_pt + idx) = 1.0 - *((float *)nram_pt + idx); - *((float *)nram_alpha_t + idx) = alpha; - } - - // 3. temp = -alpha_t * e^(gamma * log(max(1 - pt, FLT_MIN)) - __bang_mul_const((float *)nram_temp, (float *)nram_pt, (float)(-1), - compute_num); - __bang_add_const((float *)nram_temp, (float *)nram_temp, (float)(1), - compute_num); - __bang_cycle_maxequal((float *)nram_temp, (float *)nram_temp, - (float *)nram_flt_min, compute_num, nfu_align_num); - __bang_active_loghp((float *)nram_temp, (float *)nram_temp, compute_num); - __bang_cycle_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_gamma, - compute_num, nfu_align_num); - __bang_active_exphp((float *)nram_temp, (float *)nram_temp, compute_num); - __bang_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_alpha_t, - compute_num); - __bang_mul_const((float *)nram_temp, (float *)nram_temp, (float)(-1), - compute_num); - - // 4. 
output = 1 - pt - gamma * pt * log(max(pt, FLT_MIN)) - __bang_cycle_maxequal((float *)nram_output, (float *)nram_pt, - (float *)nram_flt_min, compute_num, nfu_align_num); - __bang_active_loghp((float *)nram_output, (float *)nram_output, compute_num); - __bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_pt, - compute_num); - __bang_cycle_mul((float *)nram_output, (float *)nram_output, - (float *)nram_gamma, compute_num, nfu_align_num); - __bang_add((float *)nram_output, (float *)nram_output, (float *)nram_pt, - compute_num); - __bang_mul_const((float *)nram_output, (float *)nram_output, (float)(-1), - compute_num); - __bang_add_const((float *)nram_output, (float *)nram_output, (float)(1), - compute_num); - - // 5. output = output * temp - __bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_temp, - compute_num); - - if (sizeof(T) == sizeof(half)) { - __bang_float2half_rd((half *)nram_output, (float *)nram_output, - compute_num); - } - - if (has_weight) { - // with weight - for (int n = 0; n < deal_n; n++) { - int32_t c_align_num = nfu_align_num; - if (sizeof(T) == sizeof(half)) { - c_align_num += nfu_align_num; - } - int32_t align_c = PAD_UP(total_c, c_align_num); - int32_t target_value = ((int32_t *)nram_target)[n]; - T weight_value = nram_weight[target_value]; - __bang_mul_const((T *)nram_output + n * align_c, - (T *)nram_output + n * align_c, weight_value, align_c); - } - } -} - -template -__mlu_func__ void storeOutput(T *gdram_output, const char *nram_output, - const int32_t deal_n, const int32_t total_c, - const bool pingpong_flag, const bool has_weight, - const int32_t nram_offset, - const int32_t gdram_offset) { - if (pingpong_flag == PONG) { - nram_output += nram_offset; - } - const int32_t store_size = deal_n * total_c * sizeof(T); - if (has_weight) { - int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T)); - int32_t total_c_size = total_c * sizeof(T); - int32_t align_c_size = align_c * sizeof(T); - 
__memcpy_async(gdram_output + gdram_offset, nram_output, total_c_size, - NRAM2GDRAM, total_c_size, align_c_size, deal_n - 1); - } else { - __memcpy_async(gdram_output + gdram_offset, nram_output, store_size, - NRAM2GDRAM); - } -} - -template -__mlu_func__ void focalLossSigmoidBackwardBlock( - const T *input, const int32_t *target, const T *weight, const float gamma, - const float alpha, const int32_t total_n, const int32_t deal_n, - const int32_t total_c, T *output) { - // params per time slice - int32_t deal_num = deal_n * total_c; - int32_t deal_size = deal_num * sizeof(float); - int32_t compute_num = 0; - int32_t compute_size = 0; - int32_t compute_align_size = NFU_ALIGN_SIZE; - const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(T); - if (sizeof(T) == sizeof(half)) { - compute_align_size += NFU_ALIGN_SIZE; - } - const int32_t compute_align_num = compute_align_size / sizeof(float); - bool has_weight = false; - if (weight != NULL) { - has_weight = true; - int32_t align_c = PAD_UP(total_c, compute_align_num); - compute_num = deal_n * align_c; - compute_size = compute_num * sizeof(float); - } else { - compute_size = PAD_UP(deal_size, compute_align_size); - compute_num = compute_size / sizeof(float); - } - - // params per core - int32_t total_num = total_n * total_c; - int32_t num_per_core = PAD_DOWN(total_num / taskDim, deal_num); - int32_t loop_per_core = num_per_core / deal_num; - - /* NRAM partition: - * - * |-----------------ping pong--------------------| - * |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight| - * - * split_pipeline_num is 5: input, pt, alpha_t, temp, output. - * nram_reserved_line_num is 2: flt_min, gamma. 
- */ - const int32_t split_pipeline_num = 5; - const int32_t nram_reserved_line_num = 2; - int32_t target_deal_size = deal_n * sizeof(int32_t); - int32_t target_deal_size_align = PAD_UP(target_deal_size, NFU_ALIGN_SIZE); - // nram PING/PONG offset - int32_t ping_pong_offset = - compute_size * split_pipeline_num + target_deal_size_align; - - // gdram addr - int32_t *base_addr_target = - (int32_t *)target + taskId * loop_per_core * deal_n; - T *base_addr_input = (T *)input + taskId * num_per_core; - T *base_addr_output = output + taskId * num_per_core; - - // nram addr - char *nram_input = (char *)nram_buffer; - char *nram_pt = nram_input + compute_size; - char *nram_alpha_t = nram_pt + compute_size; - char *nram_temp = nram_alpha_t + compute_size; - char *nram_output = nram_temp + compute_size; - char *nram_target = nram_output + compute_size; - float *nram_flt_min = NULL; - float *nram_gamma = NULL; - T *nram_weight = NULL; - - if (!has_weight) { - nram_flt_min = (float *)(nram_buffer + MAX_NRAM_SIZE - - nram_reserved_line_num * NFU_ALIGN_SIZE); - nram_gamma = nram_flt_min + nfu_align_num; - } else { - int32_t weight_space = PAD_UP(total_c * sizeof(T), NFU_ALIGN_SIZE); - nram_flt_min = - (float *)(nram_buffer + MAX_NRAM_SIZE - - nram_reserved_line_num * NFU_ALIGN_SIZE - weight_space); - nram_gamma = nram_flt_min + nfu_align_num; - nram_weight = (T *)(nram_gamma + nfu_align_num); - __memcpy_async(nram_weight, weight, total_c * sizeof(T), GDRAM2NRAM); - } - - // nram set gamma and FLT_MIN - __nramset(nram_gamma, nfu_align_num, gamma); - __nramset(nram_flt_min, nfu_align_num, FLT_MIN); - - /* - * Pipeline: The pipeline is processed in three stages: Load, Compute, Store. - * The allocated memory space of NRAM is divided into two parts: - * PING and Pong. In a single time slice, PING is used to process - * IO stream and PONG is used for computation. Both of them are - * processed synchronously until finished. 
- * - * diagram of PINGPONG: - * |------|-----------------------------------------------------------------| - * | | space | - * |------|-----------------------------------------------------------------| - * | time | Ping | Pong | Ping | Pong | Ping | Pong | - * |------|-----------------------------------------------------------------| - * | 0 | L0 | | | | | | - * | 1 | C0 | L1 | | | | | - * | 2 | S0 | C1 | L2 | | | | - * | 3 | | S1 | C2 | L3 | | | - * | 4 | | | S2 | C3 | L4 | | - * | 5 | | | | S3 | C4 | L5 | - * | 6 | | | | | S4 | C5 | - * | 7 | | | | | | S5 | - * |------|-----------------------------------------------------------------| - */ - - // diagram of PINGPONG: L0 - if (loop_per_core > 0) { - loadInput(nram_input, nram_target, base_addr_input, base_addr_target, - deal_n, total_c, PING, has_weight, ping_pong_offset, 0); - __asm__ volatile("sync;"); - } - - // diagram of PINGPONG: C0 and L1 - if (loop_per_core > 1) { - coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, - nram_temp, nram_target, nram_gamma, nram_output, alpha, - compute_num, deal_n, total_c, PING, ping_pong_offset, - has_weight); - loadInput(nram_input, nram_target, base_addr_input, base_addr_target, - deal_n, total_c, PONG, has_weight, ping_pong_offset, deal_num); - __asm__ volatile("sync;"); - } - - for (int i = 0; i < loop_per_core - 2; ++i) { - if (i % 2 == PING) { - storeOutput(base_addr_output, nram_output, deal_n, total_c, PING, - has_weight, ping_pong_offset, i * deal_num); - coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, - nram_temp, nram_target, nram_gamma, nram_output, alpha, - compute_num, deal_n, total_c, PONG, ping_pong_offset, - has_weight); - loadInput(nram_input, nram_target, base_addr_input, base_addr_target, - deal_n, total_c, PING, has_weight, ping_pong_offset, - (i + 2) * deal_num); - } else { - storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, - has_weight, ping_pong_offset, i * deal_num); - 
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, - nram_temp, nram_target, nram_gamma, nram_output, alpha, - compute_num, deal_n, total_c, PING, ping_pong_offset, - has_weight); - loadInput(nram_input, nram_target, base_addr_input, base_addr_target, - deal_n, total_c, PONG, has_weight, ping_pong_offset, - (i + 2) * deal_num); - } - __asm__ volatile("sync;"); - } - - if (loop_per_core > 1) { - if ((loop_per_core - 2) % 2 == PING) { - storeOutput(base_addr_output, nram_output, deal_n, total_c, PING, - has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num); - coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, - nram_temp, nram_target, nram_gamma, nram_output, alpha, - compute_num, deal_n, total_c, PONG, ping_pong_offset, - has_weight); - } else { - storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, - has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num); - coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, - nram_temp, nram_target, nram_gamma, nram_output, alpha, - compute_num, deal_n, total_c, PING, ping_pong_offset, - has_weight); - } - __asm__ volatile("sync;"); - } - - if (loop_per_core > 0) { - if (loop_per_core == 1) { - coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, - nram_temp, nram_target, nram_gamma, nram_output, alpha, - compute_num, deal_n, total_c, PING, ping_pong_offset, - has_weight); - __asm__ volatile("sync;"); - } - if ((loop_per_core - 1) % 2 == PING) { - storeOutput(base_addr_output, nram_output, deal_n, total_c, PING, - has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num); - } else { - storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, - has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num); - } - } - - // process the remaining data which N remainder per core is less than deal_n - int32_t rem_for_all = total_num - num_per_core * taskDim; - if (rem_for_all == 0) return; - int32_t 
rem_n_for_all = rem_for_all / total_c; - int32_t rem_n_per_core = (rem_n_for_all + taskDim - 1) / taskDim; - int32_t rem_num_per_core = rem_n_per_core * total_c; - int32_t rem_num_per_core_align = 0; - int32_t rem_core_num = rem_for_all / rem_num_per_core; - - int32_t rem_n_for_last = rem_n_for_all % rem_n_per_core; - int32_t rem_num_for_last = rem_n_for_last * total_c; - int32_t rem_num_for_last_align = 0; - - if (has_weight) { - int32_t align_c = PAD_UP(total_c, compute_align_num); - rem_num_per_core_align = rem_n_per_core * align_c; - rem_num_for_last_align = rem_n_for_last * align_c; - } else { - rem_num_per_core_align = PAD_UP(rem_num_per_core, compute_align_num); - rem_num_for_last_align = PAD_UP(rem_num_for_last, compute_align_num); - } - - int32_t rem_addr_base = num_per_core * taskDim; - int32_t rem_target_addr_base = loop_per_core * deal_n * taskDim; - base_addr_target = (int32_t *)target + rem_target_addr_base; - base_addr_input = (T *)input + rem_addr_base; - base_addr_output = output + rem_addr_base; - - if (taskId < rem_core_num) { - loadInput(nram_input, nram_target, base_addr_input, base_addr_target, - rem_n_per_core, total_c, PING, has_weight, ping_pong_offset, - taskId * rem_num_per_core); - __asm__ volatile("sync;"); - coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, - nram_temp, nram_target, nram_gamma, nram_output, alpha, - rem_num_per_core_align, rem_n_per_core, total_c, PING, - ping_pong_offset, has_weight); - __asm__ volatile("sync;"); - storeOutput(base_addr_output, nram_output, rem_n_per_core, total_c, PING, - has_weight, ping_pong_offset, taskId * rem_num_per_core); - } else if (taskId == rem_core_num) { - if (rem_num_for_last == 0) return; - loadInput(nram_input, nram_target, base_addr_input, base_addr_target, - rem_n_for_last, total_c, PING, has_weight, ping_pong_offset, - taskId * rem_num_per_core); - __asm__ volatile("sync;"); - coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, - 
nram_temp, nram_target, nram_gamma, nram_output, alpha, - rem_num_for_last_align, rem_n_for_last, total_c, PING, - ping_pong_offset, has_weight); - __asm__ volatile("sync;"); - storeOutput(base_addr_output, nram_output, rem_n_for_last, total_c, PING, - has_weight, ping_pong_offset, taskId * rem_num_per_core); - } else { - return; - } -} - -template -__mlu_global__ void MLUUnion1KernelFocalLossSigmoidBackward( - const void *input, const void *target, const void *weight, - const float gamma, const float alpha, const int32_t total_n, - const int32_t deal_n, const int32_t total_c, void *output) { - focalLossSigmoidBackwardBlock((T *)input, (int32_t *)target, (T *)weight, - gamma, alpha, total_n, deal_n, total_c, - (T *)output); -} -} // namespace backward - -void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, - const cnrtDataType_t d_type, - const void *input, const void *target, - const void *weight, const int32_t N, - const int32_t C, const float alpha, - const float gamma, void *output) { - if (d_type == CNRT_FLOAT16) { - forward::MLUUnion1KernelFocalLossSigmoidForward< - half><<>>(input, target, weight, N, C, alpha, - gamma, output); - } else { - forward::MLUUnion1KernelFocalLossSigmoidForward< - float><<>>(input, target, weight, N, C, alpha, - gamma, output); - } -} - -void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, - const cnrtDataType_t d_type, - const void *input, const void *target, - const void *weight, const float gamma, - const float alpha, const int32_t dim_n, - const int32_t deal_n, const int32_t dim_c, - void *output) { - if (d_type == CNRT_FLOAT16) { - backward::MLUUnion1KernelFocalLossSigmoidBackward< - half><<>>(input, target, weight, gamma, alpha, - dim_n, deal_n, dim_c, output); - } else { - backward::MLUUnion1KernelFocalLossSigmoidBackward< - float><<>>(input, target, weight, gamma, alpha, - dim_n, deal_n, dim_c, output); - } -} diff --git 
a/mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu deleted file mode 100644 index 84e53aa..0000000 --- a/mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu +++ /dev/null @@ -1,431 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ - -#include "common_mlu_helper.hpp" -#include "iou3d_utils.hpp" - -#define SIZE_SRAM_BUF (MAX_SRAM_SIZE) - -/* NRAM buffer - * Suppose deal N boxes once time. 
----------------------------------------------------------------- -| Basic |score (1N)+ |intersect_pts(48N)| | -| |valid_box(1N) |+ ordered_pts(48N)| temp_long(72N) | -| |+ temp_buffer(10N)| | | -|--------------------------|------------------|----------------| -| Reuse | null | null |rotated_pts(16N)| -|-------|------------------|------------------|----------------| - ---------------------------------------------------------------------------- -| Basic | dist_ram(24N) | valid_pts(24N) |box1(5N) |box1_buffer(5KB) | -| | |+ nums_in_ram(1N)|+ box2(5N)|+nram_save(5KB) | -|--------------------------|-----------------|----------|-----------------| -| Reuse | vec_buffer(5N) | null | null | null | -|-------|------------------|-----------------|----------|-----------------| -Total Basic Memory Size = 239N * sizeof(float) + 10KB -*/ - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; -__mlu_shared__ char sram_buffer[SIZE_SRAM_BUF]; - -template -__mlu_func__ void iou3D_detection(int32_t &result_box_num, int32_t *output_data, - const T *boxes_data, float *scores_data, - const int core_limit, const int input_box_num, - const float iou_threshold, - mluMemcpyDirection_t scores_load_dir, - mluMemcpyDirection_t scores_store_dir, - mluMemcpyDirection_t boxes_load_dir) { - // NRAM divide by (2+4*COMPUTE_COUNT_ALIGN) copies of NRAM, counted by bytes - const int nram_save_limit_count = 256; - int box_read_limit_count = 256; - float div_thresh_iou = 1.0 / iou_threshold; - // every box require 239 * sizeof(float) space in nram; - const int32_t copies_of_nram = 239 * sizeof(float); - const int32_t limit = (MAX_NRAM_SIZE - 5 * box_read_limit_count * sizeof(T) - - nram_save_limit_count * sizeof(int32_t)) / - copies_of_nram; - - // x,y,z,dx,dy,dz,angle - const T *input_x_ptr = boxes_data; - const T *input_y_ptr = input_x_ptr + input_box_num; - const T *input_dx_ptr = input_y_ptr + 2 * input_box_num; - const T *input_dy_ptr = input_dx_ptr + input_box_num; - const T *input_angle_ptr = input_dy_ptr + 
2 * input_box_num; - float *input_score_ptr = scores_data; - - // data split - int avg_cluster = 0; - int rem_cluster = 0; - int len_cluster = 0; - int cluster_offset = 0; - if (clusterDim > 0) { - // union - avg_cluster = input_box_num / clusterDim; - rem_cluster = input_box_num % clusterDim; - len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0); - cluster_offset = avg_cluster * clusterId + - (clusterId <= rem_cluster ? clusterId : rem_cluster); - } else { - // block - len_cluster = input_box_num; - cluster_offset = 0; - } - int len_core = input_box_num; - int input_offset = 0; - if (core_limit > 1) { - int avg_core = len_cluster / coreDim; - int rem_core = len_cluster % coreDim; - len_core = avg_core + (coreId < rem_core ? 1 : 0); - int core_offset = - avg_core * coreId + (coreId <= rem_core ? coreId : rem_core); - input_offset = cluster_offset + core_offset; - } - - int32_t max_seg_pad = IOU3D_DOWN(limit, IOU3D_SIZE); - int repeat_iou_compute = len_core / max_seg_pad; - int remain_iou_compute = len_core % max_seg_pad; - - // basic consistent memory layout - void *score = ((char *)nram_buffer); - void *valid_box = ((char *)score) + 1 * max_seg_pad * sizeof(float); - void *temp_buffer = ((char *)valid_box) + 1 * max_seg_pad * sizeof(float); - void *intersect_pts_x = - ((char *)temp_buffer) + 10 * max_seg_pad * sizeof(float); - void *intersect_pts_y = - ((char *)intersect_pts_x) + 24 * max_seg_pad * sizeof(float); - void *ordered_pts_x = - ((char *)intersect_pts_y) + 24 * max_seg_pad * sizeof(float); - void *ordered_pts_y = - ((char *)ordered_pts_x) + 24 * max_seg_pad * sizeof(float); - void *temp_long_1 = - ((char *)ordered_pts_y) + 24 * max_seg_pad * sizeof(float); - void *temp_long_2 = ((char *)temp_long_1) + 24 * max_seg_pad * sizeof(float); - void *temp_long_3 = ((char *)temp_long_2) + 24 * max_seg_pad * sizeof(float); - void *dist_ram = ((char *)temp_long_3) + 24 * max_seg_pad * sizeof(float); - void *valid_pts = ((char *)dist_ram) + 24 * 
max_seg_pad * sizeof(float); - void *nums_in_ram = ((char *)valid_pts) + 24 * max_seg_pad * sizeof(float); - T *box1 = (T *)(((char *)nums_in_ram) + 1 * max_seg_pad * sizeof(float)); - T *box2 = (T *)(((char *)box1) + 5 * max_seg_pad * sizeof(float)); - void *box1_buffer = ((char *)box2) + 5 * max_seg_pad * sizeof(float); - int32_t *nram_save = - (int32_t *)(((char *)box1_buffer) + 5 * box_read_limit_count * sizeof(T)); - // nram_save ~ nram_save_limit_count * sizeof(int32_t) - int nram_save_count = 0; - - // reuse memory - void *rotated_pts1_x = ((char *)dist_ram); - void *rotated_pts1_y = - ((char *)rotated_pts1_x) + 4 * max_seg_pad * sizeof(float); - void *rotated_pts2_x = - ((char *)rotated_pts1_y) + 4 * max_seg_pad * sizeof(float); - void *rotated_pts2_y = - ((char *)rotated_pts2_x) + 4 * max_seg_pad * sizeof(float); - void *vec_buffer = ((char *)temp_long_1) + 5 * max_seg_pad * sizeof(float); - // vec_buffer ~ 16 * max_seg_pad * sizeof(float) - - // First, initialize ram with all 0, or could cause nan/inf unexcepted results - __bang_write_zero((unsigned char *)nram_buffer, copies_of_nram * max_seg_pad); - // number 8 and 0xff relay on box_read_limit_count initial as 256 - const int max_box_seg_id = (input_box_num - 1) >> 8; - const int last_rem_box_number = ((input_box_num - 1) & 0xff) + 1; - for (int32_t cur_box = 0; cur_box < input_box_num; ++cur_box) { - __sync_all(); - int box_seg_id = cur_box >> 8, box_id = cur_box & 0xff; - box_read_limit_count = box_seg_id == max_box_seg_id ? 
last_rem_box_number - : box_read_limit_count; - if (box_id == 0) { - // x,y,z,dx,dy,dz,angle - int offset_num = box_seg_id << 8; - // x - __memcpy((char *)box1_buffer, input_x_ptr + offset_num, - box_read_limit_count * 1 * sizeof(T), boxes_load_dir, - box_read_limit_count * 1 * sizeof(T), - box_read_limit_count * 1 * sizeof(T), 0); - // y - __memcpy((char *)box1_buffer + box_read_limit_count * 1 * sizeof(T), - input_y_ptr + offset_num, box_read_limit_count * 1 * sizeof(T), - boxes_load_dir, box_read_limit_count * 1 * sizeof(T), - box_read_limit_count * 1 * sizeof(T), 0); - // dx - __memcpy((char *)box1_buffer + box_read_limit_count * 2 * sizeof(T), - input_dx_ptr + offset_num, box_read_limit_count * 1 * sizeof(T), - boxes_load_dir, box_read_limit_count * 1 * sizeof(T), - box_read_limit_count * 1 * sizeof(T), 0); - // dy - __memcpy((char *)box1_buffer + box_read_limit_count * 3 * sizeof(T), - input_dy_ptr + offset_num, box_read_limit_count * 1 * sizeof(T), - boxes_load_dir, box_read_limit_count * 1 * sizeof(T), - box_read_limit_count * 1 * sizeof(T), 0); - // angle - __memcpy((char *)box1_buffer + box_read_limit_count * 4 * sizeof(T), - input_angle_ptr + offset_num, - box_read_limit_count * 1 * sizeof(T), boxes_load_dir, - box_read_limit_count * 1 * sizeof(T), - box_read_limit_count * 1 * sizeof(T), 0); - } - if (((float *)input_score_ptr)[cur_box] == 0) { - continue; - } - // save result - nram_save[nram_save_count] = cur_box; - result_box_num++; - nram_save_count++; - if (clusterId == 0 && coreId == 0 && - nram_save_count == nram_save_limit_count) { - pvLock(); - __memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t), - NRAM2GDRAM); - pvUnlock(); - output_data += nram_save_count; - nram_save_count = 0; - } - // prepare box1 - // x - __bang_write_value((float *)box1, max_seg_pad, - float(((T *)box1_buffer)[box_id])); - // y - __bang_write_value( - (float *)box1 + max_seg_pad, max_seg_pad, - float(((T *)box1_buffer)[box_id + 1 * box_read_limit_count])); 
- // dx - __bang_write_value( - (float *)box1 + max_seg_pad * 2, max_seg_pad, - float(((T *)box1_buffer)[box_id + 2 * box_read_limit_count])); - // dy - __bang_write_value( - (float *)box1 + max_seg_pad * 3, max_seg_pad, - float(((T *)box1_buffer)[box_id + 3 * box_read_limit_count])); - // angle - __bang_write_value( - (float *)box1 + max_seg_pad * 4, max_seg_pad, - float(((T *)box1_buffer)[box_id + 4 * box_read_limit_count])); - - float max_area = 1.0f * - ((T *)box1_buffer)[box_id + 2 * box_read_limit_count] * - ((T *)box1_buffer)[box_id + 3 * box_read_limit_count]; - // update score - - for (int i = 0; i <= repeat_iou_compute; i++) { - if (i == repeat_iou_compute && remain_iou_compute == 0) { - break; - } - int seg_len = max_seg_pad; - int cpy_len = - (i == repeat_iou_compute) ? remain_iou_compute : max_seg_pad; - // int half_offset = std::is_same::value ? max_seg_pad * 5 : 0; - int half_offset = (sizeof(T) == sizeof(half)) ? max_seg_pad * 5 : 0; - // score - __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, - cpy_len * sizeof(float), scores_load_dir, - cpy_len * sizeof(float), cpy_len * sizeof(float), 0); - // x - __memcpy(box2 + half_offset, input_x_ptr + input_offset + i * max_seg_pad, - cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T), - cpy_len * 1 * sizeof(T), 0); - // y - __memcpy(box2 + half_offset + seg_len * 1, - input_y_ptr + input_offset + i * max_seg_pad, - cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T), - cpy_len * 1 * sizeof(T), 0); - // dx - __memcpy(box2 + half_offset + seg_len * 2, - input_dx_ptr + input_offset + i * max_seg_pad, - cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T), - cpy_len * 1 * sizeof(T), 0); - // dy - __memcpy(box2 + half_offset + seg_len * 3, - input_dy_ptr + input_offset + i * max_seg_pad, - cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T), - cpy_len * 1 * sizeof(T), 0); - // angle - __memcpy(box2 + half_offset + seg_len * 4, - 
input_angle_ptr + input_offset + i * max_seg_pad, - cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T), - cpy_len * 1 * sizeof(T), 0); - // if (std::is_same::value) { - if (sizeof(T) == sizeof(half)) { - __bang_half2float((float *)box2, (half *)(box2 + half_offset), - seg_len * 5); - } - - // Calculate rotated vertices - void *temp1_ram = ((char *)temp_buffer); - void *temp2_ram = ((char *)temp_buffer) + seg_len * sizeof(float); - void *temp3_ram = ((char *)temp_buffer) + 2 * seg_len * sizeof(float); - void *temp4_ram = ((char *)temp_buffer) + 3 * seg_len * sizeof(float); - getRotatedVertices((float *)rotated_pts1_x, (float *)rotated_pts1_y, - (float *)box1, (float *)temp1_ram, (float *)temp2_ram, - (float *)temp3_ram, (float *)temp4_ram, seg_len); - getRotatedVertices((float *)rotated_pts2_x, (float *)rotated_pts2_y, - (float *)box2, (float *)temp1_ram, (float *)temp2_ram, - (float *)temp3_ram, (float *)temp4_ram, seg_len); - - __bang_write_zero((float *)valid_pts, 24 * seg_len); - __bang_write_zero((float *)nums_in_ram, seg_len); - __bang_write_value(((float *)valid_box), seg_len, 1.0f); - void *vec1_x = ((char *)vec_buffer); - void *vec1_y = ((char *)vec1_x) + 4 * seg_len * sizeof(float); - void *vec2_x = ((char *)vec1_y) + 4 * seg_len * sizeof(float); - void *vec2_y = ((char *)vec2_x) + 4 * seg_len * sizeof(float); - void *temp5_ram = ((char *)temp_buffer) + 4 * seg_len * sizeof(float); - void *temp6_ram = ((char *)temp_buffer) + 5 * seg_len * sizeof(float); - void *temp7_ram = ((char *)temp_buffer) + 6 * seg_len * sizeof(float); - void *temp8_ram = ((char *)temp_buffer) + 7 * seg_len * sizeof(float); - void *temp9_ram = ((char *)temp_buffer) + 8 * seg_len * sizeof(float); - void *temp10_ram = ((char *)temp_buffer) + 9 * seg_len * sizeof(float); - - // Get all intersection points - getIntersectPts( - (float *)rotated_pts1_x, (float *)rotated_pts1_y, - (float *)rotated_pts2_x, (float *)rotated_pts2_y, (float *)vec1_x, - (float *)vec1_y, (float 
*)vec2_x, (float *)vec2_y, - (float *)intersect_pts_x, (float *)intersect_pts_y, - (float *)valid_pts, (float *)nums_in_ram, (float *)temp1_ram, - (float *)temp2_ram, (float *)temp3_ram, (float *)temp4_ram, - (float *)temp5_ram, (float *)temp6_ram, (float *)temp7_ram, - (float *)temp8_ram, (float *)temp9_ram, (float *)temp10_ram, seg_len); - - // Where nums_in <= 2, set valid_box to false - __bang_write_value((float *)temp9_ram, COMPUTE_COUNT_ALIGN, (float)2); - __bang_cycle_gt((float *)temp1_ram, (float *)nums_in_ram, - (float *)temp9_ram, seg_len, COMPUTE_COUNT_ALIGN); - __bang_and((float *)valid_box, (float *)valid_box, (float *)temp1_ram, - seg_len); - __bang_cycle_and((float *)valid_pts, (float *)valid_pts, - (float *)valid_box, 24 * seg_len, seg_len); - - // Convex-hull-graham to order the intersection points in clockwise order - // and find the contour area - - convexHullGraham( - (float *)intersect_pts_x, (float *)intersect_pts_y, - (float *)ordered_pts_x, (float *)ordered_pts_y, (float *)dist_ram, - (float *)valid_box, (float *)valid_pts, (float *)nums_in_ram, - (float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram, - (float *)temp_long_1, (float *)temp_long_2, (float *)temp_long_3, - seg_len, seg_len); - // Calculate polygon area - // set temp1 = intersection part area - polygonArea((float *)ordered_pts_x, (float *)ordered_pts_y, - (float *)valid_box, (float *)valid_pts, (float *)nums_in_ram, - (float *)temp1_ram, (float *)temp2_ram, (float *)temp3_ram, - (float *)temp4_ram, (float *)temp5_ram, (float *)temp6_ram, - (float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram, - seg_len); - // area - __bang_mul((float *)temp2_ram, (float *)box2 + seg_len * 2, - (float *)box2 + seg_len * 3, seg_len); - // get the area_U: area + max_area - area_I - __bang_add_scalar((float *)temp2_ram, (float *)temp2_ram, float(max_area), - seg_len); - __bang_sub((float *)temp2_ram, (float *)temp2_ram, (float *)temp1_ram, - seg_len); // area_U - if (iou_threshold > 0.0) 
{ - __bang_mul_scalar((float *)temp1_ram, (float *)temp1_ram, - div_thresh_iou, seg_len); - } else { - __bang_mul_scalar((float *)temp2_ram, (float *)temp2_ram, iou_threshold, - seg_len); - } - __bang_ge((float *)temp1_ram, (float *)temp2_ram, (float *)temp1_ram, - seg_len); - __bang_mul((float *)score, (float *)score, (float *)temp1_ram, seg_len); - - pvLock(); - __memcpy(input_score_ptr + input_offset + i * max_seg_pad, score, - cpy_len * sizeof(float), scores_store_dir, - cpy_len * sizeof(float), cpy_len * sizeof(float), 0); - pvUnlock(); - } - } - if (clusterId == 0 && coreId == 0 && nram_save_count) { - pvLock(); - __memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t), - NRAM2GDRAM); - pvUnlock(); - } -} -__mlu_global__ void MLUBlockorUnionIKernelOU3D( - const void *input_boxes, const int input_box_num, const float iou_threshold, - const cnrtDataType_t data_type_input, void *workspace, void *result_num, - void *output) { - int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 
4 : 2; - mluMemcpyDirection_t scores_load_dir = GDRAM2NRAM; - mluMemcpyDirection_t scores_store_dir = NRAM2GDRAM; - mluMemcpyDirection_t boxes_load_dir = GDRAM2NRAM; - float *scores_data = (float *)workspace; - float *boxes_data = (float *)input_boxes; - const int cluster_score_size = input_box_num * sizeof(float); - const int cluster_boxes_size = input_box_num * 7 * input_dwidth; - char *sram_score = (char *)sram_buffer; - char *sram_boxes = (char *)sram_buffer + cluster_score_size; - if (clusterDim == 1 && SIZE_SRAM_BUF > cluster_score_size) { - scores_data = (float *)sram_score; - scores_load_dir = SRAM2NRAM; - scores_store_dir = NRAM2SRAM; - if (coreId == 0x80) { - __sramset((void *)sram_buffer, input_box_num, 1.0f); - } - } else { - if (coreId == 0) { - __gdramset(scores_data, input_box_num, 1.0f); - } - } - if (clusterDim == 1 && - SIZE_SRAM_BUF - cluster_score_size >= cluster_boxes_size) { - boxes_load_dir = SRAM2NRAM; - boxes_data = (float *)sram_boxes; - if (coreId == 0x80) { - __memcpy((char *)boxes_data, (char *)input_boxes, cluster_boxes_size, - GDRAM2SRAM); - } - } - __sync_cluster(); - - int32_t result_box_num = 0; - int32_t *out_data = (int32_t *)output; - - switch (data_type_input) { - default: { return; } - case CNRT_FLOAT16: { - iou3D_detection(result_box_num, out_data, (half *)boxes_data, scores_data, - taskDim, input_box_num, iou_threshold, scores_load_dir, - scores_store_dir, boxes_load_dir); - }; break; - case CNRT_FLOAT32: { - iou3D_detection(result_box_num, out_data, boxes_data, scores_data, - taskDim, input_box_num, iou_threshold, scores_load_dir, - scores_store_dir, boxes_load_dir); - }; break; - } - ((int32_t *)result_num)[0] = result_box_num; -} - -void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const cnrtDataType_t data_type_input, const void *boxes_dram, - const int input_box_num, const float iou_threshold, - void *workspace, void *output_size, void *output) { - switch (k_type) { - default: { return; 
} - case CNRT_FUNC_TYPE_BLOCK: - case CNRT_FUNC_TYPE_UNION1: - case CNRT_FUNC_TYPE_UNION2: - case CNRT_FUNC_TYPE_UNION4: - case CNRT_FUNC_TYPE_UNION8: - case CNRT_FUNC_TYPE_UNION16: { - MLUBlockorUnionIKernelOU3D<<>>( - (void *)boxes_dram, input_box_num, iou_threshold, data_type_input, - workspace, output_size, output); - }; break; - } -} diff --git a/mmcv/ops/csrc/common/mlu/iou3d_utils.hpp b/mmcv/ops/csrc/common/mlu/iou3d_utils.hpp deleted file mode 100644 index b98ffe2..0000000 --- a/mmcv/ops/csrc/common/mlu/iou3d_utils.hpp +++ /dev/null @@ -1,695 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ - -#ifndef IOU3D_UTILS_HPP_ -#define IOU3D_UTILS_HPP_ -#include "common_mlu_helper.hpp" - -#define IOU3D_SIZE 64 -#define IOU3D_UP(x, y) (x / y + (int)(x % y > 0)) * y -#define IOU3D_DOWN(x, y) (x / y) * y -#define SIZE_NRAM_BUF (MAX_NRAM_SIZE) -#define SIZE_SRAM_BUF (MAX_SRAM_SIZE) -#define COMPUTE_COUNT_ALIGN 64 -#define INFO_NUM (5) // score, x1, y1, x2, y2 -#define REDUCE_NUM \ - (7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input) -#define SINGLE_BOX_DIM 5 -#define MEMORY_CORE (0x80) -__mlu_func__ void pvLock() { -#if __BANG_ARCH__ == 270 - if (coreId != MEMORY_CORE) { - __bang_lock(0, 0); - } -#endif -} - -__mlu_func__ void pvUnlock() { -#if __BANG_ARCH__ == 270 - if (coreId != MEMORY_CORE) { - __bang_unlock(0, 0); - } -#endif -} - -// cross2d(A, B) = A.x * B.y - A.y * B.x; -template -inline __mlu_func__ void cross2d(T *result, const T *p1_x, const T *p1_y, - const T *p2_x, const T *p2_y, - const int &length, T *temp_ram) { - __bang_mul((T *)temp_ram, (T *)p1_x, (T *)p2_y, length); - __bang_mul((T *)result, (T *)p1_y, (T *)p2_x, length); - __bang_sub((T *)result, (T *)temp_ram, (T *)result, length); -} - -// dot2d(A, B) = A.x * B.x + A.y * B.y -template -inline __mlu_func__ void dot2d(T *result, const T *p1_x, const T *p1_y, - const T *p2_x, const T *p2_y, const int &length, - T *temp_ram) { - __bang_mul((T *)temp_ram, (T *)p1_x, (T *)p2_x, length); - __bang_mul((T *)result, (T *)p1_y, (T *)p2_y, length); - __bang_add((T *)result, (T *)temp_ram, (T *)result, length); -} - -template -__mlu_func__ void getRotatedVertices(T *pts_x, T *pts_y, T *box, T *temp1, - T *temp2, T *temp3, T *temp4, - const uint32_t &actual_compute_box_num) { -// T cosTheta2 = (T)cos(theta) * 0.5f; -- temp1 -// T sinTheta2 = (T)sin(theta) * 0.5f; -- temp2 -// theta is the box's 5th data: a, rotated radian; -#if __BANG_ARCH__ >= 300 - __bang_cos((float *)temp1, ((float *)box) + 4 * 
actual_compute_box_num, - actual_compute_box_num); - __bang_sin((float *)temp2, ((float *)box) + 4 * actual_compute_box_num, - actual_compute_box_num); -#else - __bang_taylor4_cos((T *)temp1, ((T *)box) + 4 * actual_compute_box_num, - (T *)temp3, (T *)temp4, actual_compute_box_num); - __bang_taylor4_sin((T *)temp2, ((T *)box) + 4 * actual_compute_box_num, - (T *)temp3, (T *)temp4, actual_compute_box_num); -#endif - __bang_mul_scalar((T *)temp1, (T *)temp1, (T)0.5, actual_compute_box_num); - __bang_mul_scalar((T *)temp2, (T *)temp2, (T)0.5, actual_compute_box_num); - - // Temp3 = sinTheta2 * box.h; - // Temp4 = cosTheta2 * box.w; - __bang_mul((T *)temp3, (T *)temp2, ((T *)box) + 3 * actual_compute_box_num, - actual_compute_box_num); - __bang_mul((T *)temp4, (T *)temp1, ((T *)box) + 2 * actual_compute_box_num, - actual_compute_box_num); - // pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; - // pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; - __bang_sub((T *)pts_x, (T *)box, (T *)temp3, actual_compute_box_num); - __bang_sub((T *)pts_x, (T *)pts_x, (T *)temp4, actual_compute_box_num); - __bang_add((T *)pts_x + 1 * actual_compute_box_num, (T *)box, (T *)temp3, - actual_compute_box_num); - __bang_sub((T *)pts_x + 1 * actual_compute_box_num, - (T *)pts_x + 1 * actual_compute_box_num, (T *)temp4, - actual_compute_box_num); - // Temp3 = cosTheta2 * box.h; - // Temp4 = sinTheta2 * box.w; - __bang_mul((T *)temp3, (T *)temp1, box + 3 * actual_compute_box_num, - actual_compute_box_num); - __bang_mul((T *)temp4, (T *)temp2, box + 2 * actual_compute_box_num, - actual_compute_box_num); - // pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; - // pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; - __bang_add((T *)pts_y, (T *)box + 1 * actual_compute_box_num, (T *)temp3, - actual_compute_box_num); - __bang_sub((T *)pts_y, (T *)pts_y, (T *)temp4, actual_compute_box_num); - __bang_sub((T *)pts_y + 1 * actual_compute_box_num, - (T 
*)box + 1 * actual_compute_box_num, (T *)temp3, - actual_compute_box_num); - __bang_sub((T *)pts_y + 1 * actual_compute_box_num, - (T *)pts_y + 1 * actual_compute_box_num, (T *)temp4, - actual_compute_box_num); - // pts[2].x = 2 * box.x_ctr - pts[0].x; - // pts[3].x = 2 * box.x_ctr - pts[1].x; - __bang_add((T *)pts_x + 2 * actual_compute_box_num, (T *)box, (T *)box, - actual_compute_box_num); - __bang_sub((T *)pts_x + 2 * actual_compute_box_num, - (T *)pts_x + 2 * actual_compute_box_num, (T *)pts_x, - actual_compute_box_num); - __bang_add((T *)pts_x + 3 * actual_compute_box_num, (T *)box, (T *)box, - actual_compute_box_num); - __bang_sub((T *)pts_x + 3 * actual_compute_box_num, - (T *)pts_x + 3 * actual_compute_box_num, - (T *)pts_x + 1 * actual_compute_box_num, actual_compute_box_num); - // pts[2].y = 2 * box.y_ctr - pts[0].y; - // pts[3].y = 2 * box.y_ctr - pts[1].y; - __bang_add((T *)pts_y + 2 * actual_compute_box_num, - (T *)box + 1 * actual_compute_box_num, - (T *)box + 1 * actual_compute_box_num, actual_compute_box_num); - __bang_sub((T *)pts_y + 2 * actual_compute_box_num, - (T *)pts_y + 2 * actual_compute_box_num, (T *)pts_y, - actual_compute_box_num); - __bang_add((T *)pts_y + 3 * actual_compute_box_num, - (T *)box + 1 * actual_compute_box_num, - (T *)box + 1 * actual_compute_box_num, actual_compute_box_num); - __bang_sub((T *)pts_y + 3 * actual_compute_box_num, - (T *)pts_y + 3 * actual_compute_box_num, - (T *)pts_y + 1 * actual_compute_box_num, actual_compute_box_num); -} - -template -__mlu_func__ void getIntersectPts(T *rotated_pts1_x, T *rotated_pts1_y, - T *rotated_pts2_x, T *rotated_pts2_y, - T *vec1_x, T *vec1_y, T *vec2_x, T *vec2_y, - T *intersect_pts_x, T *intersect_pts_y, - T *valid_pts, T *nums_in_ram, T *temp1_ram, - T *temp2_ram, T *temp3_ram, T *temp4_ram, - T *temp5_ram, T *temp6_ram, T *temp7_ram, - T *temp8_ram, T *temp9_ram, T *temp10_ram, - const uint32_t &actual_compute_box_num) { -// Initialize const data to ram -// temp3 = const 
1e-14(@float), length = COMPUTE_COUNT_ALIGN -#if __BANG_ARCH__ >= 300 - __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (T)1e-14); -#else - // NOTE: Since active_reciphp function has strict value range, - // [2.2205e-16, 2e6]@float, [0.00391, 65504]@half - __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (float)1e-14); -#endif - // temp4 = const T(0), length = COMPUTE_COUNT_ALIGN - __bang_write_value((T *)temp4_ram, COMPUTE_COUNT_ALIGN, (T)0); - // temp5 = const T(1), length = COMPUTE_COUNT_ALIGN - __bang_write_value((T *)temp5_ram, COMPUTE_COUNT_ALIGN, (T)1); - - // Line vector, from p1 to p2 is: p1+(p2-p1)*t, t=[0,1] - // for i = 0~3, vec[i] = pts[(i+1)%4] - pts[i] - __bang_sub((T *)vec1_x, (T *)rotated_pts1_x + actual_compute_box_num, - (T *)rotated_pts1_x, 3 * actual_compute_box_num); - __bang_sub((T *)vec1_x + 3 * actual_compute_box_num, (T *)rotated_pts1_x, - (T *)rotated_pts1_x + 3 * actual_compute_box_num, - actual_compute_box_num); - __bang_sub((T *)vec1_y, (T *)rotated_pts1_y + actual_compute_box_num, - (T *)rotated_pts1_y, 3 * actual_compute_box_num); - __bang_sub((T *)vec1_y + 3 * actual_compute_box_num, (T *)rotated_pts1_y, - (T *)rotated_pts1_y + 3 * actual_compute_box_num, - actual_compute_box_num); - - __bang_sub((T *)vec2_x, (T *)rotated_pts2_x + actual_compute_box_num, - (T *)rotated_pts2_x, 3 * actual_compute_box_num); - __bang_sub((T *)vec2_x + 3 * actual_compute_box_num, (T *)rotated_pts2_x, - (T *)rotated_pts2_x + 3 * actual_compute_box_num, - actual_compute_box_num); - __bang_sub((T *)vec2_y, (T *)rotated_pts2_y + actual_compute_box_num, - (T *)rotated_pts2_y, 3 * actual_compute_box_num); - __bang_sub((T *)vec2_y + 3 * actual_compute_box_num, (T *)rotated_pts2_y, - (T *)rotated_pts2_y + 3 * actual_compute_box_num, - actual_compute_box_num); - - // First, line test - test all line combos for intersection, 4x4 possible - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - // T det = cross2d(vec2[j], vec1[i]) -- temp2 
- cross2d((T *)temp2_ram, (T *)vec2_x + j * actual_compute_box_num, - (T *)vec2_y + j * actual_compute_box_num, - (T *)vec1_x + i * actual_compute_box_num, - (T *)vec1_y + i * actual_compute_box_num, - actual_compute_box_num, (T *)temp1_ram); - // temp8 = sign(det), since active_reciphp only receive positive values - __bang_active_sign((T *)temp8_ram, (T *)temp2_ram, - actual_compute_box_num); - // deal with parallel lines, temp2 = fabs(det), temp1 = temp2 > 1e-14 - __bang_active_abs((T *)temp2_ram, (T *)temp2_ram, actual_compute_box_num); - __bang_cycle_gt((T *)temp1_ram, (T *)temp2_ram, (T *)temp3_ram, - actual_compute_box_num, COMPUTE_COUNT_ALIGN); - // Where temp1 = false, set recip input to 1, avoiding recip(0), cause inf - __bang_not((T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num); - __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram, - actual_compute_box_num); - __bang_add((T *)temp2_ram, (T *)temp2_ram, (T *)temp9_ram, - actual_compute_box_num); -// temp2 = 1/temp2, use mult (1/temp2) instead of div temp2 -#if __BANG_ARCH__ >= 300 - __bang_recip((float *)temp2_ram, (float *)temp2_ram, - actual_compute_box_num); -#else - // NOTE: active_reciphp function has strict value range: - // [2.2205e-16, 2e6]@float, [0.00391, 65504]@half - __bang_active_reciphp((T *)temp2_ram, (T *)temp2_ram, - actual_compute_box_num); -#endif - // Restore temp2 invalid box value 1 and sign-bit - __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram, - actual_compute_box_num); - __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp8_ram, - actual_compute_box_num); - - // auto vec12 = pts2[j] - pts1[i], (temp6, temp7) = (x, y) - __bang_sub((T *)temp6_ram, - (T *)rotated_pts2_x + j * actual_compute_box_num, - (T *)rotated_pts1_x + i * actual_compute_box_num, - actual_compute_box_num); - __bang_sub((T *)temp7_ram, - (T *)rotated_pts2_y + j * actual_compute_box_num, - (T *)rotated_pts1_y + i * actual_compute_box_num, - actual_compute_box_num); - - // T t1 = 
cross2d(vec2[j], vec12) mult (1/det) -- temp8 - cross2d((T *)temp8_ram, (T *)vec2_x + j * actual_compute_box_num, - (T *)vec2_y + j * actual_compute_box_num, (T *)temp6_ram, - (T *)temp7_ram, actual_compute_box_num, (T *)temp9_ram); - __bang_mul((T *)temp8_ram, (T *)temp8_ram, (T *)temp2_ram, - actual_compute_box_num); - - // temp1 &= (t1 >= 0.0f && t1 <= 1.0f) -- temp9 - __bang_cycle_ge((T *)temp9_ram, (T *)temp8_ram, (T *)temp4_ram, - actual_compute_box_num, COMPUTE_COUNT_ALIGN); - __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram, - actual_compute_box_num); - __bang_cycle_le((T *)temp9_ram, (T *)temp8_ram, (T *)temp5_ram, - actual_compute_box_num, COMPUTE_COUNT_ALIGN); - __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram, - actual_compute_box_num); - - // T t2 = cross2d(vec1[i], vec12) mult temp2 -- temp9 - // NOTE: temp8(t1) is used after, reuse temp7(p2_y) as cross2d temp ram - cross2d((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num, - (T *)vec1_y + i * actual_compute_box_num, (T *)temp6_ram, - (T *)temp7_ram, actual_compute_box_num, (T *)temp7_ram); - __bang_mul((T *)temp9_ram, (T *)temp9_ram, (T *)temp2_ram, - actual_compute_box_num); - - // temp1 &= (t2 >= 0.0f && t2 <= 1.0f) -- temp9 - __bang_cycle_ge((T *)temp7_ram, (T *)temp9_ram, (T *)temp4_ram, - actual_compute_box_num, COMPUTE_COUNT_ALIGN); - __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram, - actual_compute_box_num); - __bang_cycle_le((T *)temp7_ram, (T *)temp9_ram, (T *)temp5_ram, - actual_compute_box_num, COMPUTE_COUNT_ALIGN); - __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram, - actual_compute_box_num); - - // intersections = (pts1[i] + vec1[i] * t1) * temp1 - __bang_mul((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num, - (T *)temp8_ram, actual_compute_box_num); - __bang_add((T *)temp9_ram, - (T *)rotated_pts1_x + i * actual_compute_box_num, - (T *)temp9_ram, actual_compute_box_num); - __bang_mul((T *)intersect_pts_x + (4 * i + j) * 
actual_compute_box_num, - (T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num); - __bang_mul((T *)temp9_ram, (T *)vec1_y + i * actual_compute_box_num, - (T *)temp8_ram, actual_compute_box_num); - __bang_add((T *)temp9_ram, - (T *)rotated_pts1_y + i * actual_compute_box_num, - (T *)temp9_ram, actual_compute_box_num); - __bang_mul((T *)intersect_pts_y + (4 * i + j) * actual_compute_box_num, - (T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num); - - // Assign `valid_pts` bit and accumulate `nums_in` of valid points of each - // box pair - __bang_or((T *)valid_pts + (4 * i + j) * actual_compute_box_num, - (T *)valid_pts + (4 * i + j) * actual_compute_box_num, - (T *)temp1_ram, actual_compute_box_num); - __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram, - actual_compute_box_num); - } - } - - // Check for vertices of rect1 inside rect2 - // temp5 = ABdotAB - dot2d((T *)temp5_ram, (T *)vec2_x, (T *)vec2_y, (T *)vec2_x, (T *)vec2_y, - actual_compute_box_num, (T *)temp9_ram); - // temp6 = ADdotAD - dot2d((T *)temp6_ram, (T *)vec2_x + 3 * actual_compute_box_num, - (T *)vec2_y + 3 * actual_compute_box_num, - (T *)vec2_x + 3 * actual_compute_box_num, - (T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num, - (T *)temp9_ram); - // assume ABCD is the rectangle, and P is the point to be judged - // P is inside ABCD iff. 
P's projection on AB lines within AB - // and P's projection on AD lies within AD - for (int i = 0; i < 4; i++) { - // AP = pts1[i] - pts2[0] = (temp7, temp8) - __bang_sub((T *)temp7_ram, (T *)rotated_pts1_x + i * actual_compute_box_num, - (T *)rotated_pts2_x, actual_compute_box_num); - __bang_sub((T *)temp8_ram, (T *)rotated_pts1_y + i * actual_compute_box_num, - (T *)rotated_pts2_y, actual_compute_box_num); - - // temp9 = APdotAB = dot2d(AP, AB) - dot2d((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec2_x, - (T *)vec2_y, actual_compute_box_num, (T *)temp2_ram); - // temp10 = APdotAD = -dot2d(AP, DA) - dot2d((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram, - (T *)vec2_x + 3 * actual_compute_box_num, - (T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num, - (T *)temp2_ram); - __bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1, - actual_compute_box_num); - - // ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= - // ADdotAD)) - __bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram, - actual_compute_box_num, COMPUTE_COUNT_ALIGN); - __bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram, - actual_compute_box_num, COMPUTE_COUNT_ALIGN); - __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram, - actual_compute_box_num); - __bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram, - actual_compute_box_num); - __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram, - actual_compute_box_num); - __bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram, - actual_compute_box_num); - __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram, - actual_compute_box_num); - - // 16 means the 4x4 possible intersection points above - __bang_mul((T *)intersect_pts_x + (16 + i) * actual_compute_box_num, - (T *)temp1_ram, (T *)rotated_pts1_x + i * actual_compute_box_num, - actual_compute_box_num); - __bang_mul((T *)intersect_pts_y + (16 + i) * actual_compute_box_num, - (T *)temp1_ram, (T *)rotated_pts1_y + i * 
actual_compute_box_num, - actual_compute_box_num); - - // assign valid_pts bit and accumulate nums of valid points of each box pair - __bang_or((T *)valid_pts + (16 + i) * actual_compute_box_num, - (T *)valid_pts + (16 + i) * actual_compute_box_num, - (T *)temp1_ram, actual_compute_box_num); - __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram, - actual_compute_box_num); - } - - // Reverse the check - check for vertices of rect2 inside rect1 - // temp5 = ABdotAB - dot2d((T *)temp5_ram, (T *)vec1_x, (T *)vec1_y, (T *)vec1_x, (T *)vec1_y, - actual_compute_box_num, (T *)temp9_ram); - // temp6 = ADdotAD - dot2d((T *)temp6_ram, (T *)vec1_x + 3 * actual_compute_box_num, - (T *)vec1_y + 3 * actual_compute_box_num, - (T *)vec1_x + 3 * actual_compute_box_num, - (T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num, - (T *)temp9_ram); - for (int i = 0; i < 4; i++) { - // AP = pts2[i] - pts1[0] = (temp7, temp8) - __bang_sub((T *)temp7_ram, (T *)rotated_pts2_x + i * actual_compute_box_num, - (T *)rotated_pts1_x, actual_compute_box_num); - __bang_sub((T *)temp8_ram, (T *)rotated_pts2_y + i * actual_compute_box_num, - (T *)rotated_pts1_y, actual_compute_box_num); - - // temp9 = APdotAB = dot2d(AP, AB) - dot2d((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec1_x, - (T *)vec1_y, actual_compute_box_num, (T *)temp2_ram); - // temp10 = APdotAD = -dot2d(AP, DA) - dot2d((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram, - (T *)vec1_x + 3 * actual_compute_box_num, - (T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num, - (T *)temp2_ram); - __bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1, - actual_compute_box_num); - - // ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= - // ADdotAD)) - __bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram, - actual_compute_box_num, COMPUTE_COUNT_ALIGN); - __bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram, - actual_compute_box_num, COMPUTE_COUNT_ALIGN); - 
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram, - actual_compute_box_num); - __bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram, - actual_compute_box_num); - __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram, - actual_compute_box_num); - __bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram, - actual_compute_box_num); - __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram, - actual_compute_box_num); - - // 20 means the (4x4+4) possible intersection points above - __bang_mul((T *)intersect_pts_x + (20 + i) * actual_compute_box_num, - (T *)temp1_ram, (T *)rotated_pts2_x + i * actual_compute_box_num, - actual_compute_box_num); - __bang_mul((T *)intersect_pts_y + (20 + i) * actual_compute_box_num, - (T *)temp1_ram, (T *)rotated_pts2_y + i * actual_compute_box_num, - actual_compute_box_num); - - // assign valid_pts bit and accumulate nums of valid points of each box pair - __bang_or((T *)valid_pts + (20 + i) * actual_compute_box_num, - (T *)valid_pts + (20 + i) * actual_compute_box_num, - (T *)temp1_ram, actual_compute_box_num); - __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram, - actual_compute_box_num); - } -} - -template -__mlu_func__ void convexHullGraham( - T *intersect_pts_x, T *intersect_pts_y, T *ordered_pts_x, T *ordered_pts_y, - T *dist_ram, T *valid_box, T *valid_pts, T *nums_in_ram, T *temp1_ram, - T *temp2_ram, T *temp3_ram, T *temp_long_1, T *temp_long_2, T *temp_long_3, - const uint32_t &actual_box_num, const uint32_t &actual_compute_box_num) { - // Step1. Find the point with minimum y, if more than 1 points have the same - // minimum y, - // pick the one with the minimum x. 
- // set p[i].y to max_y_value if not valid_pts, to avoid invalid result - // 24 means all possible intersection points - __bang_max((T *)temp2_ram, (T *)intersect_pts_y, 24 * actual_compute_box_num); - __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, ((T *)temp2_ram)[0]); - __bang_not((T *)temp_long_1, (T *)valid_pts, 24 * actual_compute_box_num); - __bang_cycle_mul((T *)temp_long_1, (T *)temp_long_1, (T *)temp3_ram, - 24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN); - __bang_mul((T *)temp_long_2, (T *)intersect_pts_y, (T *)valid_pts, - 24 * actual_compute_box_num); - __bang_add((T *)temp_long_2, (T *)temp_long_2, (T *)temp_long_1, - 24 * actual_compute_box_num); - // temp2 = min_y_value(temp_long_2), use min_pool, channel=box_num, h=1, w=24 - __bang_minpool((T *)temp2_ram, (T *)temp_long_2, actual_compute_box_num, 1, - 24, 1, 24, 1, 24); - __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)valid_box, - actual_compute_box_num); - - // set p[i].x to max_x_value if not min_y point - __bang_max((T *)temp1_ram, (T *)intersect_pts_x, 24 * actual_compute_box_num); - __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, ((T *)temp1_ram)[0]); - __bang_cycle_eq((T *)temp_long_1, (T *)temp_long_2, (T *)temp2_ram, - 24 * actual_compute_box_num, actual_compute_box_num); - __bang_and((T *)temp_long_1, (T *)temp_long_1, (T *)valid_pts, - 24 * actual_compute_box_num); - __bang_not((T *)temp_long_3, (T *)temp_long_1, 24 * actual_compute_box_num); - __bang_cycle_mul((T *)temp_long_3, (T *)temp_long_3, (T *)temp3_ram, - 24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN); - __bang_mul((T *)temp_long_1, (T *)intersect_pts_x, (T *)temp_long_1, - 24 * actual_compute_box_num); - __bang_add((T *)temp_long_1, (T *)temp_long_1, (T *)temp_long_3, - 24 * actual_compute_box_num); - // temp3 = min_x_value(temp_long_1), use min_pool, channel=box_num, h=1, w=24 - __bang_minpool((T *)temp3_ram, (T *)temp_long_1, actual_compute_box_num, 1, - 24, 1, 24, 1, 24); - __bang_mul((T *)temp3_ram, 
(T *)temp3_ram, (T *)valid_box, - actual_compute_box_num); - - // Step2. All points subtract starting-point (for sorting in the next step) - __bang_cycle_sub((T *)ordered_pts_x, (T *)intersect_pts_x, (T *)temp3_ram, - 24 * actual_compute_box_num, actual_compute_box_num); - __bang_cycle_sub((T *)ordered_pts_y, (T *)intersect_pts_y, (T *)temp2_ram, - 24 * actual_compute_box_num, actual_compute_box_num); - __bang_mul((T *)ordered_pts_x, (T *)ordered_pts_x, (T *)valid_pts, - 24 * actual_compute_box_num); - __bang_mul((T *)ordered_pts_y, (T *)ordered_pts_y, (T *)valid_pts, - 24 * actual_compute_box_num); - - // Step3. Sort every intersection point according to their relative - // cross-product values (essentially sorting according to angles) - // If the angles are the same, sort according to distance to origin - dot2d((T *)dist_ram, (T *)ordered_pts_x, (T *)ordered_pts_y, - (T *)ordered_pts_x, (T *)ordered_pts_y, 24 * actual_compute_box_num, - (T *)temp_long_3); - - T temp, temp_nums_in, temp_dist_1, temp_dist_2; - T temp1_x, temp1_y; - T temp2_x, temp2_y; - for (int i = 0; i < actual_box_num; i++) { - if (((T *)valid_box)[i]) { - // make sure all nums_in[i] points are at the front - for (int ii = 0; ii < 23; ii++) { - for (int jj = ii + 1; jj < 24; jj++) { - int ii_index = ii * actual_compute_box_num + i; - int jj_index = jj * actual_compute_box_num + i; - // ii point is not valid and jj point is valid, swap jj for ii - if ((!((T *)valid_pts)[ii_index]) && ((T *)valid_pts)[jj_index]) { - ((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[jj_index]; - ((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[jj_index]; - ((T *)dist_ram)[ii_index] = ((T *)dist_ram)[jj_index]; - ((T *)valid_pts)[ii_index] = true; - ((T *)ordered_pts_x)[jj_index] = 0; - ((T *)ordered_pts_y)[jj_index] = 0; - ((T *)dist_ram)[jj_index] = 0; - ((T *)valid_pts)[jj_index] = false; - break; - } - } - } - temp_nums_in = ((T *)nums_in_ram)[i]; - // make original q[0] = min_x, min_y before sort - for 
(int ii = 1; ii < temp_nums_in; ii++) { - int ii_index = ii * actual_compute_box_num + i; - if (((T *)dist_ram)[ii_index] == 0) { - // swap q[ii_index] and q[0] - ((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[i]; - ((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[i]; - ((T *)dist_ram)[ii_index] = ((T *)dist_ram)[i]; - ((T *)ordered_pts_x)[i] = 0; - ((T *)ordered_pts_y)[i] = 0; - ((T *)dist_ram)[i] = 0; - break; - } - } - for (int ii = 1; ii < temp_nums_in - 1; ii++) { - for (int jj = ii + 1; jj < temp_nums_in; jj++) { - int ii_index = ii * actual_compute_box_num + i; - int jj_index = jj * actual_compute_box_num + i; - temp1_x = ((T *)ordered_pts_x)[ii_index]; - temp1_y = ((T *)ordered_pts_y)[ii_index]; - temp2_x = ((T *)ordered_pts_x)[jj_index]; - temp2_y = ((T *)ordered_pts_y)[jj_index]; - // calculate cross product and sort q (ordered_pts) - temp = (temp1_x * temp2_y) - (temp1_y * temp2_x); - temp_dist_1 = ((T *)dist_ram)[ii_index]; - temp_dist_2 = ((T *)dist_ram)[jj_index]; - if ((temp < (T)-1e-6) || - ((fabs(temp) < (T)1e-6) && (temp_dist_1 > temp_dist_2))) { - ((T *)ordered_pts_x)[ii_index] = temp2_x; - ((T *)ordered_pts_y)[ii_index] = temp2_y; - ((T *)ordered_pts_x)[jj_index] = temp1_x; - ((T *)ordered_pts_y)[jj_index] = temp1_y; - ((T *)dist_ram)[ii_index] = temp_dist_2; - ((T *)dist_ram)[jj_index] = temp_dist_1; - } - } - } - - // Step4: - // Make sure there are at least 2 points(that don't overlap with each - // other) in the stack - int k; // index of the non-overlapped second point - for (k = 1; k < temp_nums_in; k++) { - if (((T *)dist_ram)[k * actual_compute_box_num + i] > (T)1e-8) { - break; - } - } - if (k == temp_nums_in) { - // We reach the end, which means the convex hull is just one point - // set valid_box = 0, to get ious = 0 - ((T *)valid_box)[i] = 0; - continue; - } - // q[1] = q[k]; - ((T *)ordered_pts_x)[actual_compute_box_num + i] = - ((T *)ordered_pts_x)[k * actual_compute_box_num + i]; - ((T 
*)ordered_pts_y)[actual_compute_box_num + i] = - ((T *)ordered_pts_y)[k * actual_compute_box_num + i]; - - // Step 5: - // Finally we can start the scanning process. - // When a non-convex relationship between the 3 points is found - // (either concave shape or duplicated points), - // we pop the previous point from the stack - // until the 3-point relationship is convex again, or - // until the stack only contains two points - int m = 2; // 2 points in the stack - for (int j = k + 1; j < temp_nums_in; j++) { - // while (m > 1 && cross2d(q[j] - q[m - 2], q[m - 1] - q[m - 2]) >= - // 0) { - // m--; - // } - temp1_x = ((T *)ordered_pts_x)[j * actual_compute_box_num + i] - - ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i]; - temp1_y = ((T *)ordered_pts_y)[j * actual_compute_box_num + i] - - ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i]; - temp2_x = ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] - - ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i]; - temp2_y = ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] - - ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i]; - temp = (temp1_x * temp2_y) - (temp1_y * temp2_x); - while ((m > 1) && (temp >= 0)) { - m--; - if (m > 1) { - temp1_x = - ((T *)ordered_pts_x)[j * actual_compute_box_num + i] - - ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i]; - temp1_y = - ((T *)ordered_pts_y)[j * actual_compute_box_num + i] - - ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i]; - temp2_x = - ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] - - ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i]; - temp2_y = - ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] - - ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i]; - temp = (temp1_x * temp2_y) - (temp1_y * temp2_x); - } - } - // q[m++] = q[j]; - ((T *)ordered_pts_x)[m * actual_compute_box_num + i] = - ((T *)ordered_pts_x)[j * actual_compute_box_num + i]; - ((T 
*)ordered_pts_y)[m * actual_compute_box_num + i] = - ((T *)ordered_pts_y)[j * actual_compute_box_num + i]; - m++; - } - // set last(24-m) valid_pts to false, to erase invalid q in polygon area - for (int j = m; j < temp_nums_in; j++) { - ((T *)valid_pts)[j * actual_compute_box_num + i] = 0; - } - ((T *)nums_in_ram)[i] = m; - } - } -} - -template -__mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y, T *valid_box, - T *valid_pts, T *nums_in_ram, T *temp1_ram, - T *temp2_ram, T *temp3_ram, T *temp4_ram, - T *temp5_ram, T *temp6_ram, T *temp7_ram, - T *temp8_ram, T *temp9_ram, - const uint32_t &actual_compute_box_num) { - // Set where nums_in <= 2, valid_box = false - __bang_write_value((T *)temp9_ram, COMPUTE_COUNT_ALIGN, (T)2); - __bang_cycle_gt((T *)temp1_ram, (T *)nums_in_ram, (T *)temp9_ram, - actual_compute_box_num, COMPUTE_COUNT_ALIGN); - __bang_and((T *)valid_box, (T *)valid_box, (T *)temp1_ram, - actual_compute_box_num); - - // temp1 = area, initialize with all 0 - __bang_write_zero((T *)temp1_ram, actual_compute_box_num); - __bang_max((T *)temp7_ram, (T *)nums_in_ram, actual_compute_box_num); - - // temp_nums_in = max(nums_in) - T temp_nums_in = ((T *)temp7_ram)[0]; - for (int i = 1; i < temp_nums_in - 1; i++) { - // q[i] - q[0]: (temp6, temp7) - __bang_sub((T *)temp6_ram, (T *)ordered_pts_x + i * actual_compute_box_num, - (T *)ordered_pts_x, actual_compute_box_num); - __bang_sub((T *)temp7_ram, (T *)ordered_pts_y + i * actual_compute_box_num, - (T *)ordered_pts_y, actual_compute_box_num); - __bang_mul((T *)temp6_ram, (T *)temp6_ram, - (T *)valid_pts + (i + 1) * actual_compute_box_num, - actual_compute_box_num); - __bang_mul((T *)temp7_ram, (T *)temp7_ram, - (T *)valid_pts + (i + 1) * actual_compute_box_num, - actual_compute_box_num); - // q[i + 1] - q[0]: (temp8, temp9) - __bang_sub((T *)temp8_ram, - (T *)ordered_pts_x + (i + 1) * actual_compute_box_num, - (T *)ordered_pts_x, actual_compute_box_num); - __bang_sub((T *)temp9_ram, - (T 
*)ordered_pts_y + (i + 1) * actual_compute_box_num, - (T *)ordered_pts_y, actual_compute_box_num); - __bang_mul((T *)temp8_ram, (T *)temp8_ram, - (T *)valid_pts + (i + 1) * actual_compute_box_num, - actual_compute_box_num); - __bang_mul((T *)temp9_ram, (T *)temp9_ram, - (T *)valid_pts + (i + 1) * actual_compute_box_num, - actual_compute_box_num); - // area += fabs(cross2d(q[i] - q[0], q[i + 1] - q[0])); - __bang_mul((T *)temp4_ram, (T *)temp6_ram, (T *)temp9_ram, - actual_compute_box_num); - __bang_mul((T *)temp5_ram, (T *)temp7_ram, (T *)temp8_ram, - actual_compute_box_num); - __bang_sub((T *)temp3_ram, (T *)temp4_ram, (T *)temp5_ram, - actual_compute_box_num); - __bang_active_abs((T *)temp3_ram, (T *)temp3_ram, actual_compute_box_num); - __bang_add((T *)temp1_ram, (T *)temp1_ram, (T *)temp3_ram, - actual_compute_box_num); - } - // Set where valid_box = false, intersection = 0 - __bang_mul((T *)temp1_ram, (T *)temp1_ram, (T *)valid_box, - actual_compute_box_num); - // area = area / 2.0 - __bang_mul_scalar((T *)temp1_ram, (T *)temp1_ram, (T)0.5, - actual_compute_box_num); -} - -#endif // IOU3D_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu deleted file mode 100755 index 1356a79..0000000 --- a/mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu +++ /dev/null @@ -1,181 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "common_mlu_helper.hpp" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template -__mlu_func__ void MLUUnion1MaskedIm2colForward( - const T *feature, const int height, const int width, const int channels, - const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, - const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt, - T *data_col) { - for (int index = taskId; index < mask_cnt; index += taskDim) { - const int h_col = mask_h_idx[index]; - const int w_col = mask_w_idx[index]; - const int h_offset = h_col - pad_h; - const int w_offset = w_col - pad_w; - int h_start = h_offset; - int h_end = h_offset + kernel_h - 1; - int w_start = w_offset; - int w_end = w_start + kernel_w - 1; - if (h_start >= height || w_start >= width || h_end < 0 || w_end < 0) { - continue; - } else { - int h_start_valid = max(0, h_start); - int h_end_valid = min(height - 1, h_end); - int w_start_valid = max(0, w_start); - int w_end_valid = min(width - 1, w_end); - __memcpy( - data_col + index * kernel_h * kernel_w * channels + - ((h_start_valid - h_start) * kernel_w + - (w_start_valid - w_start)) * - channels, - feature + h_start_valid * width * channels + w_start_valid * channels, - (w_end_valid - w_start_valid + 1) * channels * sizeof(T), GDRAM2GDRAM, - kernel_w * channels * sizeof(T), width * channels * sizeof(T), - h_end_valid - h_start_valid); - } - } -} - -template -__mlu_func__ void MLUUnion1MaskedCol2imForward(const T *col, const int height, - const int width, - const int channels, - const int32_t *mask_h_idx, - const int32_t *mask_w_idx, - const int mask_cnt, T *im) { - const int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T); - if (channels <= channels_max_num_nram) { - const int deal_num = channels_max_num_nram / channels; - int mask_per_core = mask_cnt / taskDim; - const int mask_remain = mask_cnt % taskDim; - mask_per_core += taskId < mask_remain ? 
1 : 0; - int index_start = taskId < mask_remain - ? taskId * mask_per_core - : taskId * mask_per_core + mask_remain; - int loop = mask_per_core / deal_num; - int remain_num = mask_per_core % deal_num; - T *nram_col = (T *)nram_buffer; - for (int index = 0; index < loop; ++index) { - int cur_index = index_start + index * deal_num; - __memcpy(nram_col, col + cur_index * channels, - deal_num * channels * sizeof(T), GDRAM2NRAM); - for (int i = 0; i < deal_num; ++i) { - int mask_index = cur_index + i; - const int h_im = mask_h_idx[mask_index]; - const int w_im = mask_w_idx[mask_index]; - // if(h_im>=height || w_im>=width) continue; - __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels, - channels * sizeof(T), NRAM2GDRAM); - } - } - if (remain_num > 0) { - int cur_index = index_start + loop * deal_num; - __memcpy(nram_col, col + cur_index * channels, - remain_num * channels * sizeof(T), GDRAM2NRAM); - for (int i = 0; i < remain_num; ++i) { - int mask_index = cur_index + i; - const int h_im = mask_h_idx[mask_index]; - const int w_im = mask_w_idx[mask_index]; - // if(h_im>=height || w_im>=width) continue; - __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels, - channels * sizeof(T), NRAM2GDRAM); - } - } - } else { - for (int index = taskId; index < mask_cnt; index += taskDim) { - const int m_index = index % mask_cnt; - const int h_im = mask_h_idx[m_index]; - const int w_im = mask_w_idx[m_index]; - // if(h_im>=height || w_im>=width) continue; - __memcpy(im + (h_im * width + w_im) * channels, col + index * channels, - channels * sizeof(T), GDRAM2GDRAM); - } - } -} - -__mlu_global__ void MLUKernelMaskedIm2colForward( - const void *feature, const int height, const int width, const int channels, - const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, - const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt, - void *data_col, const cnrtDataType_t data_dtype) { - if (coreId == 0x80) { - return; - } - - switch 
(data_dtype) { - case CNRT_FLOAT16: { - MLUUnion1MaskedIm2colForward((half *)feature, height, width, channels, - kernel_h, kernel_w, pad_h, pad_w, - (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, - mask_cnt, (half *)data_col); - }; break; - case CNRT_FLOAT32: { - MLUUnion1MaskedIm2colForward((float *)feature, height, width, channels, - kernel_h, kernel_w, pad_h, pad_w, - (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, - mask_cnt, (float *)data_col); - }; break; - default: { - break; - } - } -} - -__mlu_global__ void MLUKernelMaskedCol2imForward( - const void *col, const int height, const int width, const int channels, - const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt, - void *im, const cnrtDataType_t data_dtype) { - if (coreId == 0x80) { - return; - } - switch (data_dtype) { - case CNRT_FLOAT16: { - MLUUnion1MaskedCol2imForward((half *)col, height, width, channels, - (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, - mask_cnt, (half *)im); - }; break; - case CNRT_FLOAT32: { - MLUUnion1MaskedCol2imForward((float *)col, height, width, channels, - (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, - mask_cnt, (float *)im); - }; break; - default: { - break; - } - } -} - -void KernelMaskedIm2colForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - cnrtDataType_t k_dtype, const void *im_ptr, const int height, - const int width, const int channels, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const void *mask_h_idx_ptr, - const void *mask_w_idx_ptr, const int mask_cnt, void *col_ptr) { - MLUKernelMaskedIm2colForward<<>>( - im_ptr, height, width, channels, kernel_h, kernel_w, pad_h, pad_w, - mask_h_idx_ptr, mask_w_idx_ptr, mask_cnt, col_ptr, k_dtype); -} - -void KernelMaskedCol2imForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, cnrtDataType_t k_dtype, - const void *col_ptr, const int height, - const int width, const int channels, - const void *mask_h_idx_ptr, - const void 
*mask_w_idx_ptr, const int mask_cnt, - void *im_ptr) { - MLUKernelMaskedCol2imForward<<>>( - col_ptr, height, width, channels, mask_h_idx_ptr, mask_w_idx_ptr, - mask_cnt, im_ptr, k_dtype); -} diff --git a/mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu deleted file mode 100644 index 7899e52..0000000 --- a/mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu +++ /dev/null @@ -1,853 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 by Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ - -#include "common_mlu_helper.hpp" -#include - -/**************************************************************************************** - * - * NRAM partition forward: - * | spatial_shapes | data_value_p1_ping | data_value_p2_ping | - * | data_value_p3_ping | data_value_p4_ping | data_col_ping | - * | data_value_p1_pong | data_value_p2_pong | data_value_p3_pong | - * | data_value_p4_pong | data_col_pong | auxiliary_a | - * | auxiliary_b | - * | 128bytes | deal_size | deal_size | - * | deal_size | deal_size | deal_size | - * | deal_size | deal_size | deal_size | - * | deal_size | deal_size | deal_size | - * | deal_size | - * - ****************************************************************************************/ - -/**************************************************************************************** - * - * NRAM partition backward: - * | grad_output_nram | grad_output_nram_temp | grad_weight | - * | grad_h_weight | grad_w_weight | top_grad | - * | top_grad_temp | spatial_shapes_nram | sampling_loc_nram | - * | deal_size | deal_size | deal_size | - * | deal_size | deal_size | deal_size | - * | deal_size | deal_size | 64bytes | - * - ****************************************************************************************/ - -#define TWELVE_SPLIT 12 -#define ALIGN_NUM 64 -#define ALIGN_NUM_FOR_REDUCE 32 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template -__mlu_func__ void loadNeighborPointsData( - const T *data_value_gdram, T *data_value_p1_nram, T *data_value_p2_nram, - T *data_value_p3_nram, T *data_value_p4_nram, const size_t deal_num, - const int32_t &width, const int32_t &height, const int32_t &num_heads, - const int32_t &channels, const T &x, const T &y, const int32_t &head_idx) { - const int32_t w_low = floorf(x); - const int32_t h_low = floorf(y); - const int32_t w_high = w_low + 1; - const int32_t h_high = h_low + 1; - - const int32_t w_stride = num_heads * channels; - 
const int32_t h_stride = width * w_stride; - const int32_t h_low_ptr_offset = h_low * h_stride; - const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int32_t w_low_ptr_offset = w_low * w_stride; - const int32_t w_high_ptr_offset = w_low_ptr_offset + w_stride; - const int32_t base_ptr_offset = head_idx * channels; - - // top-left point - if (h_low >= 0 && w_low >= 0) { - const int32_t v1_offset = - h_low_ptr_offset + w_low_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p1_nram, data_value_gdram + v1_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } - - // top-right point - if (h_low >= 0 && w_high <= width - 1) { - const int32_t v2_offset = - h_low_ptr_offset + w_high_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p2_nram, data_value_gdram + v2_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } - - // bottom-left point - if (h_high <= height - 1 && w_low >= 0) { - const int32_t v3_offset = - h_high_ptr_offset + w_low_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p3_nram, data_value_gdram + v3_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } - - // bottom-right point - if (h_high <= height - 1 && w_high <= width - 1) { - const int32_t v4_offset = - h_high_ptr_offset + w_high_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p4_nram, data_value_gdram + v4_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } -} - -template -__mlu_func__ void bilinearInterpolation( - T *data_value_p1_nram, T *data_value_p2_nram, T *data_value_p3_nram, - T *data_value_p4_nram, T *sample_point_value, T *auxiliary_b, - const size_t deal_num, const int32_t &width, const int32_t &height, - const T &x, const T &y) { - const int32_t w_low = floorf(x); - const int32_t h_low = floorf(y); - const int32_t w_high = w_low + 1; - const int32_t h_high = h_low + 1; - - const T lw = x - w_low; - const T lh = y - h_low; - const T hw = 1 - lw; - const T hh = 1 - lh; - const T w1 = hh * hw; - const T w2 = hh * lw; - const T w3 = lh * hw; - const T w4 = 
lh * lw; - - __bang_write_value((T *)sample_point_value, deal_num, (T)0); - - // top-left point - if (h_low >= 0 && w_low >= 0) { - // sample_point_value += v1 * w1 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p1_nram, (T)w1, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } - - // top-right point - if (h_low >= 0 && w_high <= width - 1) { - // sample_point_value += v2 * w2 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p2_nram, (T)w2, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } - - // bottom-left point - if (h_high <= height - 1 && w_low >= 0) { - // sample_point_value += v3 * w3 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p3_nram, (T)w3, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } - - // bottom-right point - if (h_high <= height - 1 && w_high <= width - 1) { - // sample_point_value += v4 * w4 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p4_nram, (T)w4, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } -} - -template -__mlu_global__ void MLUKernelMsDeformAttnForward( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram) { - if (coreId == 0x80) { - return; - } - - const size_t spatial_size = PAD_UP(2 * sizeof(int32_t), NFU_ALIGN_SIZE); - const size_t span_num_deal = - PAD_DOWN((MAX_NRAM_SIZE - spatial_size) / TWELVE_SPLIT / sizeof(T), - NFU_ALIGN_SIZE); - const size_t align_num = NFU_ALIGN_SIZE; - const int32_t channels_seg_num = channels / 
span_num_deal; - const size_t channels_rem = channels % span_num_deal; - const size_t channels_align_rem = CEIL_ALIGN(channels_rem, align_num); - char *data_spatial_shapes_nram = nram_buffer; - char *ping_data_value_p1_nram = data_spatial_shapes_nram + spatial_size; - char *ping_data_value_p2_nram = - ping_data_value_p1_nram + span_num_deal * sizeof(T); - char *ping_data_value_p3_nram = - ping_data_value_p2_nram + span_num_deal * sizeof(T); - char *ping_data_value_p4_nram = - ping_data_value_p3_nram + span_num_deal * sizeof(T); - char *ping_data_col_nram = - ping_data_value_p4_nram + span_num_deal * sizeof(T); - char *pong_data_value_p1_nram = - ping_data_col_nram + span_num_deal * sizeof(T); - char *pong_data_value_p2_nram = - pong_data_value_p1_nram + span_num_deal * sizeof(T); - char *pong_data_value_p3_nram = - pong_data_value_p2_nram + span_num_deal * sizeof(T); - char *pong_data_value_p4_nram = - pong_data_value_p3_nram + span_num_deal * sizeof(T); - char *pong_data_col_nram = - pong_data_value_p4_nram + span_num_deal * sizeof(T); - char *auxiliary_a = pong_data_col_nram + span_num_deal * sizeof(T); - char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); - const size_t ping_pong_gap = 5 * span_num_deal * sizeof(T); - size_t data_col_ping_pong_idx = 0; - - int32_t block_num_per_core = (batch_size * num_queries * num_heads) / taskDim; - const int32_t block_num_rem = - (batch_size * num_queries * num_heads) % taskDim; - const int32_t idx_start = taskId < (block_num_rem + 1) - ? taskId * (block_num_per_core + 1) - : taskId * block_num_per_core + block_num_rem; - block_num_per_core = - taskId < block_num_rem - ? 
(batch_size * num_queries * num_heads) / taskDim + 1 - : (batch_size * num_queries * num_heads) / taskDim; - - for (int32_t cur_idx = idx_start; cur_idx < idx_start + block_num_per_core; - ++cur_idx) { - // cur_idx = batch_idx * num_queries * num_heads + query_idx * num_heads + - // head_idx - const int32_t head_idx = cur_idx % num_heads; - const int32_t batch_idx = (cur_idx / num_heads) / num_queries; - - const char *data_value_gdram_start = - data_value_gdram + - batch_idx * num_keys * num_heads * channels * sizeof(T); - const char *data_sampling_loc_gdram_start = - data_sampling_loc_gdram + - cur_idx * num_levels * num_points * 2 * sizeof(T); - const char *data_attn_weight_gdram_start = - data_attn_weight_gdram + cur_idx * num_levels * num_points * sizeof(T); - char *data_col_gdram_start = - data_col_gdram + cur_idx * channels * sizeof(T); - - for (int32_t c_seg_idx = 0; c_seg_idx < channels_seg_num; ++c_seg_idx) { - __bang_write_value( - (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), - span_num_deal, (T)0); - // load data - // level_idx = 0, point_idx = 0 - __memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram, - 2 * sizeof(int32_t), GDRAM2NRAM); - int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0]; - int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1]; - const char *data_value_ptr = - data_value_gdram_start + c_seg_idx * span_num_deal * sizeof(T); - T loc_w = ((T *)data_sampling_loc_gdram_start)[0]; - T loc_h = ((T *)data_sampling_loc_gdram_start)[1]; - T weight = ((T *)data_attn_weight_gdram_start)[0]; - T x = loc_w * spatial_w - 0.5; - T y = loc_h * spatial_h - 0.5; - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, (T *)ping_data_value_p1_nram, - (T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram, - (T *)ping_data_value_p4_nram, span_num_deal, spatial_w, spatial_h, - num_heads, channels, x, y, head_idx); - } - T spatial_h_next_point = 0; - T 
spatial_w_next_point = 0; - T weight_next_point = 0; - T x_next_point = 0; - T y_next_point = 0; - __asm__ volatile("sync;"); - - for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) { - for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) { - // load data - if (point_idx == num_points - 1 && level_idx == num_levels - 1) { - // last point no need to load data, continue to compute - } else if (point_idx == num_points - 1) { - const int32_t level_start_id = - ((int32_t *)data_level_start_index_gdram)[level_idx + 1]; - const int32_t spatial_h_ptr = (level_idx + 1) << 1; - __memcpy( - data_spatial_shapes_nram, - data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t), - 2 * sizeof(int32_t), GDRAM2NRAM); - spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0]; - spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1]; - data_value_ptr = data_value_gdram_start + - (level_start_id * num_heads * channels + - c_seg_idx * span_num_deal) * - sizeof(T); - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w_next_point - 0.5; - y_next_point = loc_h * spatial_h_next_point - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h_next_point && - x_next_point < spatial_w_next_point) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), 
- span_num_deal, spatial_w_next_point, spatial_h_next_point, - num_heads, channels, x_next_point, y_next_point, head_idx); - } - } else { - spatial_h_next_point = spatial_h; - spatial_w_next_point = spatial_w; - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w - 0.5; - y_next_point = loc_h * spatial_h - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h && x_next_point < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - span_num_deal, spatial_w, spatial_h, num_heads, channels, - x_next_point, y_next_point, head_idx); - } - } - - // compute - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - bilinearInterpolation( - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), - (T *)auxiliary_a, (T *)auxiliary_b, span_num_deal, spatial_w, - spatial_h, x, y); - __bang_mul_scalar((T *)auxiliary_a, (T *)auxiliary_a, (T)weight, - span_num_deal); - __bang_add((T *)(ping_data_col_nram + - 
data_col_ping_pong_idx * ping_pong_gap), - (T *)(ping_data_col_nram + - data_col_ping_pong_idx * ping_pong_gap), - (T *)auxiliary_a, span_num_deal); - } - - spatial_w = spatial_w_next_point; - spatial_h = spatial_h_next_point; - weight = weight_next_point; - x = x_next_point; - y = y_next_point; - __asm__ volatile("sync;"); - } - } - // store - __memcpy_async( - data_col_gdram_start + c_seg_idx * span_num_deal * sizeof(T), - ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap, - span_num_deal * sizeof(T), NRAM2GDRAM); - data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2; - } - - if (channels_rem > 0) { - __bang_write_value( - (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), - channels_align_rem, (T)0); - // load data - // level_idx = 0, point_idx = 0 - __memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram, - 2 * sizeof(int32_t), GDRAM2NRAM); - int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0]; - int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1]; - const char *data_value_ptr = - data_value_gdram_start + channels_seg_num * span_num_deal * sizeof(T); - T loc_w = ((T *)data_sampling_loc_gdram_start)[0]; - T loc_h = ((T *)data_sampling_loc_gdram_start)[1]; - T weight = ((T *)data_attn_weight_gdram_start)[0]; - T x = loc_w * spatial_w - 0.5; - T y = loc_h * spatial_h - 0.5; - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, (T *)ping_data_value_p1_nram, - (T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram, - (T *)ping_data_value_p4_nram, channels_rem, spatial_w, spatial_h, - num_heads, channels, x, y, head_idx); - } - T spatial_h_next_point = 0; - T spatial_w_next_point = 0; - T weight_next_point = 0; - T x_next_point = 0; - T y_next_point = 0; - __asm__ volatile("sync;"); - - for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) { - for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) { - // load data - if (point_idx == 
num_points - 1 && level_idx == num_levels - 1) { - // last point no need to load data, continue to compute - } else if (point_idx == num_points - 1) { - const int32_t level_start_id = - ((int32_t *)data_level_start_index_gdram)[level_idx + 1]; - const int32_t spatial_h_ptr = (level_idx + 1) << 1; - __memcpy( - data_spatial_shapes_nram, - data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t), - 2 * sizeof(int32_t), GDRAM2NRAM); - spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0]; - spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1]; - data_value_ptr = data_value_gdram_start + - (level_start_id * num_heads * channels + - channels_seg_num * span_num_deal) * - sizeof(T); - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w_next_point - 0.5; - y_next_point = loc_h * spatial_h_next_point - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h_next_point && - x_next_point < spatial_w_next_point) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - channels_rem, spatial_w_next_point, spatial_h_next_point, - num_heads, channels, x_next_point, y_next_point, head_idx); - } - } else { - spatial_w_next_point = spatial_w; - spatial_h_next_point = spatial_h; - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + 
point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w - 0.5; - y_next_point = loc_h * spatial_h - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h && x_next_point < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), - channels_rem, spatial_w, spatial_h, num_heads, channels, - x_next_point, y_next_point, head_idx); - } - } - - // compute - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - bilinearInterpolation( - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), - (T *)auxiliary_a, (T *)auxiliary_b, channels_align_rem, - spatial_w, spatial_h, x, y); - __bang_mul_scalar((T *)auxiliary_a, (T *)auxiliary_a, (T)weight, - channels_align_rem); - __bang_add((T *)(ping_data_col_nram + - data_col_ping_pong_idx * ping_pong_gap), - (T *)(ping_data_col_nram + - data_col_ping_pong_idx * ping_pong_gap), - (T *)auxiliary_a, channels_align_rem); - } - - spatial_w = spatial_w_next_point; - spatial_h = spatial_h_next_point; - weight = weight_next_point; - x = x_next_point; - y = 
y_next_point; - __asm__ volatile("sync;"); - } - } - // store - __memcpy_async( - data_col_gdram_start + channels_seg_num * span_num_deal * sizeof(T), - ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap, - channels_rem * sizeof(T), NRAM2GDRAM); - data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2; - } - } - __asm__ volatile("sync;"); - return; -} - -template __mlu_global__ void MLUKernelMsDeformAttnForward( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram); - -void KernelMsDeformAttnForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const cnrtDataType_t d_type, const char *data_value_gdram, - const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram) { - MLUKernelMsDeformAttnForward<<>>( - data_value_gdram, data_spatial_shapes_gdram, data_level_start_index_gdram, - data_sampling_loc_gdram, data_attn_weight_gdram, batch_size, num_keys, - num_heads, channels, num_levels, num_queries, num_points, data_col_gdram); -} - -template -void __mlu_func__ msDeformAttnCol2imBilinear( - T *top_grad_temp, const int32_t &height, const int32_t &width, const T &w1, - const T &w2, const T &w3, const T &w4, const int32_t &h_low, - const int32_t &w_low, const int32_t &h_high, const int32_t &w_high, - const int32_t &base_ptr, const int32_t &h_low_ptr_offset, - const int32_t &w_low_ptr_offset, const int32_t 
&h_high_ptr_offset, - const int32_t &w_high_ptr_offset, const T &hh, const T &hw, const T &lh, - const T &lw, T *top_grad, const T &data_attn_weight, T *grad_h_weight, - T *grad_w_weight, T *grad_value, T *grad_output_nram, T *grad_weight, - T *grad_sampling_loc, T *grad_attn_weight, T *grad_output_nram_temp, - const int32_t &deal_num, const int32_t &deal_num_real, - const T *data_value_ptr) { - if (h_low >= 0 && w_low >= 0) { - int32_t offset1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; - __memcpy(grad_output_nram, data_value_ptr + offset1, - deal_num_real * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar(grad_weight, grad_output_nram, hw, deal_num); - __bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num); - __bang_mul_scalar(grad_weight, grad_output_nram, hh, deal_num); - __bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num); - - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num); - __bang_mul_scalar(top_grad_temp, top_grad_temp, w1, deal_num); - // for calc grad_attn_weight - __bang_mul_scalar(grad_output_nram, grad_output_nram, w1, deal_num); - __bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset1), - (T *)top_grad_temp, deal_num_real); - } - if (h_low >= 0 && w_high <= width - 1) { - int32_t offset2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; - __memcpy(grad_output_nram_temp, data_value_ptr + offset2, - deal_num_real * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num); - __bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, hh, deal_num); - __bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num); - - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num); - __bang_mul_scalar(top_grad_temp, top_grad_temp, w2, deal_num); - - __bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w2, - deal_num); - __bang_add(grad_output_nram, grad_output_nram, 
grad_output_nram_temp, - deal_num); - __bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset2), - (T *)top_grad_temp, deal_num_real); - } - if (h_high <= height - 1 && w_low >= 0) { - int32_t offset3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; - __memcpy(grad_output_nram_temp, data_value_ptr + offset3, - deal_num_real * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, hw, deal_num); - __bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num); - __bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num); - - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num); - __bang_mul_scalar(top_grad_temp, top_grad_temp, w3, deal_num); - // for calc grad_attn_weight - __bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w3, - deal_num); - __bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp, - deal_num); - __bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset3), - (T *)top_grad_temp, deal_num_real); - } - if (h_high <= height - 1 && w_high <= width - 1) { - int32_t offset4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; - __memcpy(grad_output_nram_temp, data_value_ptr + offset4, - deal_num_real * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num); - __bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num); - __bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num); - - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num); - __bang_mul_scalar(top_grad_temp, top_grad_temp, w4, deal_num); - // for calc grad_attn_weight - __bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w4, - deal_num); - __bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp, - deal_num); - - __bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + 
offset4), - (T *)top_grad_temp, deal_num_real); - } - __bang_mul(grad_output_nram, grad_output_nram, top_grad, deal_num); -#if __BANG_ARCH__ >= 322 - recursiveSumPool(grad_output_nram, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE); -#else - const int32_t align_num_on_200 = NFU_ALIGN_SIZE / sizeof(float); - recursiveSumPool(grad_output_nram, align_num_on_200, - deal_num / align_num_on_200, ALIGN_NUM_FOR_REDUCE); - __bang_reduce_sum(grad_output_nram, grad_output_nram, - NFU_ALIGN_SIZE / sizeof(float)); -#endif - __bang_atomic_add((T *)grad_output_nram, (T *)grad_attn_weight, - (T *)grad_output_nram, 1); - __bang_mul_scalar(grad_w_weight, grad_w_weight, width, deal_num); - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num); - __bang_mul(grad_w_weight, grad_w_weight, top_grad_temp, deal_num); -#if __BANG_ARCH__ >= 322 - recursiveSumPool(grad_w_weight, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE); -#else - recursiveSumPool(grad_w_weight, align_num_on_200, deal_num / align_num_on_200, - ALIGN_NUM_FOR_REDUCE); - __bang_reduce_sum(grad_w_weight, grad_w_weight, - NFU_ALIGN_SIZE / sizeof(float)); -#endif - __bang_atomic_add((T *)grad_w_weight, (T *)(grad_sampling_loc), - (T *)grad_w_weight, 1); - - __bang_mul_scalar(grad_h_weight, grad_h_weight, height, deal_num); - __bang_mul(grad_h_weight, grad_h_weight, top_grad_temp, deal_num); -#if __BANG_ARCH__ >= 322 - recursiveSumPool(grad_h_weight, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE); -#else - recursiveSumPool(grad_h_weight, align_num_on_200, deal_num / align_num_on_200, - ALIGN_NUM_FOR_REDUCE); - __bang_reduce_sum(grad_h_weight, grad_h_weight, - NFU_ALIGN_SIZE / sizeof(float)); -#endif - __bang_atomic_add((T *)grad_h_weight, (T *)(grad_sampling_loc + 1), - (T *)grad_h_weight, 1); -} - -__mlu_global__ void MLUUnion1KernelMsDeformAttnBackward( - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float 
*grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight) { - if (coreId == 0x80) { - return; - } - const int32_t split_num = 8; - const int32_t spatial_shapes_size = 64; - int32_t deal_num = PAD_DOWN( - (MAX_NRAM_SIZE - spatial_shapes_size) / split_num / sizeof(float), - ALIGN_NUM); - float *grad_output_nram = (float *)nram_buffer; - float *grad_output_nram_temp = (float *)nram_buffer + deal_num; - float *grad_weight = (float *)nram_buffer + 2 * deal_num; - float *grad_h_weight = (float *)nram_buffer + 3 * deal_num; - float *grad_w_weight = (float *)nram_buffer + 4 * deal_num; - float *top_grad = (float *)nram_buffer + 5 * deal_num; - float *top_grad_temp = (float *)nram_buffer + 6 * deal_num; - int32_t *spatial_shapes_nram = - (int32_t *)((float *)nram_buffer + 7 * deal_num); - float *sampling_loc_nram = - (float *)nram_buffer + 7 * deal_num + 2 * sizeof(int32_t); - const int32_t total_num = batch * num_query * num_heads * num_levels; - int32_t num_per_core = total_num / taskDim; - int32_t num_rem = total_num % taskDim; - num_per_core = num_per_core + int32_t(taskId < num_rem); - int32_t start_per_core = - num_rem > taskId - ? 
(taskId * num_per_core) - : ((num_per_core + 1) * num_rem + (taskId - num_rem) * num_per_core); - int32_t end_per_core = start_per_core + num_per_core; - const int32_t C_repeat = channels / deal_num; - const int32_t C_tail = channels % deal_num; - const int32_t qid_stride = num_heads * channels; - int32_t base_ptr = 0; - for (int32_t num_loop = start_per_core; num_loop < end_per_core; ++num_loop) { - const int32_t l_col = num_loop % num_levels; - const int32_t m_col = num_loop / num_levels % num_heads; - const int32_t q_col = num_loop / num_levels / num_heads % num_query; - const int32_t b_col = num_loop / num_query / num_heads / num_levels; - int32_t data_weight_ptr = num_loop * num_points; - int32_t data_loc_w_ptr = data_weight_ptr << 1; - const int32_t value_offset = b_col * spatial_size * num_heads * channels; - const int32_t level_start_id = data_level_start_index[l_col]; - int32_t spatial_h_ptr = l_col << 1; - int32_t grad_output_offset = b_col * num_query * num_heads * channels + - q_col * num_heads * channels + - m_col * channels; - __memcpy(spatial_shapes_nram, spatial_shapes + spatial_h_ptr, - 2 * sizeof(int32_t), GDRAM2NRAM); - const int32_t spatial_h = spatial_shapes_nram[0]; - const int32_t spatial_w = spatial_shapes_nram[1]; - const int32_t value_ptr_offset = value_offset + level_start_id * qid_stride; - const float *data_value_ptr = data_value + value_ptr_offset; - float *grad_value_ptr = grad_value + value_ptr_offset; - const int32_t grad_attn_weight_out = num_loop * num_points; - const int32_t grad_sampling_loc_out = num_loop * num_points * 2; - for (int32_t p_col = 0; p_col < num_points; ++p_col) { - __memcpy(sampling_loc_nram, data_sampling_loc + data_loc_w_ptr, - 2 * sizeof(float), GDRAM2NRAM); - const float loc_w = sampling_loc_nram[0]; - const float loc_h = sampling_loc_nram[1]; - const float weight = data_attn_weight[data_weight_ptr]; - const float h_im = loc_h * spatial_h - 0.5; - const float w_im = loc_w * spatial_w - 0.5; - if (h_im > -1 
&& w_im > -1 && h_im < spatial_h && w_im < spatial_w) { - const int32_t h_low = floorf(h_im); - const int32_t w_low = floorf(w_im); - const int32_t h_high = h_low + 1; - const int32_t w_high = w_low + 1; - - const float lh = h_im - h_low; - const float lw = w_im - w_low; - const float hh = 1.0 - lh; - const float hw = 1.0 - lw; - - const int32_t w_stride = num_heads * channels; - const int32_t h_stride = spatial_w * w_stride; - const int32_t h_low_ptr_offset = h_low * h_stride; - const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int32_t w_low_ptr_offset = w_low * w_stride; - const int32_t w_high_ptr_offset = w_low_ptr_offset + w_stride; - - float w1 = hh * hw; - float w2 = hh * lw; - float w3 = lh * hw; - float w4 = lh * lw; - - for (int32_t C_loop = 0; C_loop < C_repeat; ++C_loop) { - base_ptr = m_col * channels + C_loop * deal_num; - __bang_write_zero(grad_weight, 3 * deal_num); - __bang_write_zero(grad_output_nram, deal_num); - __memcpy(top_grad, - grad_output + grad_output_offset + C_loop * deal_num, - deal_num * sizeof(float), GDRAM2NRAM); - msDeformAttnCol2imBilinear( - top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low, - h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset, - h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad, - weight, grad_h_weight, grad_w_weight, grad_value_ptr, - grad_output_nram, grad_weight, - grad_sampling_loc + grad_sampling_loc_out + p_col * 2, - grad_attn_weight + grad_attn_weight_out + p_col, - grad_output_nram_temp, deal_num, deal_num, data_value_ptr); - } - if (C_tail != 0) { - base_ptr = m_col * channels + C_repeat * deal_num; - __bang_write_zero(grad_output_nram, 8 * deal_num); - __memcpy(top_grad, - grad_output + grad_output_offset + C_repeat * deal_num, - C_tail * sizeof(float), GDRAM2NRAM); - msDeformAttnCol2imBilinear( - top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low, - h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset, - 
h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad, - weight, grad_h_weight, grad_w_weight, grad_value_ptr, - grad_output_nram, grad_weight, - grad_sampling_loc + grad_sampling_loc_out + p_col * 2, - grad_attn_weight + grad_attn_weight_out + p_col, - grad_output_nram_temp, deal_num, C_tail, data_value_ptr); - } - } - data_weight_ptr += 1; - data_loc_w_ptr += 2; - } - } -} - -__mlu_global__ void MLUUnion1KernelMsDeformAttnBackward( - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight); - -void KernelMsDeformAttnBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const cnrtDataType_t d_type, const float *data_value, - const int32_t *spatial_shapes, const int32_t *data_level_start_index, - const float *data_sampling_loc, const float *data_attn_weight, - const float *grad_output, const int32_t batch, const int32_t spatial_size, - const int32_t num_heads, const int32_t channels, const int32_t num_levels, - const int32_t num_query, const int32_t num_points, float *grad_value, - float *grad_sampling_loc, float *grad_attn_weight) { - MLUUnion1KernelMsDeformAttnBackward<<>>( - data_value, spatial_shapes, data_level_start_index, data_sampling_loc, - data_attn_weight, grad_output, batch, spatial_size, num_heads, channels, - num_levels, num_query, num_points, grad_value, grad_sampling_loc, - grad_attn_weight); -} diff --git a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu deleted file mode 100644 index dcc722d..0000000 --- a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu +++ /dev/null @@ -1,483 +0,0 @@ 
-/************************************************************************* - * Copyright (C) 2021 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "nms_utils.hpp" - -#define COORD_DIM (4) - -#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024) -#define SIZE_SRAM_BUF (MAX_SRAM_SIZE) - -__nram__ int8_t nram_buffer[SIZE_NRAM_BUF]; -__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF]; - -enum Addr { SRAM, GDRAM }; - -template -__mlu_func__ void nms_detection( - uint32_t &output_box_num, const int output_mode, OUT_DT *output_dram, - IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram, - IN_DT *sram, const int core_limit, const int input_num_boxes, - const int max_output_size, const float thresh_iou, const float thresh_score, - const float offset, const int algo) { - // global value - int32_t *exit_flag = (int32_t *)(sram + 28); - exit_flag[0] = 0; - // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 - int nms_buffer_count1 = 9; - // temp nram buffer to store selected target. 
- int nram_save_limit_count = 256; - float div_thresh_iou = 1.0 / thresh_iou; - - // input data ptr - const IN_DT *input_x1_ptr = input_data_box; - const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes; - const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes; - const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes; - - int limit = 0; // find limit when GDRAM or SRAM - int max_seg_pad = 0; // the max length every repeat - int repeat = 0; - int remain = 0; - int remain_pad = 0; - int input_offset = 0; // offset of input_data for current core - int nram_save_count = 0; - - if (output_mode == 0) { - limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - - nram_save_limit_count * sizeof(OUT_DT)) / - (nms_buffer_count1 * sizeof(IN_DT)); - } else { - // 5 maens: score, x1, y1, x2, y2 - limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - - nram_save_limit_count * 5 * sizeof(OUT_DT)) / - (nms_buffer_count1 * sizeof(IN_DT)); - } - - int max_seg_iou_compute = 0; - int repeat_iou_compute = 0; - int remain_iou_compute = 0; - int remain_pad_iou_compute = 0; - - getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit, - input_offset, max_seg_pad, repeat, remain, - remain_pad, max_seg_iou_compute, repeat_iou_compute, - remain_iou_compute, remain_pad_iou_compute); - - // init the data ptr - IN_DT *score = (IN_DT *)nram_buffer; - IN_DT *x1 = score + max_seg_pad; - IN_DT *y1 = x1 + max_seg_pad; - IN_DT *x2 = y1 + max_seg_pad; - IN_DT *y2 = x2 + max_seg_pad; - IN_DT *inter_x1 = y2 + max_seg_pad; - IN_DT *inter_y1 = inter_x1 + max_seg_pad; - IN_DT *inter_x2 = inter_y1 + max_seg_pad; - IN_DT *inter_y2 = inter_x2 + max_seg_pad; - IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2 - OUT_DT *nram_save = - (OUT_DT *)((char *)max_box + - NFU_ALIGN_SIZE); // offset two line from max_box - -#if __BANG_ARCH__ >= 300 - float max_box_x1 = 0; - float max_box_y1 = 0; - float max_box_x2 = 0; - float 
max_box_y2 = 0; -#endif - mluMemcpyDirection_t load_dir = SRAM2NRAM; - mluMemcpyDirection_t store_dir = NRAM2SRAM; - load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; - store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM; - - for (int keep = 0; keep < max_output_size; - keep++) { // loop until the max_score <= 0 - if (core_limit != 1) { - __sync_cluster(); // sync before current loop - } - - /******FIND MAX START******/ - int max_index = 0; // the max score index - int global_max_index = 0; // for U1 - float max_area = 0; // the max socre area - max_box[0] = 0; // init 0 - findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr, - input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir, - input_offset, repeat, remain, remain_pad, max_seg_pad, - max_index); - - if (core_limit == 1) { -#if __BANG_ARCH__ >= 300 - calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1, - max_box_x2, max_box_y2); -#else - calMaxArea(max_box, algo, offset, max_area); -#endif - input_data_score[max_index] = 0; - global_max_index = max_index; - } else if (core_limit == 4) { - __sync_cluster(); - findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit); - -#if __BANG_ARCH__ >= 300 - calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1, - max_box_x2, max_box_y2); -#else - calMaxArea(max_box, algo, offset, max_area); -#endif - global_max_index = ((uint32_t *)(max_box + 5))[0]; - input_data_score[global_max_index] = 0; - } - // by now, we get: max_score|max_index|max_box|max_area - /******FIND MAX END******/ - - storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count, - max_output_size, thresh_score, output_mode, nram_save_count, - output_box_num); - - // if the max score <= 0, end - if (core_limit == 1) { - if (float(max_box[0]) <= thresh_score) { - break; - } - } else { - if (float(max_box[0]) <= thresh_score) { - if (coreId == 0) { - exit_flag[0] = 1; - } - } - __sync_cluster(); - if (exit_flag[0] == 1) { - 
break; - } - } -/******NMS STORE END******/ -#if __BANG_ARCH__ >= 300 - scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr, - input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, - inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1, - max_box_y1, max_box_x2, max_box_y2, nram_save, - repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute, - max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou, - input_offset, offset, max_area, input_num_boxes, algo); -#else - scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr, - input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, - inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1], - max_box[2], max_box[3], max_box[4], nram_save, - repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute, - max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou, - input_offset, offset, max_area, input_num_boxes, algo); -#endif - } // for max_output_size -} - -__mlu_global__ void MLUUnion1KernelNMS( - const void *input_boxes, const void *input_confidence, - const int input_num_boxes, const int max_output_size, - const float iou_threshold, const float confidence_threshold, - const int output_mode, void *workspace, void *result_num, void *output, - const cnrtDataType_t data_type_input, const float offset, const int algo) { - if (data_type_input == CNRT_FLOAT16) { - __memcpy(workspace, input_confidence, input_num_boxes * sizeof(half), - GDRAM2GDRAM); - } else if (data_type_input == CNRT_FLOAT32) { - __memcpy(workspace, input_confidence, input_num_boxes * sizeof(float), - GDRAM2GDRAM); - } else { - } - - uint32_t output_box_num = 0; - float *score_data = (float *)workspace; - float *boxes_data = (float *)input_boxes; - float *sram = (float *)sram_buffer; - - if (output_mode == 0) { - if (data_type_input == CNRT_FLOAT32) { - nms_detection(output_box_num, output_mode, (uint32_t *)output, score_data, - boxes_data, GDRAM, sram, taskDim, input_num_boxes, - 
max_output_size, iou_threshold, confidence_threshold, - offset, algo); - } else { - nms_detection(output_box_num, output_mode, (uint32_t *)output, - (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram, - taskDim, input_num_boxes, max_output_size, iou_threshold, - confidence_threshold, offset, algo); - } - } else { - if (data_type_input == CNRT_FLOAT32) { - nms_detection(output_box_num, output_mode, (float *)output, score_data, - boxes_data, GDRAM, sram, taskDim, input_num_boxes, - max_output_size, iou_threshold, confidence_threshold, - offset, algo); - } else { - nms_detection(output_box_num, output_mode, (half *)output, - (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram, - taskDim, input_num_boxes, max_output_size, iou_threshold, - confidence_threshold, offset, algo); - } - } - ((uint32_t *)result_num)[0] = output_box_num; -} - -template -__mlu_func__ void nms_detection_ux( - int32_t *exit_flag, uint32_t &output_box_num, OUT_DT *output_dram, - IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram, - const int input_num_boxes, const int max_output_size, - const float thresh_iou, const float thresh_score, const float offset, - const int output_mode, const int algo, char *cdma_gdram) { - exit_flag[0] = 0; - - IN_DT *sram = (IN_DT *)sram_buffer; - - // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 - int nms_buffer_count1 = 9; - // temp nram buffer to store selected target. 
- int nram_save_limit_count = 256; - float div_thresh_iou = 1.0 / thresh_iou; - - // input data ptr - const IN_DT *input_x1_ptr = boxes_data; - const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes; - const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes; - const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes; - - int limit = 0; // find limit when GDRAM or SRAM - int max_seg_pad = 0; // the max length every repeat - int repeat = 0; - int remain = 0; - int remain_pad = 0; - int nram_save_count = 0; - - if (output_mode == 0) { - limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - - nram_save_limit_count * sizeof(OUT_DT)) / - (nms_buffer_count1 * sizeof(IN_DT)); - } else { - limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - - nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) / - (nms_buffer_count1 * sizeof(IN_DT)); - } - - int input_offset = 0; - int max_seg_iou_compute = 0; - int repeat_iou_compute = 0; - int remain_iou_compute = 0; - int remain_pad_iou_compute = 0; - - getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset, - max_seg_pad, repeat, remain, remain_pad, - max_seg_iou_compute, repeat_iou_compute, - remain_iou_compute, remain_pad_iou_compute); - // init the nram ptr - IN_DT *score = (IN_DT *)nram_buffer; - IN_DT *x1 = score + max_seg_pad; - IN_DT *y1 = x1 + max_seg_pad; - IN_DT *x2 = y1 + max_seg_pad; - IN_DT *y2 = x2 + max_seg_pad; - IN_DT *inter_x1 = y2 + max_seg_pad; - IN_DT *inter_y1 = inter_x1 + max_seg_pad; - IN_DT *inter_x2 = inter_y1 + max_seg_pad; - IN_DT *inter_y2 = inter_x2 + max_seg_pad; - IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2 - OUT_DT *nram_save = - (OUT_DT *)((char *)max_box + - NFU_ALIGN_SIZE); // offset two line from max_box -#if __BANG_ARCH__ >= 300 - float max_box_x1 = 0; - float max_box_y1 = 0; - float max_box_x2 = 0; - float max_box_y2 = 0; -#endif - mluMemcpyDirection_t load_dir = SRAM2NRAM; - mluMemcpyDirection_t 
store_dir = NRAM2SRAM; - load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; - store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM; - - for (int keep = 0; keep < max_output_size; - keep++) { // loop until the max_score <= 0 - __sync_all(); - - int max_index = 0; - int global_max_index = 0; // for Ux - float max_area = 0; // the max socre area - max_box[0] = 0; // init 0 - - if (coreId == 0) { - findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr, - input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir, - input_offset, repeat, remain, remain_pad, max_seg_pad, - max_index); - // copy max box info to sram - __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); - } - __sync_all(); -#if __BANG_ARCH__ >= 590 - __memcpy((char *)cdma_gdram + REDUCE_NUM * clusterId * sizeof(IN_DT), sram, - REDUCE_NUM * sizeof(IN_DT), SRAM2GDRAM); - __sync_all(); - if (clusterId == 0 && coreId == 0) { - __bang_write_zero(inter_x1, NMS_SIZE); - __memcpy((char *)inter_x1, (char *)cdma_gdram, sizeof(IN_DT), GDRAM2NRAM, - sizeof(IN_DT), REDUCE_NUM * sizeof(IN_DT), clusterDim - 1); - __bang_max(max_box, inter_x1, NMS_SIZE); - int max_cluster = (sizeof(IN_DT) == sizeof(half)) - ? 
((uint16_t *)max_box)[1] - : ((uint32_t *)max_box)[1]; - __memcpy((char *)cdma_gdram, - (char *)cdma_gdram + max_cluster * REDUCE_NUM * sizeof(IN_DT), - REDUCE_NUM * sizeof(IN_DT), GDRAM2GDRAM); - } - __sync_all(); - __memcpy(max_box, cdma_gdram, REDUCE_NUM * sizeof(IN_DT), GDRAM2NRAM); -#else - findGlobalMaxBox(max_box, sram, inter_x1); -#endif - -#if __BANG_ARCH__ >= 300 - calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1, - max_box_x2, max_box_y2); -#else - calMaxArea(max_box, algo, offset, max_area); -#endif - global_max_index = ((uint32_t *)(max_box + 5))[0]; - if (coreId != MEMORY_CORE) { - score_data[global_max_index] = 0; - } - - storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count, - max_output_size, thresh_score, output_mode, nram_save_count, - output_box_num); - - if (float(max_box[0]) <= thresh_score) { - if (clusterId == 0 && coreId == 0) { - exit_flag[0] = 1; // dram - } - } - __sync_all(); - if (exit_flag[0] == 1) { - break; - } -/******NMS STORE END******/ -#if __BANG_ARCH__ >= 300 - scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr, - input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1, - inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1, - max_box_x2, max_box_y2, nram_save, repeat_iou_compute, - remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute, - max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset, - max_area, input_num_boxes, algo); -#else - scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr, - input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1, - inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2], - max_box[3], max_box[4], nram_save, repeat_iou_compute, - remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute, - max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset, - max_area, input_num_boxes, algo); -#endif - } // for max_output_size -} - -__mlu_global__ void MLUUionXKernelNMS( - 
const void *input_boxes, const void *input_confidence, - const int input_num_boxes, const int max_output_size, - const float iou_threshold, const float confidence_threshold, - const float offset, const cnrtDataType_t data_type_input, - const int output_mode, const int algo, void *workspace, void *result_num, - void *output) { - int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2; - int32_t *exit_flag = (int32_t *)((char *)workspace + - INFO_NUM * input_num_boxes * input_dwidth); - char *cdma_addr = (char *)exit_flag + sizeof(int32_t); - int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth; - int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size; - - int cluster_score_size = input_num_boxes * input_dwidth; - int cluster_boxes_size = input_num_boxes * 4 * input_dwidth; - char *sram_score = (char *)sram_buffer + reduce_sram_size; - char *sram_boxes = - (char *)sram_buffer + reduce_sram_size + cluster_score_size; - Addr input_ram = GDRAM; - if ((cluster_score_size + cluster_boxes_size) < availbale_sram_size) { - input_ram = SRAM; - __memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM); - __memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM); - } else { - __memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM); - } - __sync_cluster(); - - uint32_t output_box_num = 0; - float *score_data; - float *boxes_data; - score_data = (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; - boxes_data = (input_ram == SRAM) ? 
(float *)sram_boxes : (float *)input_boxes; - - if (output_mode == 0) { - if (data_type_input == CNRT_FLOAT32) { - nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output, - score_data, boxes_data, input_ram, input_num_boxes, - max_output_size, iou_threshold, confidence_threshold, - offset, output_mode, algo, cdma_addr); - } else { - nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output, - (half *)score_data, (half *)boxes_data, input_ram, - input_num_boxes, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo, - cdma_addr); - } - } else { - if (data_type_input == CNRT_FLOAT32) { - nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data, - boxes_data, input_ram, input_num_boxes, max_output_size, - iou_threshold, confidence_threshold, offset, output_mode, - algo, cdma_addr); - } else { - nms_detection_ux(exit_flag, output_box_num, (half *)output, - (half *)score_data, (half *)boxes_data, input_ram, - input_num_boxes, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo, - cdma_addr); - } - } - ((uint32_t *)result_num)[0] = output_box_num; -} - -void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const cnrtDataType_t data_type_input, const void *boxes_ptr, - const void *scores_ptr, const int input_num_boxes, - const int max_output_boxes, const float iou_threshold, - const float offset, void *workspace_ptr, void *output_size_ptr, - void *output_ptr) { - switch (k_type) { - default: { return; } - case CNRT_FUNC_TYPE_BLOCK: - case CNRT_FUNC_TYPE_UNION1: { - MLUUnion1KernelNMS<<>>( - (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes, - max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, - /*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr, - data_type_input, offset, /*algo=*/1); - }; break; - case CNRT_FUNC_TYPE_UNION2: - case CNRT_FUNC_TYPE_UNION4: - case CNRT_FUNC_TYPE_UNION8: - case CNRT_FUNC_TYPE_UNION16: { - 
MLUUionXKernelNMS<<>>( - (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes, - max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset, - data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr, - output_size_ptr, output_ptr); - }; break; - } -} diff --git a/mmcv/ops/csrc/common/mlu/nms_utils.hpp b/mmcv/ops/csrc/common/mlu/nms_utils.hpp deleted file mode 100644 index 61f5ba9..0000000 --- a/mmcv/ops/csrc/common/mlu/nms_utils.hpp +++ /dev/null @@ -1,553 +0,0 @@ -/************************************************************************* - * Copyright (C) [2019-2022] by Cambricon, Inc. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef NMS_UTILS_HPP_ -#define NMS_UTILS_HPP_ -#include "common_mlu_helper.hpp" - -#define NMS_SIZE (64) -#define NMS_UP(x, y) (x / y + (int)(x % y > 0)) * y -#define NMS_DOWN(x, y) (x / y) * y -#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score -#define MEMORY_CORE (0x80) -#define REDUCE_NUM \ - (7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input) - -__mlu_func__ void pvLock() { -#if __BANG_ARCH__ == 270 - if (coreId != MEMORY_CORE) { - __bang_lock(0, 0); - } -#endif -} - -__mlu_func__ void pvUnlock() { -#if __BANG_ARCH__ == 270 - if (coreId != MEMORY_CORE) { - __bang_unlock(0, 0); - } -#endif -} - -template -static __mlu_func__ void computeReluN(T *nram_dst, T *nram_src, void *nram_tmp, - const int deal_num, - const T threshold = 0) { - if (threshold < 0) { - return; - } - if (threshold) { -#if __BANG_ARCH__ >= 300 - __bang_relun(nram_dst, nram_src, deal_num, threshold); -#else - int align_num = NFU_ALIGN_SIZE / sizeof(T); - T *nram_aux_a = (T *)nram_tmp; - T *nram_aux_b = nram_aux_a + deal_num; - T *nram_zero = nram_aux_b + align_num; - __bang_write_value(nram_aux_b, align_num, threshold); - __bang_write_zero(nram_zero, align_num); - __bang_cycle_lt((T *)nram_aux_a, nram_src, (T *)nram_aux_b, deal_num, - align_num); - __bang_mul(nram_dst, nram_src, (T *)nram_aux_a, deal_num); - __bang_cycle_eq((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_zero, deal_num, - align_num); - __bang_cycle_mul((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_aux_b, - deal_num, align_num); - __bang_add(nram_dst, nram_dst, (T *)nram_aux_a, deal_num); - __bang_cycle_gt((T *)nram_aux_a, nram_dst, (T *)nram_zero, deal_num, - align_num); - __bang_mul(nram_dst, nram_dst, (T *)nram_aux_a, deal_num); -#endif - } else { -#if __BANG_ARCH__ >= 300 - __bang_relu(nram_dst, nram_src, deal_num); -#else - __bang_active_relu(nram_dst, nram_src, deal_num); -#endif - } -} - -__mlu_func__ void 
getComputeParamsBlockOrU1( - const int input_dwidth, const int input_box_num, const int limit, - const int core_limit, int &input_offset, int &max_seg_pad, int &repeat, - int &remain, int &remain_pad, int &max_seg_iou_compute, - int &repeat_iou_compute, int &remain_iou_compute, - int &remain_pad_iou_compute) { - int avg_core = input_box_num / core_limit; - int rem = input_box_num % core_limit; - int len_core = avg_core + (coreId < rem ? 1 : 0); - input_offset = avg_core * coreId + (coreId <= rem ? coreId : rem); - max_seg_pad = NMS_DOWN(limit, NMS_SIZE); - repeat = len_core / max_seg_pad; - remain = len_core % max_seg_pad; - remain_pad = NMS_UP(remain, NMS_SIZE); - - // if datatype is fp16, we should cvt to fp32 when compute iou - max_seg_iou_compute = NMS_DOWN(max_seg_pad / (4 / input_dwidth), NMS_SIZE); - repeat_iou_compute = len_core / max_seg_iou_compute; - remain_iou_compute = len_core % max_seg_iou_compute; - remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE); -} - -__mlu_func__ void getComputeParamsUx( - const int input_dwidth, const int input_num_boxes, const int limit, - int &input_offset, int &max_seg_pad, int &repeat, int &remain, - int &remain_pad, int &max_seg_iou_compute, int &repeat_iou_compute, - int &remain_iou_compute, int &remain_pad_iou_compute) { - // data split - int avg_cluster = input_num_boxes / clusterDim; - int rem_cluster = input_num_boxes % clusterDim; - int len_cluster = avg_cluster + (clusterId < rem_cluster); - int cluster_offset = avg_cluster * clusterId + - (clusterId <= rem_cluster ? clusterId : rem_cluster); - - int avg_core = len_cluster / coreDim; - int rem_core = len_cluster % coreDim; - int len_core = avg_core + (coreId < rem_core); - int core_offset = - avg_core * coreId + (coreId <= rem_core ? 
coreId : rem_core); - input_offset = cluster_offset + core_offset; - - max_seg_pad = NMS_DOWN(limit, NMS_SIZE); - - // core 0 of each cluster calculate the max score index - int max_index_len_core = avg_cluster + (clusterId < rem_cluster); - repeat = max_index_len_core / max_seg_pad; - remain = max_index_len_core % max_seg_pad; - remain_pad = NMS_UP(remain, NMS_SIZE); - // if datatype is fp16, we should cvt to fp32 when compute iou - max_seg_iou_compute = - NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE); - repeat_iou_compute = len_core / max_seg_iou_compute; - remain_iou_compute = len_core % max_seg_iou_compute; - remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE); -} - -template -__mlu_func__ void findGlobalMaxBox(IN_DT *max_box, IN_DT *sram, - IN_DT *inter_x1) { - // copy all partial max to the sram of cluster 0 - if (clusterId != 0) { - __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT), - SRAM2SRAM, 0); - } - __sync_all(); - - // reduce between clusters to get the global max box - if (clusterId == 0) { - if (coreId == 0) { - __bang_write_zero(inter_x1, NMS_SIZE); - __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), - REDUCE_NUM * sizeof(IN_DT), clusterDim - 1); - __bang_max(max_box, inter_x1, NMS_SIZE); - int max_cluster = (sizeof(IN_DT) == sizeof(half)) - ? 
((uint16_t *)max_box)[1] - : ((uint32_t *)max_box)[1]; - __memcpy(max_box, sram + max_cluster * REDUCE_NUM, - REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); - __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); - } - __sync_cluster(); - if (coreId == 0x80 && clusterDim > 1) { - // broadcast global max box to each cluster's sram - for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) { - __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM, - cluster_idx); - } - } - __sync_cluster(); - } - __sync_all(); - - // copy the global max box to max_box - __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); -} - -template -__mlu_func__ void findCoreMaxBox( - IN_DT *input_score_ptr, IN_DT *score, IN_DT *inter_x1, IN_DT *max_box, - const IN_DT *input_x1_ptr, const IN_DT *input_y1_ptr, - const IN_DT *input_x2_ptr, const IN_DT *input_y2_ptr, - const mluMemcpyDirection_t load_dir, const int input_offset, - const int repeat, const int remain, const int remain_pad, - const int max_seg_pad, int &max_index) { - if (coreId != 0x80) { - for (int i = 0; i <= repeat; i++) { - if (i == repeat && remain == 0) { - break; - } - int seg_len = 0; // the length every nms compute - int cpy_len = 0; // the length every nms memcpy - i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad; - i == repeat ? 
cpy_len = remain : cpy_len = max_seg_pad; - /******NMS LOAD START******/ - __bang_write_zero(score, seg_len); - __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - - /******NMS LOAD END******/ - - __bang_max(inter_x1, score, seg_len); - if (inter_x1[0] > max_box[0]) { - max_box[0] = inter_x1[0]; - if (sizeof(IN_DT) == sizeof(half)) { - max_index = ((uint16_t *)inter_x1)[1] + input_offset + - i * max_seg_pad; // offset start from head of input_data - } else if (sizeof(IN_DT) == sizeof(float)) { - max_index = ((uint32_t *)inter_x1)[1] + input_offset + - i * max_seg_pad; // offset start from head of input_data - } - } - } // for repeat - // the max box's x1, y1, x2, y2 on every core - max_box[1] = input_x1_ptr[max_index]; - max_box[2] = input_y1_ptr[max_index]; - max_box[3] = input_x2_ptr[max_index]; - max_box[4] = input_y2_ptr[max_index]; - ((uint32_t *)(max_box + 5))[0] = max_index; - } -} - -template -__mlu_func__ void findClusterMaxBox(IN_DT *sram, IN_DT *max_box, - IN_DT *inter_x1, IN_DT *input_data_score, - const int core_limit) { - // find the max with sram - // copy every core's box info to sram, form: score---x1---y1---x2---y2--- - __memcpy(sram + REDUCE_NUM * coreId, max_box, REDUCE_NUM * sizeof(IN_DT), - NRAM2SRAM); // int32_t datatype - __sync_cluster(); - - // copy score from sram to nram and find the max - __bang_write_zero(inter_x1, 64); - __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), - REDUCE_NUM * sizeof(IN_DT), coreDim - 1); - __bang_max(max_box, inter_x1, 64); - int max_core = sizeof(IN_DT) == sizeof(half) ? 
((uint16_t *)max_box)[1] - : ((uint32_t *)max_box)[1]; - // copy the max box to max_box - __memcpy(max_box, sram + max_core * REDUCE_NUM, REDUCE_NUM * sizeof(IN_DT), - SRAM2NRAM); -} - -/*****************************************************************************/ -/*******************************CALCULATE MAX AREA****************************/ -/*****************************************************************************/ - -template -__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset, - float &max_area) { - if (algo == 0 || offset == 0.0) { - max_area = ((float)max_box[3] - (float)max_box[1]) * - ((float)max_box[4] - (float)max_box[2]); - } else { - max_area = ((float)max_box[3] - (float)max_box[1] + offset) * - ((float)max_box[4] - (float)max_box[2] + offset); - } -} - -template -__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset, - float &max_area, float &max_box_x1, - float &max_box_y1, float &max_box_x2, - float &max_box_y2) { - // the case of random inf will break the requirement of x1<=x2, y1<=y2 - // so exchange it if it happens. 
- max_box_x1 = float(max_box[1]); - max_box_x2 = float(max_box[3]); - if (max_box[1] > max_box[3]) { - max_box_x1 = float(max_box[3]); - max_box_x2 = float(max_box[1]); - } - max_box_y1 = float(max_box[2]); - max_box_y2 = float(max_box[4]); - if (max_box[2] > max_box[4]) { - max_box_y1 = float(max_box[4]); - max_box_y2 = float(max_box[2]); - } - if (algo == 0 || offset == 0.0) { - max_area = (max_box_x2 - max_box_x1) * (max_box_y2 - max_box_y1); - } else { - max_area = - (max_box_x2 - max_box_x1 + offset) * (max_box_y2 - max_box_y1 + offset); - } -} - -/***********************************************************************/ -/*******************************STORE RESULT****************************/ -/***********************************************************************/ -template -__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save, - OUT_DT *&output_dram, const int keep, - const int nram_save_limit_count, - const int max_output_size, - const float thresh_score, const int output_mode, - int &nram_save_count, uint32_t &output_box_num) { - /******NMS STORE START******/ - // store to nram - if (float(max_box[0]) > thresh_score) { - OUT_DT *save_ptr; - int save_offset = 0; - int save_str_num = 0; - save_ptr = nram_save; - save_offset = nram_save_count; - save_str_num = nram_save_limit_count; - if (clusterId == 0 && coreId == 0) { - if (output_mode == 0) { // index1, index2, ... 
- save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0]; - } else if (output_mode == 1) { // score, x1, y1, x2, y2 - __memcpy(save_ptr + save_offset * INFO_NUM, max_box, - INFO_NUM * sizeof(IN_DT), NRAM2NRAM, INFO_NUM * sizeof(IN_DT), - INFO_NUM * sizeof(IN_DT), 0); - } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- - __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), NRAM2NRAM, - save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), 4); - } - } - nram_save_count++; - output_box_num++; - } - - // store to sram/gdram - if (output_box_num != 0) { - if ((nram_save_count == nram_save_limit_count) || - (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) { - if (nram_save_count != 0) { - if (clusterId == 0 && coreId == 0) { - if (output_mode == 0) { // index1, index2, ... - pvLock(); - __memcpy(output_dram, nram_save, nram_save_count * sizeof(uint32_t), - NRAM2GDRAM); - pvUnlock(); - output_dram += nram_save_count; - } else if (output_mode == 1) { // score, x1, y1, x2, y2 - pvLock(); - __memcpy(output_dram, nram_save, - nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM); - pvUnlock(); - output_dram += nram_save_count * INFO_NUM; - } else if (output_mode == - 2) { // score---, x1---, y1---, x2---, y2--- - pvLock(); - __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT), - NRAM2GDRAM, max_output_size * sizeof(IN_DT), - nram_save_limit_count * sizeof(IN_DT), 4); - pvUnlock(); - output_dram += nram_save_count; - } - nram_save_count = 0; - } - } - } // if move data nram->sram/gdram - } // if dst -} - -template -__mlu_func__ void scoreUpdate( - IN_DT *input_score_ptr, const mluMemcpyDirection_t load_dir, - const mluMemcpyDirection_t store_dir, const IN_DT *input_x1_ptr, - const IN_DT *input_y1_ptr, const IN_DT *input_x2_ptr, - const IN_DT *input_y2_ptr, IN_DT *x1, IN_DT *y1, IN_DT *x2, IN_DT *y2, - IN_DT *score, IN_DT *inter_x1, IN_DT *inter_y1, IN_DT *inter_x2, - IN_DT *inter_y2, IN_DT *max_box, const float 
max_box_x1, - const float max_box_y1, const float max_box_x2, const float max_box_y2, - OUT_DT *nram_save, int repeat_iou_compute, int remain_iou_compute, - int remain_pad_iou_compute, int max_seg_iou_compute, int max_seg_pad, - const float thresh_iou, const float div_thresh_iou, const int input_offset, - const float offset, const float max_area, const int input_num_boxes, - const int algo) { - for (int i = 0; i <= repeat_iou_compute; i++) { - if (i == repeat_iou_compute && remain_iou_compute == 0) { - break; - } - int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute - : max_seg_iou_compute; - int cpy_len = - (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute; - /******NMS LOAD START******/ - int dt_offset = 0; - if (sizeof(IN_DT) == sizeof(float)) { - __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - dt_offset = 0; - } else if (sizeof(IN_DT) == sizeof(half)) { - __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - __bang_half2float((float *)score, (half *)x1, seg_len); - dt_offset = max_seg_iou_compute; - } -#if __BANG_ARCH__ >= 300 - __memcpy(inter_x1 + dt_offset, - input_x1_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT), - input_num_boxes * sizeof(IN_DT), 3); - - if (sizeof(IN_DT) == sizeof(half)) { - __bang_half2float((float *)inter_x1, - (half *)inter_x1 + max_seg_iou_compute, seg_len); - __bang_half2float((float *)inter_y1, - (half *)inter_y1 + max_seg_iou_compute, seg_len); - __bang_half2float((float *)inter_x2, - (half *)inter_x2 + max_seg_iou_compute, seg_len); - __bang_half2float((float *)inter_y2, - (half *)inter_y2 + max_seg_iou_compute, seg_len); - } - // box transfer - __bang_minequal((float *)x1, (float *)inter_x1, (float *)inter_x2, 
seg_len); - __bang_maxequal((float *)x2, (float *)inter_x1, (float *)inter_x2, seg_len); - __bang_minequal((float *)y1, (float *)inter_y1, (float *)inter_y2, seg_len); - __bang_maxequal((float *)y2, (float *)inter_y1, (float *)inter_y2, seg_len); - // 1〠compute IOU - // get the area_I - __bang_maxeq_scalar((float *)inter_x1, (float *)x1, max_box_x1, - seg_len); // inter_x1 - __bang_mineq_scalar((float *)inter_x2, (float *)x2, max_box_x2, - seg_len); // inter_x2 - __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len); - } - computeReluN((float *)inter_x1, (float *)inter_x1, NULL, - seg_len); // inter_w - __bang_maxeq_scalar((float *)inter_y1, (float *)y1, float(max_box_y1), - seg_len); // inter_y1 - __bang_mineq_scalar((float *)inter_y2, (float *)y2, float(max_box_y2), - seg_len); // inter_y2 - __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len); - } - computeReluN((float *)inter_y1, (float *)inter_y1, NULL, - seg_len); // inter_h - __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, - seg_len); // area_I - // get the area of input_box: area = (x2 - x1) * (y2 - y1); - if (algo == 1 && offset != 0.0) { - __bang_fusion(FUSION_FSA, (float *)inter_y1, (float *)x2, (float *)x1, - offset, seg_len, seg_len); - __bang_fusion(FUSION_FSA, (float *)inter_y2, (float *)y2, (float *)y1, - offset, seg_len, seg_len); - __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, - seg_len); // area - } else { - __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); - __bang_fusion(FUSION_FSM, (float *)inter_x2, (float *)y2, (float *)y1, - (float *)inter_y1, seg_len, seg_len); - } - // get the area_U: area + max_area - area_I - __bang_fusion(FUSION_FAS, (float *)inter_x2, (float 
*)inter_x2, max_area, - (float *)inter_x1, seg_len, seg_len); - // 2〠select the box - // if IOU greater than thres, set the score to zero, abort it: area_U > - // area_I * (1 / thresh)? - if (thresh_iou > 0.0) { - __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou, - seg_len); - } else { - __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou, - seg_len); - } - // process for nan - __bang_lt((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len); - __bang_not((float *)inter_x1, (float *)inter_x1, seg_len); - __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); -/******NMS COMPUTE END******/ -#else - __memcpy(x1 + dt_offset, - input_x1_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT), - input_num_boxes * sizeof(IN_DT), 3); - if (sizeof(IN_DT) == sizeof(half)) { - __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, seg_len); - __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, seg_len); - __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, seg_len); - __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, seg_len); - } - // 1〠compute IOU - // get the area_I - __bang_write_value((float *)inter_y1, seg_len, - float(max_box[1])); // max_x1 - __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, - seg_len); // inter_x1 - __bang_write_value((float *)inter_y2, seg_len, - float(max_box[3])); // max_x2 - __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, - seg_len); // inter_x2 - __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len); - } - computeReluN((float *)inter_x1, (float *)inter_x1, NULL, - seg_len); // inter_w - __bang_write_value((float *)inter_x2, seg_len, - float(max_box[2])); // max_y1 - __bang_maxequal((float *)inter_y1, (float 
*)y1, (float *)inter_x2, - seg_len); // inter_y1 - __bang_write_value((float *)inter_x2, seg_len, - float(max_box[4])); // max_y2 - __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, - seg_len); // inter_y2 - __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len); - } - computeReluN((float *)inter_y1, (float *)inter_y1, NULL, - seg_len); // inter_h - __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, - seg_len); // area_I - // get the area of input_box: area = (x2 - x1) * (y2 - y1); - __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); - __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len); - __bang_add_scalar((float *)inter_y2, (float *)inter_y2, offset, seg_len); - } - __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, - seg_len); // area - // get the area_U: area + max_area - area_I - __bang_add_scalar((float *)inter_x2, (float *)inter_x2, float(max_area), - seg_len); - __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, - seg_len); // area_U - // 2〠select the box - // if IOU greater than thresh, set the score to zero, abort it: area_U > - // area_I * (1 / thresh)? 
- if (thresh_iou > 0.0) { - __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou, - seg_len); - } else { - __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou, - seg_len); - } - __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len); - __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); -/******NMS COMPUTE END******/ -#endif - // update the score - if (sizeof(IN_DT) == sizeof(half)) { - convertFloat2half((half *)score, (float *)score, seg_len); - } - pvLock(); - __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, - cpy_len * sizeof(IN_DT), store_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - pvUnlock(); - } -} - -#endif // NMS_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu deleted file mode 100644 index 055ee4f..0000000 --- a/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu +++ /dev/null @@ -1,615 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "common_mlu_helper.hpp" -#include "psamask_utils.hpp" - -#define COMPUTE_COUNT_ALIGN 64 - -__nram__ char buf[MAX_NRAM_SIZE]; - -template -__mlu_func__ void swap(T &a, T &b) { - T tmp = a; - a = b; - b = tmp; -} - -template -__mlu_func__ void storeDataFromNramToDram(T *dst, const T *src, - const PositionInCore &position, - const Shape &shape_full) { - int n_offset = shape_full.h * shape_full.w * shape_full.c; - int h_offset = shape_full.w * shape_full.c; - int w_offset = shape_full.c; - int n_seg = position.n_end - position.n_start; - int h_seg = position.h_end - position.h_start; - int w_seg = position.w_end - position.w_start; - int size = h_seg * w_seg * shape_full.c; - - __memcpy(dst + position.n_start * n_offset + position.h_start * h_offset + - position.w_start * w_offset, - src, size * sizeof(T), NRAM2GDRAM, n_offset * sizeof(T), - size * sizeof(T), n_seg - 1); -} - -template -__mlu_func__ void loadDataFromDramToNram(T *dst, const T *src, - const PositionInCore &position, - const Shape &shape_full) { - int n_offset = shape_full.h * shape_full.w * shape_full.c; - int h_offset = shape_full.w * shape_full.c; - int w_offset = shape_full.c; - int n_seg = position.n_end - position.n_start; - int h_seg = position.h_end - position.h_start; - int w_seg = position.w_end - position.w_start; - int size = h_seg * w_seg * shape_full.c; - - __memcpy(dst, src + position.n_start * n_offset + - position.h_start * h_offset + position.w_start * w_offset, - size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T), - n_seg - 1); -} - -// transpose the data from A*B*C*(D*E) to A*D*E*(B*C) -template -__mlu_func__ void transposeData(T *dst, T *src, const Shape &shape_seg) { - int align_c = CEIL_ALIGN(shape_seg.c, COMPUTE_COUNT_ALIGN / sizeof(T)); - int align_hw = - CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); - for (int i = 0; i < shape_seg.n; ++i) { - 
__bang_transpose(dst, src, align_hw, align_c); - dst += align_hw * align_c; - src += align_hw * align_c; - } -} - -template -__mlu_func__ void psamaskCollectForward( - const T *x_dram, T *y_dram, const PositionInCore &position, - const Shape &x_full, const Shape &y_full, const Shape &shape_seg, - const int h_mask, const int w_mask, const int half_h_mask, - const int half_w_mask) { - T *x_nram = (T *)buf; - T *y_nram = - x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c, - COMPUTE_COUNT_ALIGN / sizeof(T)); - loadDataFromDramToNram(x_nram, x_dram, position, x_full); - - // fill zeros to output - int elem_count = - CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c, - NFU_ALIGN_SIZE / sizeof(T)); - __bang_write_value(y_nram, elem_count, (T)0); - - int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c; - int y_h_offset = shape_seg.w * shape_seg.c; - int y_w_offset = shape_seg.c; - int x_n_offset = shape_seg.h * shape_seg.w * x_full.c; - int y_c_offset = 1; - int x_h_offset = shape_seg.w * x_full.c; - int x_w_offset = x_full.c; - int x_c_offset = 1; - int x_start = 0; - int y_start = 0; - for (int nidx = 0; nidx < shape_seg.n; ++nidx) { - for (int hidx = 0; hidx < shape_seg.h; ++hidx) { - for (int widx = 0; widx < shape_seg.w; ++widx) { - int h_abs = hidx + position.h_start; - int w_abs = widx + position.w_start; - int y_offset = y_start; - int x_offset = x_start; - y_offset += hidx * y_h_offset + widx * y_w_offset; - x_offset += hidx * x_h_offset + widx * x_w_offset; - - const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0; - const int hend = x_full.h + half_h_mask - h_abs < h_mask - ? x_full.h + half_h_mask - h_abs - : h_mask; - const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; - const int wend = x_full.w + half_w_mask - w_abs < w_mask - ? 
x_full.w + half_w_mask - w_abs - : w_mask; - // (h, w ) with mask-indexed - // (h + hidx - half_h_mask, w + widx - half_w_mask) with feature-indexed - y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart + - w_abs - half_w_mask) * - y_c_offset; - x_offset += (hstart * w_mask + wstart) * x_c_offset; - int count = wend - wstart; - __memcpy(y_nram + y_offset, x_nram + x_offset, count * sizeof(T), - NRAM2NRAM, y_c_offset * x_full.w * sizeof(T), - x_c_offset * w_mask * sizeof(T), hend - hstart - 1); - } - } - y_start += y_n_offset; - x_start += x_n_offset; - } - storeDataFromNramToDram(y_dram, y_nram, position, y_full); -} - -template -__mlu_func__ void psamaskDistributeForward( - const T *x_dram, T *y_dram, const PositionInCore &position, - const Shape &x_full, const Shape &y_full, const Shape &shape_seg, - const int h_mask, const int w_mask, const int half_h_mask, - const int half_w_mask) { - T *x_nram = (T *)buf; - T *y_nram_temp = - x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c, - COMPUTE_COUNT_ALIGN / sizeof(T)); - loadDataFromDramToNram(x_nram, x_dram, position, x_full); - - // fill zeros to output - int align_c = CEIL_ALIGN(y_full.c, COMPUTE_COUNT_ALIGN / sizeof(T)); - int align_hw = - CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); - int elem_count = - CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T)); - __bang_write_value(y_nram_temp, elem_count, (T)0); - - int y_n_offset = align_hw * align_c; - int y_h_offset = shape_seg.w * align_c; - int y_w_offset = align_c; - int y_c_offset = 1; - int x_n_offset = shape_seg.h * shape_seg.w * x_full.c; - int x_h_offset = shape_seg.w * x_full.c; - int x_w_offset = x_full.c; - int x_c_offset = 1; - int h_feature = y_full.h; - int w_feature = y_full.w; - - int y_start = 0; - int x_start = 0; - for (int nidx = 0; nidx < shape_seg.n; ++nidx) { - for (int hidx = 0; hidx < shape_seg.h; ++hidx) { - for (int widx = 0; widx < shape_seg.w; ++widx) { - 
int h_abs = hidx + position.h_start; - int w_abs = widx + position.w_start; - int y_offset = y_start; - int x_offset = x_start; - y_offset += hidx * y_h_offset + widx * y_w_offset; - x_offset += hidx * x_h_offset + widx * x_w_offset; - const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0; - const int hend = h_feature + half_h_mask - h_abs < h_mask - ? h_feature + half_h_mask - h_abs - : h_mask; - const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; - const int wend = w_feature + half_w_mask - w_abs < w_mask - ? w_feature + half_w_mask - w_abs - : w_mask; - // (h, w ) with mask-indexed - // (h + hidx - half_h_mask, w + widx - half_w_mask) with feature-indexed - y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart + - w_abs - half_w_mask) * - y_c_offset; - x_offset += (hstart * w_mask + wstart) * x_c_offset; - int count = wend - wstart; - __memcpy(y_nram_temp + y_offset, x_nram + x_offset, count * sizeof(T), - NRAM2NRAM, y_c_offset * w_feature * sizeof(T), - x_c_offset * w_mask * sizeof(T), hend - hstart - 1); - } - } - y_start += y_n_offset; - x_start += x_n_offset; - } - // transpose y - T *y_nram = y_nram_temp + shape_seg.n * align_hw * align_c; - Shape y_seg{shape_seg.n, shape_seg.h, shape_seg.w, y_full.c}; - transposeData(y_nram, y_nram_temp, y_seg); - swap(align_c, align_hw); - // store y from nram to dram - int y_n_offset_full = y_full.h * y_full.w * y_full.c; - int y_w_offset_full = y_full.c; - int y_c_offset_full = 1; - - int y_dram_start = - position.n_start * y_n_offset_full + - (position.h_start * y_full.w + position.w_start) * y_c_offset_full; - int y_nram_start = 0; - for (int nidx = 0; nidx < shape_seg.n; ++nidx) { - int y_dram_offset = y_dram_start + nidx * y_n_offset_full; - int y_nram_offset = y_nram_start + nidx * align_hw * align_c; - __memcpy(y_dram + y_dram_offset, y_nram + y_nram_offset, - shape_seg.h * shape_seg.w * sizeof(T), NRAM2GDRAM, - y_w_offset_full * sizeof(T), align_c * sizeof(T), - 
h_feature * w_feature - 1); - } -} - -template -__mlu_func__ void psamaskCollectBackward( - const T *dy_dram, T *dx_dram, const PositionInCore &position, - const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg, - const int h_mask, const int w_mask, const int half_h_mask, - const int half_w_mask) { - T *dy_nram = (T *)buf; - T *dx_nram = - dy_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * dy_full.c, - COMPUTE_COUNT_ALIGN / sizeof(T)); - loadDataFromDramToNram(dy_nram, dy_dram, position, dy_full); - - // fill zeros to output - int elem_count = - CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c, - NFU_ALIGN_SIZE / sizeof(T)); - __bang_write_value(dx_nram, elem_count, (T)0); - - int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c; - int dy_h_offset = shape_seg.w * dy_full.c; - int dy_w_offset = dy_full.c; - int dy_c_offset = 1; - int dx_n_offset = shape_seg.h * shape_seg.w * dx_full.c; - int dx_h_offset = shape_seg.w * dx_full.c; - int dx_w_offset = dx_full.c; - int dx_c_offset = 1; - int h_feature = dy_full.h; - int w_feature = dy_full.w; - - int dy_start = 0; - int dx_start = 0; - for (int nidx = 0; nidx < shape_seg.n; ++nidx) { - for (int hidx = 0; hidx < shape_seg.h; ++hidx) { - for (int widx = 0; widx < shape_seg.w; ++widx) { - int h_abs = hidx + position.h_start; - int w_abs = widx + position.w_start; - int dy_offset = dy_start; - int dx_offset = dx_start; - dy_offset += hidx * dy_h_offset + widx * dy_w_offset; - dx_offset += hidx * dx_h_offset + widx * dx_w_offset; - - const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0; - const int hend = h_feature + half_h_mask - h_abs < h_mask - ? h_feature + half_h_mask - h_abs - : h_mask; - const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; - const int wend = w_feature + half_w_mask - w_abs < w_mask - ? 
w_feature + half_w_mask - w_abs - : w_mask; - // (h, w ) with mask-indexed - // (h + h_abs - half_h_mask, w + w_abs - half_w_mask) with - // feature-indexed - dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart + - w_abs - half_w_mask) * - dy_c_offset; - dx_offset += (hstart * w_mask + wstart) * dx_c_offset; - int count = wend - wstart; - __memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T), - NRAM2NRAM, dx_c_offset * w_mask * sizeof(T), - dy_c_offset * w_feature * sizeof(T), hend - hstart - 1); - } - } - dy_start += dy_n_offset; - dx_start += dx_n_offset; - } - storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full); -} - -template -__mlu_func__ void psamaskDistributeBackward( - const T *dy_dram, T *dx_dram, const PositionInCore &position, - const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg, - const int h_mask, const int w_mask, const int half_h_mask, - const int half_w_mask) { - // load dy from dram to nram - T *dy_nram_temp = (T *)buf; - int dy_n_offset_full = dy_full.h * dy_full.w * dy_full.c; - int dy_c_offset_full = 1; - int h_feature = dy_full.h; - int w_feature = dy_full.w; - int align_c = - CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); - int align_hw = - CEIL_ALIGN(h_feature * w_feature, COMPUTE_COUNT_ALIGN / sizeof(T)); - - int dy_dram_start = - position.n_start * dy_n_offset_full + - (position.h_start * w_feature + position.w_start) * dy_c_offset_full; - int dy_nram_start = 0; - for (int i = 0; i < shape_seg.n; ++i) { - int dy_nram_offset = dy_nram_start + i * (align_hw * align_c); - int dy_dram_offset = dy_dram_start + i * dy_n_offset_full; - __memcpy(dy_nram_temp + dy_nram_offset, dy_dram + dy_dram_offset, - shape_seg.h * shape_seg.w * sizeof(T), GDRAM2NRAM, - align_c * sizeof(T), dy_full.c * sizeof(T), - h_feature * w_feature - 1); - } - T *dy_nram = dy_nram_temp + shape_seg.n * align_hw * align_c; - Shape dy_seg{shape_seg.n, h_feature, w_feature, shape_seg.h * shape_seg.w}; 
- transposeData(dy_nram, dy_nram_temp, dy_seg); - swap(align_c, align_hw); - - // fill zeros to dx - T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c; - int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c; - __bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), - (T)0); - - int dy_n_offset_seg = align_hw * align_c; - int dy_h_offset_seg = shape_seg.w * align_c; - int dy_w_offset_seg = align_c; - int dy_c_offset_seg = 1; - int dx_n_offset_seg = shape_seg.h * shape_seg.w * shape_seg.c; - int dx_h_offset_seg = shape_seg.w * shape_seg.c; - int dx_w_offset_seg = shape_seg.c; - int dx_c_offset_seg = 1; - - int dy_start = 0; - int dx_start = 0; - for (int nidx = 0; nidx < shape_seg.n; ++nidx) { - for (int hidx = 0; hidx < shape_seg.h; ++hidx) { - for (int widx = 0; widx < shape_seg.w; ++widx) { - int h_abs = hidx + position.h_start; - int w_abs = widx + position.w_start; - int dy_offset = dy_start; - int dx_offset = dx_start; - dy_offset += hidx * dy_h_offset_seg + widx * dy_w_offset_seg; - dx_offset += hidx * dx_h_offset_seg + widx * dx_w_offset_seg; - const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0; - const int hend = h_feature + half_h_mask - h_abs < h_mask - ? h_feature + half_h_mask - h_abs - : h_mask; - const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; - const int wend = w_feature + half_w_mask - w_abs < w_mask - ? 
w_feature + half_w_mask - w_abs - : w_mask; - // (h, w ) with mask-indexed - // (h + h_abs - half_h_mask, w + w_abs - half_w_mask) with - // feature-indexed - dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart + - w_abs - half_w_mask) * - dy_c_offset_seg; - dx_offset += (hstart * w_mask + wstart) * dx_c_offset_seg; - int count = wend - wstart; - __memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T), - NRAM2NRAM, w_mask * dx_c_offset_seg * sizeof(T), - w_feature * dy_c_offset_seg * sizeof(T), hend - hstart - 1); - } - } - dy_start += dy_n_offset_seg; - dx_start += dx_n_offset_seg; - } - storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full); -} - -template -__mlu_func__ void psamaskBase(const T *input_dram, T *output_dram, - const Shape &input_full, const Shape &output_full, - LimitParam &limit, const PsamaskType psa_type, - const DimPartitionType core_partition, - const DimPartitionType cluster_partition, - const bool is_forward, const int h_mask, - const int w_mask, const int half_h_mask, - const int half_w_mask, const int n_per_core, - const int h_per_core, const int n_per_cluster, - const int h_per_cluster) { - PositionInCore position_full; - PositionInCore position_seg; - position_full.w_start = 0; - position_full.w_end = output_full.w; - int n_num_in_cluster = n_per_cluster; - int h_num_in_cluster = h_per_cluster; - - switch (cluster_partition) { - case PARTITION_N: { - position_full.h_start = 0; - position_full.h_end = input_full.h; - position_full.n_start = taskIdY * n_per_cluster; - int cluster_need = (input_full.n + n_per_cluster - 1) / n_per_cluster; - if (taskIdY >= cluster_need) return; - int n_remainder = input_full.n - (cluster_need - 1) * n_per_cluster; - n_num_in_cluster = - (taskIdY == cluster_need - 1) ? 
n_remainder : n_per_cluster; - position_full.n_end = position_full.n_start + n_num_in_cluster; - }; break; - case PARTITION_H: { - position_full.n_start = 0; - position_full.n_end = input_full.n; - position_full.h_start = taskIdY * h_per_cluster; - int cluster_need = (input_full.h + h_per_cluster - 1) / h_per_cluster; - if (taskIdY >= cluster_need) return; - int h_remainder = input_full.h - (cluster_need - 1) * h_per_cluster; - h_num_in_cluster = - (taskIdY == cluster_need - 1) ? h_remainder : h_per_cluster; - position_full.h_end = position_full.h_start + h_num_in_cluster; - }; break; - } - switch (core_partition) { - case PARTITION_N: { - position_full.n_start += taskIdX * n_per_core; - int core_need = (n_num_in_cluster + n_per_core - 1) / n_per_core; - if (taskIdX >= core_need) return; - int n_remainder = n_num_in_cluster - (core_need - 1) * n_per_core; - position_full.n_end = - position_full.n_start + - ((taskIdX == core_need - 1) ? n_remainder : n_per_core); - }; break; - case PARTITION_H: { - position_full.h_start += taskIdX * h_per_core; - int core_need = (h_num_in_cluster + h_per_core - 1) / h_per_core; - if (taskIdX >= core_need) return; - int h_remainder = h_num_in_cluster - (core_need - 1) * h_per_core; - position_full.h_end = - position_full.h_start + - ((taskIdX == core_need - 1) ? h_remainder : h_per_core); - }; break; - } - // the count of n ,h and w need to be processed in the current core - int shape_core_n = position_full.n_end - position_full.n_start; - int shape_core_h = position_full.h_end - position_full.h_start; - int shape_core_w = input_full.w; - - limit.n = limit.n < shape_core_n ? limit.n : shape_core_n; - limit.h = limit.h < shape_core_h ? limit.h : shape_core_h; - limit.w = limit.w < shape_core_w ? 
limit.w : shape_core_w; - - // load the data to nram according to the limit - for (int nidx = position_full.n_start; nidx < position_full.n_end; - nidx += limit.n) { - position_seg.n_start = nidx; - position_seg.n_end = - position_seg.n_start + (position_full.n_end - nidx < limit.n - ? position_full.n_end - nidx - : limit.n); - for (int hidx = position_full.h_start; hidx < position_full.h_end; - hidx += limit.h) { - position_seg.h_start = hidx; - position_seg.h_end = - position_seg.h_start + (position_full.h_end - hidx < limit.h - ? position_full.h_end - hidx - : limit.h); - for (int widx = position_full.w_start; widx < position_full.w_end; - widx += limit.w) { - position_seg.w_start = widx; - position_seg.w_end = - position_seg.w_start + (position_full.w_end - widx < limit.w - ? position_full.w_end - widx - : limit.w); - - // record the segment of output except the size of channel - // channel segments of output and input are the same - Shape shape_seg; - shape_seg.n = position_seg.n_end - position_seg.n_start; - shape_seg.h = position_seg.h_end - position_seg.h_start; - shape_seg.w = position_seg.w_end - position_seg.w_start; - shape_seg.c = output_full.c; - - switch (psa_type) { - case COLLECT: { - if (is_forward) { - psamaskCollectForward(input_dram, output_dram, position_seg, - input_full, output_full, shape_seg, h_mask, - w_mask, half_h_mask, half_w_mask); - } else { - psamaskCollectBackward(input_dram, output_dram, position_seg, - input_full, output_full, shape_seg, h_mask, - w_mask, half_h_mask, half_w_mask); - } - } break; - case DISTRIBUTE: { - if (is_forward) { - psamaskDistributeForward(input_dram, output_dram, position_seg, - input_full, output_full, shape_seg, - h_mask, w_mask, half_h_mask, - half_w_mask); - } else { - psamaskDistributeBackward(input_dram, output_dram, position_seg, - input_full, output_full, shape_seg, - h_mask, w_mask, half_h_mask, - half_w_mask); - } - } break; - } - } - } - } -} - -template -__mlu_global__ void 
MLUUnion1KernelPsamaskForward( - const T *x, T *y, const PsamaskType psa_type, - const DimPartitionType core_partition, - const DimPartitionType cluster_partition, const int batch, - const int h_feature, const int w_feature, const int h_mask, - const int w_mask, const int x_c, const int y_c, const int half_h_mask, - const int half_w_mask, const int n_per_core, const int h_per_core, - const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, - const int limit_h_seg, const int limit_w_seg) { - if (coreId == 0x80) { - return; - } - Shape x_full, y_full; - x_full.n = batch; - x_full.h = h_feature; - x_full.w = w_feature; - x_full.c = x_c; - y_full.n = batch; - y_full.h = h_feature; - y_full.w = w_feature; - y_full.c = y_c; - - LimitParam limit; - limit.n = limit_n_seg; - limit.h = limit_h_seg; - limit.w = limit_w_seg; - - psamaskBase(x, y, x_full, y_full, limit, psa_type, core_partition, - cluster_partition, true, h_mask, w_mask, half_h_mask, half_w_mask, - n_per_core, h_per_core, n_per_cluster, h_per_cluster); -} - -template -__mlu_global__ void MLUUnion1KernelPsamaskBackward( - const T *dy, T *dx, const PsamaskType psa_type, - const DimPartitionType core_partition, - const DimPartitionType cluster_partition, const int batch, - const int h_feature, const int w_feature, const int h_mask, - const int w_mask, const int dx_c, const int dy_c, const int half_h_mask, - const int half_w_mask, const int n_per_core, const int h_per_core, - const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, - const int limit_h_seg, const int limit_w_seg) { - if (coreId == 0x80) { - return; - } - Shape dy_full, dx_full; - dx_full.n = batch; - dx_full.h = h_feature; - dx_full.w = w_feature; - dx_full.c = dx_c; - dy_full.n = batch; - dy_full.h = h_feature; - dy_full.w = w_feature; - dy_full.c = dy_c; - - LimitParam limit; - limit.n = limit_n_seg; - limit.h = limit_h_seg; - limit.w = limit_w_seg; - - psamaskBase(dy, dx, dy_full, dx_full, limit, psa_type, 
core_partition, - cluster_partition, false, h_mask, w_mask, half_h_mask, - half_w_mask, n_per_core, h_per_core, n_per_cluster, - h_per_cluster); -} - -void KernelPsamaskForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *x, void *y, const PsamaskType psa_type, - const DimPartitionType core_partition, - const DimPartitionType cluster_partition, const int batch, - const int h_feature, const int w_feature, const int h_mask, - const int w_mask, const int x_c, const int y_c, const int half_h_mask, - const int half_w_mask, const int n_per_core, const int h_per_core, - const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, - const int limit_h_seg, const int limit_w_seg) { - MLUUnion1KernelPsamaskForward<<>>( - static_cast(x), static_cast(y), psa_type, - core_partition, cluster_partition, batch, h_feature, w_feature, h_mask, - w_mask, x_c, y_c, half_h_mask, half_w_mask, n_per_core, h_per_core, - n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg); -} - -void KernelPsamaskBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *dy, void *dx, const PsamaskType psa_type, - const DimPartitionType core_partition, - const DimPartitionType cluster_partition, const int batch, - const int h_feature, const int w_feature, const int h_mask, - const int w_mask, const int dx_c, const int dy_c, const int half_h_mask, - const int half_w_mask, const int n_per_core, const int h_per_core, - const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, - const int limit_h_seg, const int limit_w_seg) { - MLUUnion1KernelPsamaskBackward<<>>( - static_cast(dy), static_cast(dx), psa_type, - core_partition, cluster_partition, batch, h_feature, w_feature, h_mask, - w_mask, dx_c, dy_c, half_h_mask, half_w_mask, n_per_core, h_per_core, - n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg); -} diff --git a/mmcv/ops/csrc/common/mlu/psamask_utils.hpp 
b/mmcv/ops/csrc/common/mlu/psamask_utils.hpp deleted file mode 100644 index 30ec388..0000000 --- a/mmcv/ops/csrc/common/mlu/psamask_utils.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef PSAMASK_UTILS_HPP_ -#define PSAMASK_UTILS_HPP_ - -typedef enum { - COLLECT = 0, - DISTRIBUTE = 1, -} PsamaskType; - -typedef enum { - PARTITION_N = 0, - PARTITION_H = 1, -} DimPartitionType; - -struct PartitionSeg { - int h_per_cluster; - int n_per_cluster; - int h_per_core; - int n_per_core; - DimPartitionType cluster_partition; - DimPartitionType core_partition; -}; - -struct Shape { - int n; - int h; - int w; - int c; -}; - -struct LimitParam { - int n; - int h; - int w; -}; - -struct PositionInCore { - int n_start; - int n_end; - int h_start; - int h_end; - int w_start; - int w_end; -}; -#endif // PSAMASK_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu deleted file mode 100644 index c99176a..0000000 --- a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu +++ /dev/null @@ -1,493 +0,0 @@ -/************************************************************************* - * Copyright (C) 2021 Cambricon. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "common_mlu_helper.hpp" - -#define ROI_OFFSET 5 - -__nram__ char buffer[MAX_NRAM_SIZE]; - -namespace forward { -template -__mlu_func__ void bilinearInterpolate(const int input_height, - const int input_width, T y, T x, T *w1, - T *w2, T *w3, T *w4, int *x_low, - int *x_high, int *y_low, int *y_high, - bool *empty) { - // deal with cases that inverse elements are of feature map boundary - if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { - *empty = true; - return; - } - - if (y <= 0) y = 0; - if (x <= 0) x = 0; - - int y_low_ = int(y); - int x_low_ = int(x); - - if (y_low_ >= input_height - 1) { - *y_high = y_low_ = input_height - 1; - y = (T)y_low_; - } else { - *y_high = y_low_ + 1; - } - - if (x_low_ >= input_width - 1) { - *x_high = x_low_ = input_width - 1; - x = T(x_low_); - } else { - *x_high = x_low_ + 1; - } - - *y_low = y_low_; - *x_low = x_low_; - - T ly = y - y_low_; - T lx = x - x_low_; - T hy = 1.0 - ly; - T hx = 1.0 - lx; - *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; - return; -} - -template -__mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core, - T *nram_out, const int roi_bin_grid_h, - const int roi_bin_grid_w, const T roi_start_h, - const T roi_start_w, const int ph, - const int pw, const T bin_size_h, - const T bin_size_w, const float count, - const int input_height, const int input_width, - const int channels, const int 
cyc_num, - const int max_elements) { - int cyc_channel = max_elements; - - for (int i = 0; i < cyc_num; i++) { - int real_channel = - (i == cyc_num - 1) ? channels - i * cyc_channel : cyc_channel; - int align_channel = PAD_UP(real_channel, NFU_ALIGN_SIZE / sizeof(T)); - __bang_write_zero(nram_out, align_channel); - uint32_t real_size = real_channel * sizeof(T); - - int iy, ix; - for (iy = 0; iy < roi_bin_grid_h; iy++) { - // 1. compute the coordinates of the y axis in the current roi_bin_grid_h - T y = roi_start_h + ph * bin_size_h + - (T)(iy + 0.5) * bin_size_h / (T)(roi_bin_grid_h); - for (ix = 0; ix < roi_bin_grid_w; ix++) { - // 2. compute the coordinates of the x axis in the current - // roi_bin_grid_w - T x = roi_start_w + pw * bin_size_w + - (T)(ix + 0.5) * bin_size_w / (T)(roi_bin_grid_w); - - // 3. compute the four weights (w1, w2, w3 and w4), the height (y_low - // and y_high) and weight (x_low and x_high) of input feature map in - // the current roi bin grid, and the flag (empty) which shows if x, y - // are out of input feature map ranges - T w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - bool empty = false; - - bilinearInterpolate(input_height, input_width, y, x, &w1, &w2, &w3, &w4, - &x_low, &x_high, &y_low, &y_high, &empty); - - // 4. compute interpolation of the current roi bin grid - // tmp_cyc1, temp_cyc2, tmp_cyc3 and tmp_cyc4 store the input values - // to compute the interpolation, and then reused to compute - // the argmax_x and argmax_y. 
- T *tmp_cyc1 = nram_in + cyc_channel; - T *tmp_cyc2 = nram_in + cyc_channel * 2; - T *tmp_cyc3 = nram_in + cyc_channel * 3; - T *tmp_cyc4 = nram_in + cyc_channel * 4; - - if (empty) { // exits abnormal values - __bang_write_zero(nram_in, align_channel); - } else { - __bang_write_zero(nram_in, align_channel); - uint32_t offset1 = (y_low * input_width + x_low) * channels; - uint32_t offset2 = (y_low * input_width + x_high) * channels; - uint32_t offset3 = (y_high * input_width + x_low) * channels; - uint32_t offset4 = (y_high * input_width + x_high) * channels; - T *input1 = (T *)input_core + offset1 + i * cyc_channel; - T *input2 = (T *)input_core + offset2 + i * cyc_channel; - T *input3 = (T *)input_core + offset3 + i * cyc_channel; - T *input4 = (T *)input_core + offset4 + i * cyc_channel; - - // load the four pixels (p1, p2, p3 and p4) of input feature map to - // compute interpolation - __memcpy(tmp_cyc1, input1, real_size, GDRAM2NRAM); - __memcpy(tmp_cyc2, input2, real_size, GDRAM2NRAM); - __memcpy(tmp_cyc3, input3, real_size, GDRAM2NRAM); - __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM); - - // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4 - __bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel); - __bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel); - __bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel); - __bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel); - - __bang_add(nram_in, tmp_cyc1, nram_in, align_channel); - __bang_add(nram_in, tmp_cyc2, nram_in, align_channel); - __bang_add(nram_in, tmp_cyc3, nram_in, align_channel); - __bang_add(nram_in, tmp_cyc4, nram_in, align_channel); - } - // 5. compute sum value and corresponding coordinates of x axis and y - // axis. Update the sum value. 
- __bang_add(nram_out, nram_in, nram_out, align_channel); - } // loop_roi_grid_w - } // loop_roi_grid_h - T count_value = (T)(1.0 / count); - __bang_mul_scalar(nram_out, nram_out, count_value, align_channel); - __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM); - } // loop_cyc_num -} - -template -__mlu_func__ void roialignForwardAvg( - T *input, T *rois, T *output, const bool aligned, const int channels, - const int pooled_height, const int pooled_width, const int input_height, - const int input_width, const int sampling_ratio, const T spatial_scale, - const int num_rois) { - // find limit for channel, the nram space is divided to 6 parts that are - // input, 4 weights to compute the interpolation (w1, w2, w3, w4), output - - // max_elements : 300 : float datatype : 27296, half datatype : 54592 - // max_elements : 200 : float datatype : 16384, half datatype : 32768 - int max_elements = (PAD_DOWN(MAX_NRAM_SIZE / 6, NFU_ALIGN_SIZE)) / sizeof(T); - int cyc_num = channels / max_elements + (int)(channels % max_elements != 0); - T offset = aligned ? (T)0.5 : (T)0.0; - int task_num = num_rois * pooled_height * pooled_width; - T *nram_out = (T *)buffer; - T *nram_in = nram_out + max_elements; - if (task_num < taskDim) { - if (taskId >= task_num) { - return; - } - } - - for (int bin_idx = taskId; bin_idx < task_num; bin_idx = bin_idx + taskDim) { - if (bin_idx >= task_num) { - return; - } - - // (n,ph.pw) is a c in the pooled output - int pw = bin_idx % pooled_width; - int ph = (bin_idx / pooled_width) % pooled_height; - int n = bin_idx / pooled_width / pooled_height; - - T *roi_id_tmp = rois + n * ROI_OFFSET; - // 1. compute width and height of roi region. 
- int batch_idx = (int)roi_id_tmp[0]; - T roi_x1 = roi_id_tmp[1]; - T roi_y1 = roi_id_tmp[2]; - T roi_x2 = roi_id_tmp[3]; - T roi_y2 = roi_id_tmp[4]; - T roi_start_w = roi_x1 * spatial_scale - offset; - T roi_start_h = roi_y1 * spatial_scale - offset; - T roi_end_w = roi_x2 * spatial_scale - offset; - T roi_end_h = roi_y2 * spatial_scale - offset; - T roi_width = roi_end_w - roi_start_w; - T roi_height = roi_end_h - roi_start_h; - - if (!aligned) { - roi_width = roi_width > (T)(1.0) ? roi_width : (T)(1.0); - roi_height = roi_height > (T)(1.0) ? roi_height : (T)(1.0); - } - - // 2. compute float-type width and height of roi bin region. - T bin_size_w = (T)roi_width / (T)pooled_width; - T bin_size_h = (T)roi_height / (T)pooled_height; - - // 3. compute int-type width and height of roi bin region. - int roi_bin_grid_h, roi_bin_grid_w; - roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : int(ceilf(roi_height / pooled_height)); - roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : int(ceilf(roi_width / pooled_width)); - float count = (float)((roi_bin_grid_h * roi_bin_grid_w) > 1 - ? roi_bin_grid_h * roi_bin_grid_w - : 1.0); - T *input_core = input + batch_idx * channels * input_width * input_height; - T *output_core = output + bin_idx * channels; - // 4. compute avg value and corresponding coordinates of x axis and y axis. 
- computeChannel(input_core, nram_in, output_core, nram_out, roi_bin_grid_h, - roi_bin_grid_w, roi_start_h, roi_start_w, ph, pw, bin_size_h, - bin_size_w, count, input_height, input_width, channels, - cyc_num, max_elements); - } -} - -__mlu_global__ void MLUUnion1KernelRoiAlignAvg( - const void *input, const void *rois, const int channels, const bool aligned, - const int pooled_height, const int pooled_width, const int input_height, - const int input_width, const int sampling_ratio, const float spatial_scale, - const int num_rois, const cnrtDataType_t data_type, void *output) { - // make sure that memcore is not used - if (coreId == 0x80) { - return; - } - - switch (data_type) { - case CNRT_FLOAT16: { - roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned, - channels, pooled_height, pooled_width, input_height, - input_width, sampling_ratio, (half)spatial_scale, - num_rois); - }; break; - case CNRT_FLOAT32: { - roialignForwardAvg((float *)input, (float *)rois, (float *)output, - aligned, channels, pooled_height, pooled_width, - input_height, input_width, sampling_ratio, - (float)spatial_scale, num_rois); - }; break; - default: - break; - } - - return; -} -} // namespace forward - -namespace backward { -__mlu_func__ void bilinearInterpolateGradient(int height, int width, float y, - float x, float *w1, float *w2, - float *w3, float *w4, int *x_low, - int *x_high, int *y_low, - int *y_high) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - *w1 = 0.0, *w2 = 0.0, *w3 = 0.0, *w4 = 0.0; - *x_low = -1, *x_high = -1, *y_low = -1, *y_high = -1; - return; - } - if (y <= 0) { - y = 0; - } - if (x <= 0) { - x = 0; - } - *y_low = (int)y; - *x_low = (int)x; - if (*y_low >= height - 1) { - *y_high = height - 1, *y_low = height - 1; - y = (float)(*y_low); - } else { - *y_high = *y_low + 1; - } - if (*x_low >= width - 1) { - *x_high = width - 1, *x_low = width - 1; - x = (float)(*x_low); - } else { - *x_high = *x_low + 1; - } - float ly = y - *y_low, lx 
= x - *x_low; - float hy = 1.0 - ly, hx = 1.0 - lx; - *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; - return; -} - -template -__mlu_func__ void unionRoiAlignBp( - T *grads, T *boxes, T *grads_image, const int boxes_num, const int hi, - const int wi, const int c, const int no, const int ho, const int wo, - const float spatial_scale, const int sampling_ratio, const bool aligned) { - int c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T)); - int deal_all = boxes_num * hi * wi; - int deal_this_core = deal_all / taskDim + (int)(taskId < deal_all % taskDim); - for (int i = 0; i < deal_this_core; ++i) { - int bhw_id = i * taskDim + taskId; - int box_id = bhw_id / (hi * wi); - int ih = (bhw_id / wi) % hi; - int iw = bhw_id % wi; - T *box = boxes + box_id * 5; - int image_id = (int)box[0]; - T *image_offset = grads_image + image_id * ho * wo * c; - T *grads_ = grads + box_id * hi * wi * c + ih * wi * c + iw * c; - - float offset = aligned ? 0.5 : 0.0; - float x1 = box[1] * spatial_scale - offset; - float y1 = box[2] * spatial_scale - offset; - float x2 = box[3] * spatial_scale - offset; - float y2 = box[4] * spatial_scale - offset; - float roi_width = x2 - x1; - float roi_height = y2 - y1; - if (!aligned) { - roi_width = (roi_width > 1.0) ? roi_width : 1.0; - roi_height = (roi_height > 1.0) ? roi_height : 1.0; - } - float bin_size_h = roi_height / hi; - float bin_size_w = roi_width / wi; - - int roi_grid_h = - (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_height / hi); - int roi_grid_w = - (sampling_ratio > 0) ? 
sampling_ratio : std::ceil(roi_width / wi); - const T count = roi_grid_h * roi_grid_w; - if (c_align * sizeof(T) * 2 <= MAX_NRAM_SIZE) { - for (int iy = 0; iy < roi_grid_h; ++iy) { - const float y = - y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h; - for (int ix = 0; ix < roi_grid_w; ++ix) { - const float x = - x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w; - float w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high); - if (x_low >= 0 && y_low >= 0) { - __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1, - c_align); - __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); - __bang_atomic_add((T *)buffer + c_align, - image_offset + y_low * wo * c + x_low * c, - (T *)buffer + c_align, c); - __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2, - c_align); - __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); - __bang_atomic_add((T *)buffer + c_align, - image_offset + y_low * wo * c + x_high * c, - (T *)buffer + c_align, c); - __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3, - c_align); - __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); - __bang_atomic_add((T *)buffer + c_align, - image_offset + y_high * wo * c + x_low * c, - (T *)buffer + c_align, c); - __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4, - c_align); - __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); - __bang_atomic_add((T *)buffer + c_align, - image_offset + y_high * wo * c + x_high * c, - (T *)buffer + c_align, c); - } // x_low && y_low - } // ix - } // iy - } else { - for (int iy = 0; iy < roi_grid_h; ++iy) { - const float y = - y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h; - for (int ix = 0; ix < roi_grid_w; ++ix) { - const 
float x = - x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w; - float w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high); - if (x_low >= 0 && y_low >= 0) { - int deal_once = - PAD_DOWN(MAX_NRAM_SIZE / 2, NFU_ALIGN_SIZE) / sizeof(T); - int c_repeat = c / deal_once + (int)(c % deal_once != 0); - for (int i = 0; i < c_repeat; ++i) { - int deal_c = deal_once; - int align_c = deal_once; - if (i == c_repeat - 1) { - deal_c = c - i * deal_once; - align_c = c_align - i * deal_once; - } - __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T), - GDRAM2NRAM); - __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1, - align_c); - __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); - __bang_atomic_add( - (T *)buffer + align_c, - image_offset + y_low * wo * c + x_low * c + i * deal_once, - (T *)buffer + align_c, deal_c); - __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2, - align_c); - __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); - __bang_atomic_add( - (T *)buffer + align_c, - image_offset + y_low * wo * c + x_high * c + i * deal_once, - (T *)buffer + align_c, deal_c); - __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3, - align_c); - __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); - __bang_atomic_add( - (T *)buffer + align_c, - image_offset + y_high * wo * c + x_low * c + i * deal_once, - (T *)buffer + align_c, deal_c); - __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4, - align_c); - __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); - __bang_atomic_add( - (T *)buffer + align_c, - image_offset + y_high * wo * c + x_high * c + i * deal_once, - (T *)buffer + align_c, deal_c); - } // for c_repeat - } // x_low >= 0 && y_low >= 0 - } // ix - } // iy - } // if c - } // i -} - 
-__mlu_global__ void MLUUnion1KernelRoiAlignBackward( - const void *grads, const void *boxes, void *grads_image, - const cnrtDataType_t dtype, const int boxes_num, const int hi, const int wi, - const int c, const int no, const int ho, const int wo, - const float spatial_scale, const int sampling_ratio, const bool aligned) { - // make sure that memcore is not used - if (coreId == 0x80) { - return; - } - switch (dtype) { - case CNRT_FLOAT16: { - unionRoiAlignBp((half *)grads, (half *)boxes, (half *)grads_image, - boxes_num, hi, wi, c, no, ho, wo, spatial_scale, - sampling_ratio, aligned); - }; break; - case CNRT_FLOAT32: { - unionRoiAlignBp((float *)grads, (float *)boxes, (float *)grads_image, - boxes_num, hi, wi, c, no, ho, wo, spatial_scale, - sampling_ratio, aligned); - }; break; - default: { return; } - } -} -} // namespace backward - -void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, const cnrtDataType_t d_type, - const void *input, const void *rois, const int channels, - const bool aligned, const int pooled_height, - const int pooled_width, const int input_height, - const int input_width, const int sampling_ratio, - const float spatial_scale, const int num_rois, - void *output) { - forward::MLUUnion1KernelRoiAlignAvg<<>>( - input, rois, channels, aligned, pooled_height, pooled_width, input_height, - input_width, sampling_ratio, spatial_scale, num_rois, d_type, output); -} - -void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, const cnrtDataType_t dtype, - const void *grads, const void *boxes, - void *grads_image, const int boxes_num, - const int hi, const int wi, const int c, - const int no, const int ho, const int wo, - const float spatial_scale, const int sampling_ratio, - const bool aligned) { - backward::MLUUnion1KernelRoiAlignBackward<<>>( - grads, boxes, grads_image, dtype, boxes_num, hi, wi, c, no, ho, wo, - spatial_scale, sampling_ratio, aligned); -} diff --git 
a/mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu deleted file mode 100644 index 9356776..0000000 --- a/mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu +++ /dev/null @@ -1,490 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * OR IMPLIED, INCLUDING BUvoid NOKType LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENvoid SHALL THE AUTHORS OR COPYRIGHKType HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORvoid OR OTHERWISE, ARISING FROM, OUKType OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "common_mlu_helper.hpp" -#include "roi_align_rotated_utils.hpp" - -#define ROI_OFFSET 6 -#define SAMPLING_NUM 4 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template -__mlu_func__ void swap(T &a, T &b) { - T tmp = a; - a = b; - b = tmp; -} - -template -__mlu_func__ void bilinearInterpolate(const int input_height, - const int input_width, T x, T y, T *w1, - T *w2, T *w3, T *w4, int *x_low, - int *x_high, int *y_low, int *y_high, - bool *empty) { - // deal with case that the point is out of feature map boundary - if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { - *empty = true; - return; - } - - if (y <= 0) y = (T)0; - if (x <= 0) x = (T)0; - - *y_low = int(y); - *x_low = int(x); - - if (*y_low >= input_height - 1) { - *y_high = *y_low = input_height - 1; - y = (T)(*y_low); - } else { - *y_high = *y_low + 1; - } - - if (*x_low >= input_width - 1) { - *x_high = *x_low = input_width - 1; - x = T(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low; - T lx = x - *x_low; - T hy = 1.0 - ly; - T hx = 1.0 - lx; - *w1 = hy * hx; - *w2 = hy * lx; - *w3 = ly * 
hx; - *w4 = ly * lx; - return; -} - -template -__mlu_func__ void getRoiBinInfo(const T *rois_dram, const int bin_i, - const RoiAlignRotatedParams ¶ms, - int *batch_idx, int *roi_n, int *pw, int *ph, - T *roi_center_x, T *roi_center_y, T *roi_width, - T *roi_height, T *theta) { - T offset = params.aligned ? (T)0.5 : (T)0.0; - *pw = bin_i % params.pooled_width; - *ph = (bin_i / params.pooled_width) % params.pooled_height; - *roi_n = bin_i / params.pooled_width / params.pooled_height; - const T *roi_info = rois_dram + (*roi_n) * ROI_OFFSET; - *batch_idx = (int)roi_info[0]; - *roi_center_x = roi_info[1] * (T)params.spatial_scale - offset; - *roi_center_y = roi_info[2] * (T)params.spatial_scale - offset; - *roi_width = roi_info[3] * (T)params.spatial_scale; - *roi_height = roi_info[4] * (T)params.spatial_scale; - *theta = roi_info[5]; - if (params.clockwise) { - *theta = -(*theta); - } - if (!params.aligned) { - *roi_width = *roi_width > (T)1.0 ? *roi_width : (T)1.0; - *roi_height = *roi_height > (T)1.0 ? *roi_height : (T)1.0; - } -} - -template -__mlu_func__ void roiAlignRotatedForward(const T *input_dram, - const T *rois_dram, const int batch, - const int height, const int width, - const int channel, const int rois_num, - const RoiAlignRotatedParams ¶ms, - T *output_dram) { - int align_base_128 = NFU_ALIGN_SIZE / sizeof(T); - int channel_max_cap = MAX_NRAM_SIZE / sizeof(T) / (2 * SAMPLING_NUM + 1); - channel_max_cap = channel_max_cap / align_base_128 * align_base_128; - int channel_align = channel < channel_max_cap ? 
channel : channel_max_cap; - channel_align = CEIL_ALIGN(channel_align, align_base_128); - - T *nram_out = (T *)nram_buffer; - T *nram_ping = nram_out + channel_align; - T *nram_pong = nram_ping + channel_align * SAMPLING_NUM; - - int bin_first = taskId; - int bin_end = rois_num * params.pooled_height * params.pooled_width; - - for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) { - T roi_center_x, roi_center_y, roi_width, roi_height, theta; - int batch_idx, roi_n, pw, ph; - getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph, - &roi_center_x, &roi_center_y, &roi_width, &roi_height, - &theta); - T bin_size_h = roi_height / params.pooled_height; - T bin_size_w = roi_width / params.pooled_width; - - int roi_bin_grid_h = - (params.sample_ratio > 0) - ? params.sample_ratio - : __float2int_up((float)roi_height / params.pooled_height); - int roi_bin_grid_w = - (params.sample_ratio > 0) - ? params.sample_ratio - : __float2int_up((float)roi_width / params.pooled_width); - T roi_start_y = -roi_height / 2; - T roi_start_x = -roi_width / 2; - const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1 - ? roi_bin_grid_h * roi_bin_grid_w - : 1; - T cos_theta = std::cos(theta); - T sin_theta = std::sin(theta); - T zero_sign = 1.0f / bin_dim; - - bool is_first_sample = true; - int src_offset = 0; - int dst_offset = 0; - int c_rem, c_slice, c_slice_align, pongc_slice, pongc_slice_align; - for (int c_offset = 0; c_offset < channel; c_offset += channel_align) { - __bang_write_value(nram_out, channel_align, (T)0); - c_rem = channel - c_offset; - c_slice = channel_align > c_rem ? 
c_rem : channel_align; - c_slice_align = CEIL_ALIGN(c_slice, align_base_128); - is_first_sample = true; - for (int iy = 0; iy < roi_bin_grid_h; ++iy) { - const T yy = roi_start_y + ph * bin_size_h + - T(iy + 0.5) * bin_size_h / roi_bin_grid_h; - for (int ix = 0; ix < roi_bin_grid_w; ++ix) { - const T xx = roi_start_x + pw * bin_size_w + - T(ix + 0.5) * bin_size_w / roi_bin_grid_w; - int sample_i = iy * roi_bin_grid_w + ix; - - T y = yy * cos_theta - xx * sin_theta + roi_center_y; - T x = yy * sin_theta + xx * cos_theta + roi_center_x; - T w1, w2, w3, w4; - bool empty = false; - int x_low, x_high, y_low, y_high; - bilinearInterpolate(height, width, x, y, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high, &empty); - /******************************************************* - | ping | pong | - |------|-----|-----|-----|-----|-----|-----|-----|-----| - |output| p1 | p2 | p3 | p4 | p1 | p2 | p3 | p4 | - |------|-----|-----|-----|-----|-----|-----|-----|-----| - ********************************************************/ - if (is_first_sample && !empty) { - // load input data from dram to nram - __bang_write_value(nram_ping, SAMPLING_NUM * c_slice_align, (T)0); - src_offset = - (batch_idx * height * width + y_low * width + x_low) * channel + - c_offset; - dst_offset = 0; - __memcpy(nram_ping + dst_offset, input_dram + src_offset, - c_slice * sizeof(T), GDRAM2NRAM); - src_offset = (batch_idx * height * width + y_low * width + x_high) * - channel + - c_offset; - dst_offset = c_slice_align; - __memcpy(nram_ping + dst_offset, input_dram + src_offset, - c_slice * sizeof(T), GDRAM2NRAM); - src_offset = (batch_idx * height * width + y_high * width + x_low) * - channel + - c_offset; - dst_offset = c_slice_align * 2; - __memcpy(nram_ping + dst_offset, input_dram + src_offset, - c_slice * sizeof(T), GDRAM2NRAM); - src_offset = - (batch_idx * height * width + y_high * width + x_high) * - channel + - c_offset; - dst_offset = c_slice_align * 3; - __memcpy(nram_ping + dst_offset, 
input_dram + src_offset, - c_slice * sizeof(T), GDRAM2NRAM); - } - // load next input data to nram - if (sample_i + 1 < bin_dim) { - int p_iy = (sample_i + 1) / roi_bin_grid_w; - int p_ix = (sample_i + 1) % roi_bin_grid_w; - const T p_yy = roi_start_y + ph * bin_size_h + - T(p_iy + 0.5) * bin_size_h / roi_bin_grid_h; - const T p_xx = roi_start_x + pw * bin_size_w + - T(p_ix + 0.5) * bin_size_w / roi_bin_grid_w; - T p_y = p_yy * cos_theta - p_xx * sin_theta + roi_center_y; - T p_x = p_yy * sin_theta + p_xx * cos_theta + roi_center_x; - T p_w1, p_w2, p_w3, p_w4; - bool p_empty = false; - int p_x_low, p_x_high, p_y_low, p_y_high; - bilinearInterpolate(height, width, p_x, p_y, &p_w1, &p_w2, &p_w3, - &p_w4, &p_x_low, &p_x_high, &p_y_low, &p_y_high, - &p_empty); - pongc_slice = c_slice; - pongc_slice_align = c_slice_align; - if (!p_empty) { - __bang_write_value(nram_pong, SAMPLING_NUM * pongc_slice_align, - (T)0); - src_offset = - (batch_idx * height * width + p_y_low * width + p_x_low) * - channel + - c_offset; - dst_offset = 0; - __memcpy(nram_pong + dst_offset, input_dram + src_offset, - c_slice * sizeof(T), GDRAM2NRAM); - src_offset = - (batch_idx * height * width + p_y_low * width + p_x_high) * - channel + - c_offset; - dst_offset = pongc_slice_align; - __memcpy(nram_pong + dst_offset, input_dram + src_offset, - c_slice * sizeof(T), GDRAM2NRAM); - src_offset = - (batch_idx * height * width + p_y_high * width + p_x_low) * - channel + - c_offset; - dst_offset = pongc_slice_align * 2; - __memcpy(nram_pong + dst_offset, input_dram + src_offset, - c_slice * sizeof(T), GDRAM2NRAM); - src_offset = - (batch_idx * height * width + p_y_high * width + p_x_high) * - channel + - c_offset; - dst_offset = pongc_slice_align * 3; - __memcpy(nram_pong + dst_offset, input_dram + src_offset, - c_slice * sizeof(T), GDRAM2NRAM); - } - } - T *tmp_sum = nram_ping + 3 * c_slice_align; - if (empty) { - __bang_write_value(tmp_sum, c_slice_align, T(0)); - } else { - 
__bang_mul_scalar(nram_ping, nram_ping, w1, c_slice_align); - __bang_mul_scalar(nram_ping + c_slice_align, - nram_ping + c_slice_align, w2, c_slice_align); - __bang_mul_scalar(nram_ping + 2 * c_slice_align, - nram_ping + 2 * c_slice_align, w3, c_slice_align); - __bang_mul_scalar(nram_ping + 3 * c_slice_align, - nram_ping + 3 * c_slice_align, w4, c_slice_align); - __bang_sumpool(tmp_sum, nram_ping, c_slice_align, 1, SAMPLING_NUM, - 1, SAMPLING_NUM, 1, 1); - } - __bang_add(nram_out, nram_out, tmp_sum, c_slice_align); - swap(nram_ping, nram_pong); - __asm__ volatile("sync;"); - is_first_sample = false; - } - } - __bang_mul_scalar(nram_out, nram_out, zero_sign, c_slice_align); - // store the result to dram - int output_offset = - ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) * - channel + - c_offset; - __memcpy(output_dram + output_offset, nram_out, c_slice * sizeof(T), - NRAM2GDRAM); - } - } -} - -template -__mlu_func__ void roiAlignRotatedBackward(const T *top_grad_dram, - const T *rois_dram, const int batch, - const int height, const int width, - const int channel, const int rois_num, - const RoiAlignRotatedParams ¶ms, - T *bottom_grad_dram) { - int align_base_128 = NFU_ALIGN_SIZE / sizeof(T); - int channel_align = CEIL_ALIGN(channel, align_base_128); - - unsigned int max_element = MAX_NRAM_SIZE / sizeof(T); - int c_limit = max_element >> 2; - c_limit = c_limit > channel_align ? 
channel_align : c_limit; - - T *nram_ping = (T *)nram_buffer; - T *nram_pong = nram_ping + 2 * c_limit; - T *nram_output = nullptr; - - int bin_first = taskId; - int bin_end = rois_num * params.pooled_height * params.pooled_width; - bool is_first_bin = true; - T roi_center_x, roi_center_y, roi_width, roi_height, theta; - int batch_idx, roi_n, pw, ph; - T pong_roi_center_x, pong_roi_center_y, pong_roi_width, pong_roi_height, - pong_theta; - int pong_batch_idx, pong_roi_n, pong_pw, pong_ph; - for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) { - getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph, - &roi_center_x, &roi_center_y, &roi_width, &roi_height, - &theta); - T bin_size_h = roi_height / params.pooled_height; - T bin_size_w = roi_width / params.pooled_width; - - int roi_bin_grid_h = - (params.sample_ratio > 0) - ? params.sample_ratio - : __float2int_up((float)roi_height / params.pooled_height); - int roi_bin_grid_w = - (params.sample_ratio > 0) - ? params.sample_ratio - : __float2int_up((float)roi_width / params.pooled_width); - T roi_start_y = -roi_height / 2; - T roi_start_x = -roi_width / 2; - const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1 - ? roi_bin_grid_h * roi_bin_grid_w - : 1; - T cos_theta = std::cos(theta); - T sin_theta = std::sin(theta); - T zero_sign = 1.0f / bin_dim; - int c_rem, c_slice, pongc_slice, c_offset; - c_rem = channel; - c_offset = 0; - /**************************************** - | ping | pong | - |---------|---------|---------|---------| - | input | output | input | output | - |---------|---------|---------|---------| - *****************************************/ - if (is_first_bin) { - // load the first top_grad to nram - c_slice = c_limit < c_rem ? 
c_limit : c_rem; - int top_grad_offset = - ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) * - channel; - __memcpy(nram_ping, top_grad_dram + top_grad_offset, c_slice * sizeof(T), - GDRAM2NRAM); - } - nram_output = nram_ping + c_limit; - while (c_rem > 0) { - c_slice = c_slice < c_rem ? c_slice : c_rem; - // load the next top_grad to nram - if (c_rem - c_slice > 0) { - // load the rest channels to nram - pongc_slice = (c_rem - c_slice > c_slice) ? c_slice : c_rem - c_slice; - int top_grad_offset = - ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) * - channel + - c_offset + c_slice; - __memcpy_async(nram_pong, top_grad_dram + top_grad_offset, - pongc_slice * sizeof(T), GDRAM2NRAM); - } else if (bin_i + taskDim < bin_end) { - // load next bin's data to nram - getRoiBinInfo(rois_dram, bin_i + taskDim, params, &pong_batch_idx, - &pong_roi_n, &pong_pw, &pong_ph, &pong_roi_center_x, - &pong_roi_center_y, &pong_roi_width, &pong_roi_height, - &pong_theta); - pongc_slice = c_limit < channel ? 
c_limit : channel; - int top_grad_offset = ((pong_roi_n * params.pooled_height + pong_ph) * - params.pooled_width + - pong_pw) * - channel; - __memcpy_async(nram_pong, top_grad_dram + top_grad_offset, - c_slice * sizeof(T), GDRAM2NRAM); - } - // comput the output in a single bin - - for (int iy = 0; iy < roi_bin_grid_h; ++iy) { - const T yy = roi_start_y + ph * bin_size_h + - T(iy + 0.5) * bin_size_h / roi_bin_grid_h; - for (int ix = 0; ix < roi_bin_grid_w; ++ix) { - const T xx = roi_start_x + pw * bin_size_w + - T(ix + 0.5) * bin_size_w / roi_bin_grid_w; - T y = yy * cos_theta - xx * sin_theta + roi_center_y; - T x = yy * sin_theta + xx * cos_theta + roi_center_x; - T w1, w2, w3, w4; - bool empty = false; - int x_low, x_high, y_low, y_high; - bilinearInterpolate(height, width, x, y, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high, &empty); - if (empty) { - continue; - } else { - __bang_mul_scalar(nram_output, nram_ping, w1 * zero_sign, c_limit); - __bang_atomic_add( - (T *)nram_output, - bottom_grad_dram + batch_idx * height * width * channel + - y_low * width * channel + x_low * channel + c_offset, - (T *)nram_output, c_slice); - __bang_mul_scalar(nram_output, nram_ping, w2 * zero_sign, c_limit); - __bang_atomic_add( - (T *)nram_output, - bottom_grad_dram + batch_idx * height * width * channel + - y_low * width * channel + x_high * channel + c_offset, - (T *)nram_output, c_slice); - __bang_mul_scalar(nram_output, nram_ping, w3 * zero_sign, c_limit); - __bang_atomic_add( - (T *)nram_output, - bottom_grad_dram + batch_idx * height * width * channel + - y_high * width * channel + x_low * channel + c_offset, - (T *)nram_output, c_slice); - __bang_mul_scalar(nram_output, nram_ping, w4 * zero_sign, c_limit); - __bang_atomic_add( - (T *)nram_output, - bottom_grad_dram + batch_idx * height * width * channel + - y_high * width * channel + x_high * channel + c_offset, - (T *)nram_output, c_slice); - } - } - } - swap(nram_ping, nram_pong); - c_rem -= c_slice; - 
c_offset += c_slice; - __asm__ volatile("sync;"); - } - is_first_bin = false; - } -} - -__mlu_global__ void MLUUnion1KernelRoiAlignRotatedForward( - const void *features, const void *rois, void *output, const int batch, - const int height, const int width, const int channel, const int rois_num, - const RoiAlignRotatedParams rroiAlignParams, - const cnrtDataType_t data_type) { - if (0x80 == coreId) { - return; - } - - if (data_type == CNRT_FLOAT32) { - roiAlignRotatedForward((float *)features, (float *)rois, batch, height, - width, channel, rois_num, rroiAlignParams, - (float *)output); - } else { - roiAlignRotatedForward((half *)features, (half *)rois, batch, height, width, - channel, rois_num, rroiAlignParams, (half *)output); - } -} - -__mlu_global__ void MLUUnion1KernelRoiAlignRotatedBackward( - const void *top_grad, const void *rois, void *bottom_grad, const int batch, - const int height, const int width, const int channel, const int rois_num, - const RoiAlignRotatedParams rroiAlignParams, - const cnrtDataType_t data_type) { - if (0x80 == coreId) { - return; - } - - if (data_type == CNRT_FLOAT32) { - roiAlignRotatedBackward((float *)top_grad, (float *)rois, batch, height, - width, channel, rois_num, rroiAlignParams, - (float *)bottom_grad); - } else { - roiAlignRotatedBackward((half *)top_grad, (half *)rois, batch, height, - width, channel, rois_num, rroiAlignParams, - (half *)bottom_grad); - } -} - -void KernelRoiAlignRotatedForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const cnrtDataType_t d_type, const void *features, const void *rois, - void *output, const int batch, const int height, const int width, - const int channel, const int rois_num, - const RoiAlignRotatedParams roiAlignRotatedParams) { - MLUUnion1KernelRoiAlignRotatedForward<<>>( - features, rois, output, batch, height, width, channel, rois_num, - roiAlignRotatedParams, d_type); -} - -void KernelRoiAlignRotatedBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, 
cnrtQueue_t queue, - const cnrtDataType_t d_type, const void *top_grad, const void *rois, - void *bottom_grad, const int batch, const int height, const int width, - const int channel, const int rois_num, - const RoiAlignRotatedParams roiAlignRotatedParams) { - MLUUnion1KernelRoiAlignRotatedBackward<<>>( - top_grad, rois, bottom_grad, batch, height, width, channel, rois_num, - roiAlignRotatedParams, d_type); -} diff --git a/mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp b/mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp deleted file mode 100644 index cd0ec02..0000000 --- a/mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp +++ /dev/null @@ -1,24 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef ROI_ALIGN_ROTATED_UTILS_HPP_ -#define ROI_ALIGN_ROTATED_UTILS_HPP_ - -struct RoiAlignRotatedParams { - int pooled_height; - int pooled_width; - int sample_ratio; - float spatial_scale; - bool aligned; - bool clockwise; -}; - -#endif // ROI_ALIGN_ROTATED_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu deleted file mode 100644 index 3a6d2d3..0000000 --- a/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu +++ /dev/null @@ -1,747 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "common_mlu_helper.hpp" - -#define ALIGN_SIZE 64 -#define PIPELINE_COMMON_NUM 2 -#define PIPELINE_PINGPONG_NUM 10 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -namespace forward { -template -__mlu_func__ void getRoiBinInfo(T *input_v, T *rois_v, int bin_i, int height, - int width, int channels, int p_height, - int p_width, T spatial_scale, int *bin_x1, - int *bin_y1, int *bin_x2, int *bin_y2, - int *bin_wdim, int *bin_hdim, int *bin_dims, - T **input_base, bool *is_empty) { - int pw = bin_i % p_width; - int ph = (bin_i / p_width) % p_height; - int roi_n = bin_i / p_width / p_height; - - /*roi*/ - const T *roi_info = rois_v + roi_n * 5; // {{batch, x1, y1, x2, y2},,,} - int batch_index = (int)roi_info[0]; - int roi_x1 = round(roi_info[1] * spatial_scale); - int roi_y1 = round(roi_info[2] * spatial_scale); - int roi_x2 = round(roi_info[3] * spatial_scale); - int roi_y2 = round(roi_info[4] * spatial_scale); - int roi_w = roi_x2 - roi_x1 + 1 > 1 ? roi_x2 - roi_x1 + 1 : 1; - int roi_h = roi_y2 - roi_y1 + 1 > 1 ? roi_y2 - roi_y1 + 1 : 1; - - /*bin*/ - T bin_w = (T)roi_w / (T)p_width; - T bin_h = (T)roi_h / (T)p_height; - - *bin_x1 = (int)floor((T)pw * bin_w) + roi_x1; - *bin_x1 = *bin_x1 > 0 ? *bin_x1 : 0; - *bin_x1 = *bin_x1 < width ? *bin_x1 : width; - - *bin_y1 = (int)floor((T)ph * bin_h) + roi_y1; - *bin_y1 = *bin_y1 > 0 ? *bin_y1 : 0; - *bin_y1 = *bin_y1 < height ? 
*bin_y1 : height; - - *bin_x2 = (int)ceil((T)(pw + 1) * bin_w) + roi_x1; - *bin_x2 = *bin_x2 > 0 ? *bin_x2 : 0; - *bin_x2 = *bin_x2 < width ? *bin_x2 : width; - - *bin_y2 = (int)ceil((T)(ph + 1) * bin_h) + roi_y1; - *bin_y2 = *bin_y2 > 0 ? *bin_y2 : 0; - *bin_y2 = *bin_y2 < height ? *bin_y2 : height; - - *input_base = input_v + batch_index * height * width * channels; - *bin_wdim = *bin_x2 - *bin_x1; - *bin_hdim = *bin_y2 - *bin_y1; - *bin_dims = (*bin_hdim) * (*bin_wdim); - *is_empty = (*bin_y2 <= *bin_y1) || (*bin_x2 <= *bin_x1); -} - -template -__mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, - int channels, int height, int width, - int p_height, int p_width, int rois_num, - T spatial_scale, T *output_v, int *argmax) { - /* - * NRAM partition - * |---------------------------------------------------| - * | ping | - * |---------------------------------------------------| - * | pong | - * |---------------------------------------------------| - * | out | - * |---------------------------------------------------| - * | argmax | - * |---------------------------------------------------| - * | a | - * |---------------------------------------------------| - * | b | - * |---------------------------------------------------| - */ - uint32_t is_half = sizeof(T) == sizeof(half) ? 
true : false; - uint32_t t_size = sizeof(T); - uint32_t float_div = NFU_ALIGN_SIZE / sizeof(float); - uint32_t half_div = NFU_ALIGN_SIZE / sizeof(half); - - uint32_t channels_align = PAD_UP(channels, float_div); - uint32_t nram_limit = PAD_DOWN( - (MAX_NRAM_SIZE / sizeof(float) - 4 * channels_align) / 2, half_div); - - // nram PING/PONG, output, argamx, a, b - float *nram_ping = (float *)nram_buffer; - float *nram_pong = (float *)nram_buffer + nram_limit; - float *nram_out = (float *)nram_buffer + 2 * nram_limit; - float *nram_argmax = nram_out + channels_align; - float *nram_a = nram_out + 2 * channels_align; - float *nram_b = nram_out + 3 * channels_align; - - uint32_t c_bins_num = rois_num * p_height * p_width; - uint32_t task_bins = c_bins_num / taskDim; - uint32_t rem_bins = c_bins_num % taskDim; - if (taskId < rem_bins) { - task_bins += 1; - } - int bin_first = - (c_bins_num / taskDim) * taskId + (taskId > rem_bins ? rem_bins : taskId); - int bins_loop = bin_first + task_bins; - - T *input_base = NULL; - T *output_base = output_v + bin_first * channels; - int *argmax_base = NULL != argmax ? argmax + bin_first * channels : NULL; - int bin_x1, bin_y1, bin_x2, bin_y2, bin_wdim, bin_hdim, bin_dims; - int pbin_x1, pbin_y1, pbin_x2, pbin_y2, pbin_wdim, pbin_hdim, pbin_dims; - bool is_empty = false; - bool pong_is_empty = false; - bool is_first_bin = true; - uint32_t src_offset = 0; - uint32_t dst_offset = 0; - uint32_t nram_offset = 0; - uint32_t half_offset = - is_half ? 
(nram_limit / 2 / half_div * half_div) * 2 : 0; - float *nram_tmp = NULL; - - uint32_t c_slice = 0; - uint32_t c_slice_align = 0; - uint32_t pongc_slice = 0; - uint32_t pongc_slice_align = 0; - for (int bin_i = bin_first; bin_i < bins_loop; bin_i++) { - getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i, height, width, channels, - p_height, p_width, (T)spatial_scale, &bin_x1, &bin_y1, - &bin_x2, &bin_y2, &bin_wdim, &bin_hdim, &bin_dims, - &input_base, &is_empty); - uint32_t c_rem = channels; - c_slice = nram_limit / bin_dims / float_div * float_div; - - if (is_first_bin && !is_empty) { - c_slice = c_slice > c_rem ? c_rem : c_slice; - c_slice_align = PAD_UP(c_slice, float_div); - for (int h = bin_y1; h < bin_y2; h++) { - src_offset = (h * width + bin_x1) * channels; - nram_offset = (h - bin_y1) * bin_wdim * c_slice_align + half_offset; - if (c_slice_align == channels) { - __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset, - bin_wdim * c_slice * t_size, GDRAM2NRAM); - } else { - __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset, - c_slice * t_size, GDRAM2NRAM, c_slice_align * t_size, - channels * t_size, bin_wdim - 1); - } - } - } - uint32_t c_offset = 0; - while (c_rem > 0) { - c_slice = c_slice > c_rem ? c_rem : c_slice; - c_slice_align = PAD_UP(c_slice, float_div); - - /*__memcpy_async*/ - if (c_rem - c_slice > 0 && !is_empty) { - pongc_slice = c_rem - c_slice > c_slice ? 
c_slice : c_rem - c_slice; - pongc_slice_align = PAD_UP(pongc_slice, float_div); - for (int h = bin_y1; h < bin_y2; h++) { - src_offset = (h * width + bin_x1) * channels + c_offset; - nram_offset = - (h - bin_y1) * bin_wdim * pongc_slice_align + half_offset; - __memcpy_async((T *)nram_pong + nram_offset, - (T *)input_base + src_offset + c_slice, - pongc_slice * t_size, GDRAM2NRAM, - pongc_slice_align * t_size, channels * t_size, - bin_wdim - 1); - } - } else if (bin_i + 1 < bins_loop) { - getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i + 1, height, width, - channels, p_height, p_width, (T)spatial_scale, &pbin_x1, - &pbin_y1, &pbin_x2, &pbin_y2, &pbin_wdim, &pbin_hdim, - &pbin_dims, &input_base, &pong_is_empty); - pongc_slice = PAD_DOWN(nram_limit / pbin_dims, float_div); - pongc_slice = pongc_slice > channels ? channels : pongc_slice; - pongc_slice_align = PAD_UP(pongc_slice, float_div); - if (!pong_is_empty) { - for (int h = pbin_y1; h < pbin_y2; h++) { - src_offset = (h * width + pbin_x1) * channels; - nram_offset = - (h - pbin_y1) * pbin_wdim * pongc_slice_align + half_offset; - if (pongc_slice_align == channels) { - __memcpy_async((T *)nram_pong + nram_offset, - (T *)input_base + src_offset, - pbin_wdim * pongc_slice * t_size, GDRAM2NRAM); - } else { - __memcpy_async((T *)nram_pong + nram_offset, - (T *)input_base + src_offset, pongc_slice * t_size, - GDRAM2NRAM, pongc_slice_align * t_size, - channels * t_size, pbin_wdim - 1); - } - } - } - } - - if (is_empty) { - __bang_write_value((T *)nram_out, c_slice_align, (T)0); - __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, - c_slice * t_size, NRAM2GDRAM); - if (NULL != argmax) { - __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1)); - __memcpy((int32_t *)argmax_base + dst_offset + c_offset, - (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM); - } - } else { - if (is_half) { - uint32_t bin_align64 = PAD_UP(bin_dims * c_slice_align, half_div); - __bang_half2float((float 
*)nram_ping, (half *)nram_ping + half_offset, - bin_align64); - } - __bang_maxpool((float *)nram_out, (float *)nram_ping, c_slice_align, - bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1); - if (is_half) { - uint32_t c_align64 = PAD_UP(c_slice_align, half_div); - __bang_float2half_rd((half *)nram_out, (float *)nram_out, c_align64); - } - __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, - c_slice * t_size, NRAM2GDRAM); - if (NULL != argmax) { - /*compute max_index*/ - __bang_maxpool_index((uint32_t *)nram_out, (float *)nram_ping, - c_slice_align, bin_hdim, bin_wdim, bin_hdim, - bin_wdim, 1, 1); - convertInt2Float((float *)nram_argmax, (float *)nram_a, - (int32_t *)nram_out, (float *)nram_b, c_slice_align); - - /*compute input_h*/ - for (int i = 0; i < c_slice; i++) { - nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim); - } - __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1, - c_slice_align); - __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width, - c_slice_align); - - /*compute input_w*/ - __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim, - c_slice_align); - __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a, - c_slice_align); - __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1, - c_slice_align); - __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a, - c_slice_align); - convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a, - (float *)nram_out, (float *)nram_b, c_slice_align); - __memcpy((int32_t *)argmax_base + dst_offset + c_offset, - (int32_t *)nram_argmax, c_slice * sizeof(int32_t), - NRAM2GDRAM); - } - } - nram_tmp = nram_ping; - nram_ping = nram_pong; - nram_pong = nram_tmp; - c_offset += c_slice; - c_rem -= c_slice; - __asm__ volatile("sync;"); - } - dst_offset += channels; - is_first_bin = false; - } -} - -__mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type, - const void *input_data, - const void *input_rois, int batch, - 
int channels, int height, int width, - int pooled_height, int pooled_width, - int rois_num, float spatial_scale, - void *output_data, int *argmax) { - switch (data_type) { - case CNRT_FLOAT16: { - MLUUnion1Roipool((half *)input_data, (half *)input_rois, batch, channels, - height, width, pooled_height, pooled_width, rois_num, - (half)spatial_scale, (half *)output_data, argmax); - }; break; - case CNRT_FLOAT32: { - MLUUnion1Roipool((float *)input_data, (float *)input_rois, batch, - channels, height, width, pooled_height, pooled_width, - rois_num, (float)spatial_scale, (float *)output_data, - argmax); - }; break; - default: { break; } - } -} -} // namespace forward - -namespace backward { -// Convert index of argmax from global grads_image to local bin in RoI. Vector -// operations do not support int type, so conversion from int to float is -// performed here. -__mlu_func__ void convertIndex( - int32_t *nram_argmax, int32_t *nram_argmax_fp, int32_t *nram_argmax_fp_bk1, - int32_t *nram_argmax_fp_bk2, int32_t *nram_argmax_int, - int32_t *nram_argmax_int_h, int32_t *nram_argmax_int_w, - int32_t *nram_argmax_fp_h, int32_t *nram_argmax_fp_w, - float *nram_atomic_add, float *nram_grads_image, int width, int height, - int wstart, int hstart, int w_compute, int h_compute, int align_c, - int channels, int loop_flag, int loop_id, int true_limit) { - convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, - (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); - - // This step uses scalar division, because the above vector division causes - // rounding accuracy problem. - for (int i = 0; i < channels; ++i) { - *((float *)nram_argmax_fp + i) = *((float *)nram_argmax_fp + i) / width; - } - - // Use 'float2int_tz' to perform '*((int32_t*)nram_argmax + i) / width' - // operation. 
- convertFloat2Int((int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk1, - (float *)nram_argmax_fp, (float *)nram_argmax_fp_bk2, - align_c); - convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, - (int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk2, - align_c); - - // Perform 'temp_result - hstart' operation - __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart, - align_c); - - // Perform 'temp_result1 - temp_result2 * width' operation - __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width, - align_c); - convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, - (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); - __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, - (float *)nram_argmax_fp_w, align_c); - - // Perform 'temp_result - wstart' operation - __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, - wstart, align_c); - - // Perform 'temp_result = h * w_compute + w' operation - __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, - w_compute, align_c); - __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, - (float *)nram_argmax_fp_w, align_c); - - if (loop_flag == 1) { - __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, - (loop_id * true_limit), align_c); - } - convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1, - (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2, - align_c); -} - -template -__mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, - const int32_t *argmax, T *grads_image, - int channels, int height, int width, - int pooled_height, int pooled_width, - int rois_num, const T spatial_scale, - int high_precision) { - // Calculate the number of rois processed by each core - int bin_num = rois_num * pooled_height * pooled_width; - int loop = - (bin_num % taskDim) ? 
(bin_num / taskDim + 1) : (bin_num / taskDim); - int tid = taskId * loop; - if (bin_num % taskDim != 0) { - if (tid >= bin_num) { - return; - } else { - // last part is (bin_num - tid). - loop = bin_num - tid < loop ? bin_num - tid : loop; - } - } - int align_c = PAD_UP(channels, ALIGN_SIZE); - // Common part has 2: grads, argmax; ping-pong each is PIPELINE_PINGPONG_NUM. - int data_size = - PAD_DOWN(((MAX_NRAM_SIZE / sizeof(float) - PIPELINE_COMMON_NUM * align_c - - (PIPELINE_PINGPONG_NUM - 1) * align_c * 2) / - 2), - ALIGN_SIZE); - int hw_limit = data_size / align_c; - float *nram_grads = (float *)nram_buffer; - for (int idx = tid; idx < tid + loop; ++idx) { - // (n, ph, pw) is a C in the pooled output - int pw = idx % pooled_width; - int ph = (idx / pooled_width) % pooled_height; - int n = idx / pooled_width / pooled_height; - - const T *offset_rois = (const T *)(rois + n * 5); - int roi_batch_ind = int(offset_rois[0]); - // Calculate the roi region on feature maps - int roi_start_w = round(offset_rois[1] * spatial_scale); - int roi_start_h = round(offset_rois[2] * spatial_scale); - int roi_end_w = round(offset_rois[3] * spatial_scale); - int roi_end_h = round(offset_rois[4] * spatial_scale); - // Force malformed rois to 1x1 - int roi_width = - roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1; - int roi_height = - roi_end_h - roi_start_h + 1 > 1 ? roi_end_h - roi_start_h + 1 : 1; - T bin_size_h = (T)roi_height / (T)pooled_height; - T bin_size_w = (T)roi_width / (T)pooled_width; - - // The corresponding bin region - int hstart = int(floor((T)ph * bin_size_h)); - int wstart = int(floor((T)pw * bin_size_w)); - int hend = int(ceil((T)(ph + 1) * bin_size_h)); - int wend = int(ceil((T)(pw + 1) * bin_size_w)); - - // Add roi offsets and clip to input boundaries, min(max(A, B), C); - hstart = hstart + roi_start_h > 0 ? hstart + roi_start_h : 0; - hstart = hstart < height ? hstart : height; - hend = hend + roi_start_h > 0 ? 
hend + roi_start_h : 0; - hend = hend < height ? hend : height; - wstart = wstart + roi_start_w > 0 ? wstart + roi_start_w : 0; - wstart = wstart < width ? wstart : width; - wend = wend + roi_start_w > 0 ? wend + roi_start_w : 0; - wend = wend < width ? wend : width; - - bool is_empty = (hend <= hstart) || (wend <= wstart); - if (!is_empty) { - int h_compute = hend - hstart; - int w_compute = wend - wstart; - int true_limit = - hw_limit < h_compute * w_compute ? hw_limit : h_compute * w_compute; - int loop_int = (h_compute * w_compute) / true_limit; - int rem = (h_compute * w_compute) % true_limit; - int32_t *nram_argmax = (int32_t *)nram_grads + align_c; - int32_t *nram_argmax_fp = (int32_t *)nram_argmax + align_c; - int32_t *nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c; - int32_t *nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c; - int32_t *nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c; - int32_t *nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c; - int32_t *nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c; - int32_t *nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c; - int32_t *nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c; - float *nram_atomic_add = (float *)nram_argmax_fp_w + align_c; - float *nram_grads_image = (float *)nram_atomic_add + align_c; - if (true_limit == h_compute * w_compute) { - /* - * NRAM partition - * |---------------------------------------------------| - * | grads | - * |---------------------------------------------------| - * | argmax | - * |---------------------------------------------------| - * | argmax_temp | - * |---------------------------------------------------| - * | atomic_add | - * |---------------------------------------------------| - * | grads_image | - * |---------------------------------------------------| - */ - - // Load the data from GDRAM to NRAM. 
- __memcpy( - (T *)nram_grads + align_c * high_precision, - (const T *)grads + - (n * pooled_height * pooled_width + ph * pooled_width + pw) * - channels, - channels * sizeof(T), GDRAM2NRAM); - if (high_precision) { - __bang_half2float((float *)nram_grads, - (half *)nram_grads + align_c * high_precision, - align_c); - } - - __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + - (n * pooled_height * pooled_width + - ph * pooled_width + pw) * - channels, - channels * sizeof(int32_t), GDRAM2NRAM); - - // Perform pooling operation on NRAM. - convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, - nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, - nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, - nram_atomic_add, nram_grads_image, width, height, wstart, - hstart, w_compute, h_compute, align_c, channels, 0, 0, 0); - __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, - (int32_t *)nram_argmax_int, align_c, h_compute, - w_compute, h_compute, w_compute, h_compute, - w_compute); - if (high_precision) { - __bang_float2half_rd((half *)nram_grads_image, - (float *)nram_grads_image, - h_compute * w_compute * align_c); - } - - // Store the result on NRAM back to GDRAM. 
- for (int hc = 0; hc < h_compute; ++hc) { - for (int wc = 0; wc < w_compute; ++wc) { - T *dst = (T *)nram_atomic_add; - int grad_image_offset = (roi_batch_ind * height * width + - (hc + hstart) * width + wc + wstart) * - channels; - T *src1 = (T *)grads_image + grad_image_offset; - int nram_grads_image_offset = (hc * w_compute + wc) * align_c; - T *src2 = (T *)nram_grads_image + nram_grads_image_offset; - __bang_atomic_add(dst, src1, src2, channels); - } - } - } else if (true_limit > 0) { - /* - * NRAM partition - * |---------------------------------------------------| - * | grads | - * |---------------------------------------------------| - * | argmax | - * |--------------------ping_pong----------------------| - * | argmax_temp | argmax_temp | - * |------------------------|--------------------------| - * | atomic_add | atomic_add | - * |------------------------|--------------------------| - * | grads_image | grads_image | - * |---------------------------------------------------| - */ - - // Load the data from GDRAM to NRAM. - __memcpy( - (T *)nram_grads + align_c * high_precision, - (const T *)grads + - (n * pooled_height * pooled_width + ph * pooled_width + pw) * - channels, - channels * sizeof(T), GDRAM2NRAM); - if (high_precision) { - __bang_half2float((float *)nram_grads, - (half *)nram_grads + align_c * high_precision, - align_c); - } - __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + - (n * pooled_height * pooled_width + - ph * pooled_width + pw) * - channels, - channels * sizeof(int32_t), GDRAM2NRAM); - - int ping_pong = 0; - int ping_pong_offset = - (MAX_NRAM_SIZE / sizeof(float) - align_c * PIPELINE_COMMON_NUM) / 2; - for (int loop_id = 0; loop_id <= loop_int; ++loop_id) { - int size = (loop_id == loop_int) ? rem : true_limit; - if (size == 0) { - break; - } - // Perform pooling operation on NRAM. 
- nram_argmax_fp = - (int32_t *)nram_argmax + align_c + ping_pong * ping_pong_offset; - nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c; - nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c; - nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c; - nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c; - nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c; - nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c; - nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c; - nram_atomic_add = (float *)nram_argmax_fp_w + align_c; - nram_grads_image = (float *)nram_atomic_add + align_c; - int loop_id_1 = loop_id; - int size_1 = ((loop_id_1) == loop_int) ? rem : true_limit; - if (size_1 == 0) { - break; - } - convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, - nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, - nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, - nram_atomic_add, nram_grads_image, width, height, wstart, - hstart, w_compute, h_compute, align_c, channels, 1, - loop_id_1, true_limit); - __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, - (int32_t *)nram_argmax_int, align_c, size_1, 1, - size_1, 1, size_1, 1); - if (high_precision) { - __bang_float2half_rd((half *)nram_grads_image, - (float *)nram_grads_image, size_1 * align_c); - } - - // Store the result on NRAM back to GDRAM. 
- for (int index_size = 0; index_size < size; ++index_size) { - int h = (loop_id * true_limit + index_size) / w_compute; - int w = (loop_id * true_limit + index_size) % w_compute; - T *dst = (T *)nram_atomic_add; - T *grads_image_n = - (T *)grads_image + roi_batch_ind * height * width * channels; - T *src1 = (T *)grads_image_n + - ((h + hstart) * width + (w + wstart)) * channels; - T *src2 = (T *)nram_grads_image + index_size * align_c; - __bang_atomic_add(dst, src1, src2, channels); - } - ping_pong = 1 - ping_pong; - } - } else { - /* - * NRAM partition - * |---------------------------------------------------| - * | grads | - * |---------------------------------------------------| - * | argmax | - * |--------------------ping_pong----------------------| - * | argmax_temp | argmax_temp | - * |------------------------|--------------------------| - * | atomic_add | atomic_add | - * |------------------------|--------------------------| - * | grads_image | grads_image | - * |---------------------------------------------------| - */ - - int c_limit = - PAD_DOWN(MAX_NRAM_SIZE / sizeof(float) / - (PIPELINE_COMMON_NUM + PIPELINE_PINGPONG_NUM * 2), - ALIGN_SIZE); - int loop_int = channels / c_limit; - int rem = channels % c_limit; - int ping_pong = 0; - int ping_pong_offset = - (MAX_NRAM_SIZE / sizeof(float) - c_limit * PIPELINE_COMMON_NUM) / 2; - for (int loop_id = 0; loop_id <= loop_int; ++loop_id) { - int size = (loop_id == loop_int) ? 
rem : c_limit; - if (size == 0) { - break; - } - nram_argmax_fp = - (int32_t *)nram_argmax + c_limit + ping_pong * ping_pong_offset; - nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + c_limit; - nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + c_limit; - nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + c_limit; - nram_argmax_int_h = (int32_t *)nram_argmax_int + c_limit; - nram_argmax_int_w = (int32_t *)nram_argmax_int_h + c_limit; - nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + c_limit; - nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + c_limit; - nram_atomic_add = (float *)nram_argmax_fp_w + c_limit; - nram_grads_image = (float *)nram_atomic_add + c_limit; - - // This pipeline loads the data from GDRAM to NRAM. - __memcpy((T *)nram_grads + c_limit * high_precision, - (const T *)grads + - n * pooled_height * pooled_width * channels + - ph * pooled_width * channels + pw * channels + - loop_id * c_limit, - size * sizeof(T), GDRAM2NRAM); - if (high_precision) { - __bang_half2float((float *)nram_grads, - (half *)nram_grads + c_limit * high_precision, - c_limit); - } - __memcpy((int32_t *)nram_argmax, - (const int32_t *)argmax + - n * pooled_height * pooled_width * channels + - ph * pooled_width * channels + pw * channels + - loop_id * c_limit, - size * sizeof(int32_t), GDRAM2NRAM); - - for (int hc = 0; hc < h_compute; ++hc) { - for (int wc = 0; wc < w_compute; ++wc) { - // This pipeline performs pooling operation on NRAM. 
- convertIndex( - nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, - nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, - nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, - nram_atomic_add, nram_grads_image, width, height, wstart + wc, - hstart + hc, h_compute, w_compute, c_limit, size, 0, 0, 0); - __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, - (int32_t *)nram_argmax_int, c_limit, 1, 1, 1, 1, - 1, 1); - if (high_precision) { - __bang_float2half_rd((half *)nram_grads_image, - (float *)nram_grads_image, c_limit); - } - // This pipeline stores the result on NRAM back to GDRAM. - T *dst = (T *)nram_atomic_add; - T *grads_image_n = - (T *)grads_image + roi_batch_ind * height * width * channels; - T *src1 = (T *)grads_image_n + - ((hc + hstart) * width + (wc + wstart)) * channels + - loop_id * c_limit; - T *src2 = (T *)nram_grads_image; - __bang_atomic_add(dst, src1, src2, size); - } - } - ping_pong = 1 - ping_pong; - } - } - } - } -} - -__mlu_global__ void MLUKernelRoiPoolBackward( - const void *grads, const void *rois, const int *argmax, void *grads_image, - int rois_num, int pooled_height, int pooled_width, int channels, int no, - int height, int width, const float spatial_scale, - const cnrtDataType_t k_dtype) { - // make sure that memcore is not used - if (coreId == 0x80) { - return; - } - switch (k_dtype) { - case CNRT_FLOAT16: { - // Using the float type '__bang_max_pool_bp' instruction to increase the - // bit width. 
- const int high_precision = 1; - MLUUnion1Roipool((const half *)rois, (const half *)grads, - (const int32_t *)argmax, (half *)grads_image, channels, - height, width, pooled_height, pooled_width, rois_num, - (const half)spatial_scale, high_precision); - }; break; - case CNRT_FLOAT32: { - const int high_precision = 0; - MLUUnion1Roipool((const float *)rois, (const float *)grads, - (const int32_t *)argmax, (float *)grads_image, channels, - height, width, pooled_height, pooled_width, rois_num, - (const float)spatial_scale, high_precision); - }; break; - default: { break; } - } -} -} // namespace backward - -void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, cnrtDataType_t data_type, - const void *input_data, const void *input_rois, - const int batch, const int channels, const int height, - const int width, const int pooled_height, - const int pooled_width, const int rois_num, - const float spatial_scale, void *output_data, - int *argmax) { - forward::MLUKernelRoiPool<<>>( - data_type, input_data, input_rois, batch, channels, height, width, - pooled_height, pooled_width, rois_num, spatial_scale, output_data, - argmax); -} - -void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, cnrtDataType_t k_dtype, - const void *grad_output_ptr, const void *rois_ptr, - const int *argmax_ptr, void *grad_input_ptr, - const int box_num, const int pooled_height, - const int pooled_width, const int channels, - const int batch, const int height, const int width, - const float spatial_scale) { - backward::MLUKernelRoiPoolBackward<<>>( - grad_output_ptr, rois_ptr, argmax_ptr, grad_input_ptr, box_num, - pooled_height, pooled_width, channels, batch, height, width, - spatial_scale, k_dtype); -} diff --git a/mmcv/ops/csrc/common/mlu/roiaware_pool3d_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roiaware_pool3d_mlu_kernel.mlu deleted file mode 100644 index 4c1edf0..0000000 --- 
a/mmcv/ops/csrc/common/mlu/roiaware_pool3d_mlu_kernel.mlu +++ /dev/null @@ -1,747 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ - -#include "common_mlu_helper.hpp" - -#define ROI_OFFSET 7 -#define FLOAT_NRAM_BUFFER_NUM 14 -#define HALF_NRAM_BUFFER_NUM 25 -#define ALIGN_NUM 64 - -__nram__ char data_nram[MAX_NRAM_SIZE]; - -template -__mlu_global__ void MLUUnion1KernelPtsIdxOfVoxels( - const int pool_method, const int boxes_num, const int pts_num, - const int max_pts_each_voxel, const int out_x, const int out_y, - const int out_z, const T *rois, const T *pts, int *pts_idx_of_voxels) { - // params (T)rois: (boxes_num, 7) - // params (T)pts: (3, pts_num) - // params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z, - // max_pts_each_voxel) - - // make sure that memcore is not used - if (coreId == 0x80) { - return; - } - int nram_pts_num = 0; - if (sizeof(T) == sizeof(float)) { - nram_pts_num = PAD_DOWN( - (MAX_NRAM_SIZE / sizeof(float) / FLOAT_NRAM_BUFFER_NUM), ALIGN_NUM); - } else { - nram_pts_num = PAD_DOWN( - (MAX_NRAM_SIZE / sizeof(half) / HALF_NRAM_BUFFER_NUM), ALIGN_NUM); - } - - char *X = NULL; - char *Y = NULL; - char *Z = NULL; - char *local_X = NULL; - char *local_Y = NULL; - char *local_Z = NULL; - char *nram_pts_in_flag = NULL; - float *temp_buffer1 = NULL; - float *temp_buffer2 = NULL; - float *temp_buffer3 = NULL; - 
float *temp_buffer4 = NULL; - float *temp_buffer5 = NULL; - float *nram_voxel_offset = NULL; - int *nram_pts_idx_seq = NULL; - float *fp_local_X = NULL; - float *fp_local_Y = NULL; - float *fp_local_Z = NULL; - float *fp_nram_pts_in_flag = NULL; - if (sizeof(T) == sizeof(float)) { - X = (char *)((float *)data_nram); - Y = (char *)((float *)data_nram + nram_pts_num); - Z = (char *)((float *)data_nram + nram_pts_num * 2); - local_X = (char *)((float *)data_nram + nram_pts_num * 3); - local_Y = (char *)((float *)data_nram + nram_pts_num * 4); - local_Z = (char *)((float *)data_nram + nram_pts_num * 5); - nram_pts_in_flag = (char *)((float *)data_nram + nram_pts_num * 6); - temp_buffer1 = (float *)data_nram + nram_pts_num * 7; - temp_buffer2 = (float *)data_nram + nram_pts_num * 8; - temp_buffer3 = (float *)data_nram + nram_pts_num * 9; - temp_buffer4 = (float *)data_nram + nram_pts_num * 10; - temp_buffer5 = (float *)data_nram + nram_pts_num * 11; - nram_voxel_offset = (float *)data_nram + nram_pts_num * 12; - nram_pts_idx_seq = (int *)((float *)data_nram + nram_pts_num * 13); - fp_local_X = (float *)local_X; - fp_local_Y = (float *)local_Y; - fp_local_Z = (float *)local_Z; - fp_nram_pts_in_flag = (float *)nram_pts_in_flag; - } else { - X = (char *)((half *)data_nram); - Y = (char *)((half *)data_nram + nram_pts_num); - Z = (char *)((half *)data_nram + nram_pts_num * 2); - local_X = (char *)((half *)data_nram + nram_pts_num * 4); - local_Y = (char *)((half *)data_nram + nram_pts_num * 6); - local_Z = (char *)((half *)data_nram + nram_pts_num * 8); - nram_pts_in_flag = (char *)((half *)data_nram + nram_pts_num * 10); - temp_buffer1 = (float *)((half *)data_nram + nram_pts_num * 11); - temp_buffer2 = (float *)((half *)data_nram + nram_pts_num * 13); - temp_buffer3 = (float *)((half *)data_nram + nram_pts_num * 15); - temp_buffer4 = (float *)((half *)data_nram + nram_pts_num * 17); - temp_buffer5 = (float *)((half *)data_nram + nram_pts_num * 19); - nram_voxel_offset = 
(float *)((half *)data_nram + nram_pts_num * 21); - nram_pts_idx_seq = (int *)((half *)data_nram + nram_pts_num * 23); - fp_local_X = (float *)((half *)local_X - nram_pts_num); - fp_local_Y = (float *)((half *)local_Y - nram_pts_num); - fp_local_Z = (float *)((half *)local_Z - nram_pts_num); - fp_nram_pts_in_flag = (float *)((half *)nram_pts_in_flag - nram_pts_num); - } - - for (int i = 0; i < nram_pts_num; i++) { - nram_pts_idx_seq[i] = i; - } - - int nram_pts_loop_times = pts_num / nram_pts_num; - int rem_nram_num = pts_num % nram_pts_num; - - for (int roi_index = taskId; roi_index < boxes_num; roi_index += taskDim) { - const T *cur_roi = rois + roi_index * ROI_OFFSET; - T cx = cur_roi[0]; - T cy = cur_roi[1]; - T cz = cur_roi[2]; - T dx = cur_roi[3]; - T dy = cur_roi[4]; - T dz = cur_roi[5]; - T rz = cur_roi[6]; - - T dx_2 = dx / 2.0; - T dy_2 = dy / 2.0; - T dz_2 = dz / 2.0; - - for (int loop_idx = 0; loop_idx <= nram_pts_loop_times; loop_idx++) { - int load_pts_num = - (loop_idx == nram_pts_loop_times) ? rem_nram_num : nram_pts_num; - if (load_pts_num == 0) { - break; - } - int pts_offset_cur_loop = nram_pts_num * loop_idx; - int compute_pts_num = (loop_idx == nram_pts_loop_times) - ? 
PAD_UP(rem_nram_num, ALIGN_NUM) - : nram_pts_num; - // load pts - __memcpy((void *)X, (T *)pts + pts_offset_cur_loop, - load_pts_num * sizeof(T), GDRAM2NRAM); - __memcpy((void *)Y, (T *)pts + pts_num + pts_offset_cur_loop, - load_pts_num * sizeof(T), GDRAM2NRAM); - __memcpy((void *)Z, (T *)pts + pts_num * 2 + pts_offset_cur_loop, - load_pts_num * sizeof(T), GDRAM2NRAM); - // fabs(local_z) - __bang_sub_scalar((T *)local_Z, (T *)Z, (T)cz, compute_pts_num); - __bang_sub_scalar((T *)temp_buffer1, (T *)Z, (T)(cz + dz_2), - compute_pts_num); - __bang_active_abs((T *)temp_buffer1, (T *)temp_buffer1, compute_pts_num); -#if __BANG_ARCH__ >= 322 - __bang_le_scalar((T *)nram_pts_in_flag, (T *)temp_buffer1, (T)(dz_2), - compute_pts_num); -#else - __bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dz_2)); - __bang_le((T *)nram_pts_in_flag, (T *)temp_buffer1, (T *)temp_buffer2, - compute_pts_num); -#endif - T cosa = std::cos(-rz); - T sina = std::sin(-rz); - __bang_sub_scalar((T *)temp_buffer3, (T *)X, (T)cx, compute_pts_num); - __bang_sub_scalar((T *)temp_buffer4, (T *)Y, (T)cy, compute_pts_num); - __bang_mul_scalar((T *)temp_buffer1, (T *)temp_buffer3, (T)cosa, - compute_pts_num); - __bang_mul_scalar((T *)temp_buffer2, (T *)temp_buffer4, (T)sina, - compute_pts_num); - // local_x - __bang_sub((T *)local_X, (T *)temp_buffer1, (T *)temp_buffer2, - compute_pts_num); - // fabs(local_x) - __bang_active_abs((T *)temp_buffer1, (T *)local_X, compute_pts_num); - // fabs(local_x) < dx/2 ? 
1 : 0 -#if __BANG_ARCH__ >= 322 - __bang_lt_scalar((T *)temp_buffer1, (T *)temp_buffer1, (T)(dx_2), - compute_pts_num); -#else - __bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dx_2)); - __bang_lt((T *)temp_buffer1, (T *)temp_buffer1, (T *)temp_buffer2, - compute_pts_num); -#endif - __bang_and((T *)nram_pts_in_flag, (T *)nram_pts_in_flag, - (T *)temp_buffer1, - compute_pts_num); // flush res - - __bang_mul_scalar((T *)temp_buffer1, (T *)temp_buffer3, (T)sina, - compute_pts_num); - __bang_mul_scalar((T *)temp_buffer2, (T *)temp_buffer4, (T)cosa, - compute_pts_num); - // local_y - __bang_add((T *)local_Y, (T *)temp_buffer1, (T *)temp_buffer2, - compute_pts_num); - // fabs(local_y) - __bang_active_abs((T *)temp_buffer1, (T *)local_Y, compute_pts_num); - // fabs(local_y) < dy/2 ? 1 : 0 -#if __BANG_ARCH__ >= 322 - __bang_lt_scalar((T *)temp_buffer1, (T *)temp_buffer1, (T)(dy_2), - compute_pts_num); -#else - __bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dy_2)); - __bang_lt((T *)temp_buffer1, (T *)temp_buffer1, (T *)temp_buffer2, - compute_pts_num); -#endif - __bang_and((T *)nram_pts_in_flag, (T *)nram_pts_in_flag, - (T *)temp_buffer1, - compute_pts_num); // flush res - T x_res = dx / out_x; - T y_res = dy / out_y; - T z_res = dz / out_z; - __bang_add_scalar((T *)local_X, (T *)local_X, (T)(dx_2), compute_pts_num); - __bang_add_scalar((T *)local_Y, (T *)local_Y, (T)(dy_2), compute_pts_num); - // local_Z do not need to add dz/2.0 - -#if (__BANG_ARCH__ >= 322) && (__BANG_ARCH__ != 372) - __bang_div((T *)local_X, (T *)local_X, (T)x_res, compute_pts_num); - __bang_div((T *)local_Y, (T *)local_Y, (T)y_res, compute_pts_num); - __bang_div((T *)local_Z, (T *)local_Z, (T)z_res, compute_pts_num); -#else - __bang_mul_scalar((T *)local_X, (T *)local_X, (T)(1 / x_res), - compute_pts_num); - __bang_mul_scalar((T *)local_Y, (T *)local_Y, (T)(1 / y_res), - compute_pts_num); - __bang_mul_scalar((T *)local_Z, (T *)local_Z, (T)(1 / z_res), - compute_pts_num); 
-#endif - // float = float2int + int2float, half = half2int + int2float - if (sizeof(T) == sizeof(float)) { -#if __BANG_ARCH__ >= 322 - __bang_float2int32_tz((int *)temp_buffer1, (float *)local_X, - compute_pts_num, 0); - __bang_float2int32_tz((int *)temp_buffer2, (float *)local_Y, - compute_pts_num, 0); - __bang_float2int32_tz((int *)temp_buffer3, (float *)local_Z, - compute_pts_num, 0); - __bang_int322float_rn((float *)fp_local_X, (int *)temp_buffer1, - compute_pts_num, 0); - __bang_int322float_rn((float *)fp_local_Y, (int *)temp_buffer2, - compute_pts_num, 0); - __bang_int322float_rn((float *)fp_local_Z, (int *)temp_buffer3, - compute_pts_num, 0); -#else - convertFloat2Int((int *)temp_buffer1, (float *)temp_buffer2, - (float *)fp_local_X, (float *)temp_buffer3, - compute_pts_num); - convertFloat2Int((int *)temp_buffer2, (float *)temp_buffer3, - (float *)fp_local_Y, (float *)temp_buffer4, - compute_pts_num); - convertFloat2Int((int *)temp_buffer3, (float *)temp_buffer4, - (float *)fp_local_Z, (float *)temp_buffer5, - compute_pts_num); - convertInt2Float((float *)fp_local_X, (float *)temp_buffer4, - (int *)temp_buffer1, (float *)temp_buffer5, - compute_pts_num); - convertInt2Float((float *)fp_local_Y, (float *)temp_buffer4, - (int *)temp_buffer2, (float *)temp_buffer5, - compute_pts_num); - convertInt2Float((float *)fp_local_Z, (float *)temp_buffer4, - (int *)temp_buffer3, (float *)temp_buffer5, - compute_pts_num); -#endif - } else { - __bang_half2float((float *)temp_buffer4, (half *)nram_pts_in_flag, - compute_pts_num); - __bang_move((void *)fp_nram_pts_in_flag, (void *)temp_buffer4, - compute_pts_num * sizeof(float)); -#if __BANG_ARCH__ >= 322 - __bang_half2int32_tz((int *)temp_buffer1, (half *)local_X, - compute_pts_num, 0); - __bang_half2int32_tz((int *)temp_buffer2, (half *)local_Y, - compute_pts_num, 0); - __bang_half2int32_tz((int *)temp_buffer3, (half *)local_Z, - compute_pts_num, 0); - __bang_int322float_rn((float *)fp_local_X, (int *)temp_buffer1, - 
compute_pts_num, 0); - __bang_int322float_rn((float *)fp_local_Y, (int *)temp_buffer2, - compute_pts_num, 0); - __bang_int322float_rn((float *)fp_local_Z, (int *)temp_buffer3, - compute_pts_num, 0); -#else - __bang_half2int16_tz((int16_t *)temp_buffer1, (half *)local_X, - compute_pts_num, 0); - __bang_half2int16_tz((int16_t *)temp_buffer2, (half *)local_Y, - compute_pts_num, 0); - __bang_half2int16_tz((int16_t *)temp_buffer3, (half *)local_Z, - compute_pts_num, 0); - __bang_int162float((float *)fp_local_X, (int16_t *)temp_buffer1, - compute_pts_num, 0); - __bang_int162float((float *)fp_local_Y, (int16_t *)temp_buffer2, - compute_pts_num, 0); - __bang_int162float((float *)fp_local_Z, (int16_t *)temp_buffer3, - compute_pts_num, 0); -#endif - } - // process index >= 0 - __bang_write_value((float *)temp_buffer4, compute_pts_num, (float)0.0f); - __bang_maxequal((float *)fp_local_X, (float *)fp_local_X, - (float *)temp_buffer4, compute_pts_num); - __bang_maxequal((float *)fp_local_Y, (float *)fp_local_Y, - (float *)temp_buffer4, compute_pts_num); - __bang_maxequal((float *)fp_local_Z, (float *)fp_local_Z, - (float *)temp_buffer4, compute_pts_num); - // process index <= (out_x - 1) - __bang_write_value((float *)temp_buffer5, compute_pts_num, - (float)(out_x - 1)); - __bang_minequal((float *)fp_local_X, (float *)fp_local_X, - (float *)temp_buffer5, compute_pts_num); - __bang_write_value((float *)temp_buffer5, compute_pts_num, - (float)(out_y - 1)); - __bang_minequal((float *)fp_local_Y, (float *)fp_local_Y, - (float *)temp_buffer5, compute_pts_num); - __bang_write_value((float *)temp_buffer5, compute_pts_num, - (float)(out_z - 1)); - __bang_minequal((float *)fp_local_Z, (float *)fp_local_Z, - (float *)temp_buffer5, compute_pts_num); - __bang_mul_scalar((float *)temp_buffer1, (float *)fp_local_X, - (float)(out_y * out_z), compute_pts_num); - __bang_mul_scalar((float *)temp_buffer2, (float *)fp_local_Y, - (float)out_z, compute_pts_num); - __bang_mul_scalar((float 
*)temp_buffer3, (float *)fp_local_Z, (float)1.0, - compute_pts_num); - __bang_add((float *)nram_voxel_offset, (float *)temp_buffer1, - (float *)temp_buffer2, compute_pts_num); - __bang_add((float *)nram_voxel_offset, (float *)nram_voxel_offset, - (float *)temp_buffer3, compute_pts_num); - __bang_mul_scalar((float *)nram_voxel_offset, (float *)nram_voxel_offset, - (float)max_pts_each_voxel, compute_pts_num); - if (compute_pts_num != load_pts_num) { - __memset_nram((float *)fp_nram_pts_in_flag + load_pts_num, - compute_pts_num - load_pts_num, (float)0.0); - } - __bang_collect((float *)temp_buffer4, (float *)nram_pts_idx_seq, - (float *)fp_nram_pts_in_flag, compute_pts_num); - int pts_num_in_cur_roi = - (int)__bang_count((float *)fp_nram_pts_in_flag, compute_pts_num); - int *pts_idx_cur_voxels = - (int *)pts_idx_of_voxels + - roi_index * out_x * out_y * out_z * max_pts_each_voxel; - for (int idx = 0; idx < pts_num_in_cur_roi; idx++) { - int cur_pts_idx = *((int *)temp_buffer4 + idx); - int offset = (int)(*((float *)nram_voxel_offset + cur_pts_idx)); - int cnt = pts_idx_cur_voxels[offset]; - if (cnt < max_pts_each_voxel - 1) { - pts_idx_cur_voxels[offset + cnt + 1] = - cur_pts_idx + loop_idx * nram_pts_num; - pts_idx_cur_voxels[offset]++; - } - } - } - } -} - -template -__mlu_global__ void MLUUnion1KernelRoiawarePool3dForward( - const int pool_method, const int boxes_num, const int pts_num, - const int channels, const int max_pts_each_voxel, const int out_x, - const int out_y, const int out_z, const T *pts_feature, - const int *pts_idx_of_voxels, T *pooled_features, int *argmax) { - // params (T)pts_feature: (channels, pts_num) - // params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z, - // max_pts_each_voxel) params (int)argmax: (boxes_num, out_x, out_y, out_z, - // channels) params (T)pooled_features: (boxes_num, out_x, out_y, out_z, - // channels) - - // make sure that memcore is not used - if (coreId == 0x80) { - return; - } - int align_num = 
NFU_ALIGN_SIZE / sizeof(T); - int align_max_pts_each_voxel = PAD_UP(max_pts_each_voxel, align_num); - int nram_channels_limit = - PAD_DOWN((MAX_NRAM_SIZE - 128 - - align_max_pts_each_voxel * (sizeof(int) + sizeof(T))) / - ((align_max_pts_each_voxel + 1) * sizeof(T) + sizeof(int)), - align_num); - int *nram_pts_idx_cur_voxel = (int *)data_nram; - // nram_pts_idx_cur_voxel [align_max_pts_each_voxel] - T *nram_max_pts_feature_tmp = - (T *)((int *)nram_pts_idx_cur_voxel + align_max_pts_each_voxel); - // nram_max_pts_feature_tmp [align_max_pts_each_voxel] - T *nram_pts_feature_in_voxel = - ((T *)nram_max_pts_feature_tmp + align_max_pts_each_voxel); - // nram_pts_feature_in_voxel [nram_channels_limit, align_max_pts_each_voxel] - T *nram_pooled_features_cur_voxel = - ((T *)nram_pts_feature_in_voxel + - nram_channels_limit * align_max_pts_each_voxel); - // nram_pooled_features_cur_voxel [nram_channels_limit] - int *nram_argmax_cur_voxel = - (int *)((T *)nram_pooled_features_cur_voxel + nram_channels_limit); - // nram_argmax_cur_voxel [nram_channels_limit] - char *one_pooled_feature = - (char *)((int *)nram_argmax_cur_voxel + nram_channels_limit); - // one_pooled_feature [128] - int channels_loop_times = channels / nram_channels_limit; - int rem_channels = channels % nram_channels_limit; - for (int voxel_index = taskId; - voxel_index < boxes_num * out_x * out_y * out_z; - voxel_index += taskDim) { - int *pts_idx_cur_voxels = - (int *)pts_idx_of_voxels + voxel_index * max_pts_each_voxel; - __memcpy((void *)nram_pts_idx_cur_voxel, (void *)pts_idx_cur_voxels, - max_pts_each_voxel * sizeof(int), GDRAM2NRAM); - int pts_num_cur_voxel = nram_pts_idx_cur_voxel[0]; - if (pts_num_cur_voxel == 0) { - continue; - } - for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times; - channels_loop_idx++) { - int actual_channels_num = (channels_loop_idx == channels_loop_times) - ? 
rem_channels - : nram_channels_limit; - if (actual_channels_num == 0) { - break; - } - int channels_offset = nram_channels_limit * channels_loop_idx; - -#if ((__BANG_ARCH__ >= 200) && (__BANG_ARCH__ < 300)) - int compute_channels_num = (channels_loop_idx == channels_loop_times) - ? PAD_UP(rem_channels, align_num) - : nram_channels_limit; - if (pool_method == 0) { - __bang_write_value((void *)nram_pts_feature_in_voxel, - compute_channels_num * align_max_pts_each_voxel, - (T)-INFINITY); - } -#endif - - T *pts_feature_cur_loop = (T *)pts_feature + channels_offset * pts_num; - for (int idx = 0; idx < pts_num_cur_voxel; idx++) { - __memcpy((T *)nram_pts_feature_in_voxel + idx, - (T *)pts_feature_cur_loop + nram_pts_idx_cur_voxel[idx + 1], - sizeof(T), GDRAM2NRAM, align_max_pts_each_voxel * sizeof(T), - pts_num * sizeof(T), actual_channels_num - 1); - } - for (int channel_idx = 0; channel_idx < actual_channels_num; - channel_idx++) { - if (pool_method == 0) { -#if __BANG_ARCH__ >= 322 - __bang_argmax((T *)one_pooled_feature, - (T *)nram_pts_feature_in_voxel + - channel_idx * align_max_pts_each_voxel, - pts_num_cur_voxel); - T max_val = ((T *)one_pooled_feature)[0]; - int max_idx = (int)(*(uint32_t *)((T *)one_pooled_feature + 1)); - nram_pooled_features_cur_voxel[channel_idx] = - (max_val == -INFINITY) ? 0 : max_val; - nram_argmax_cur_voxel[channel_idx] = - (max_val == -INFINITY) ? 
-1 : nram_pts_idx_cur_voxel[max_idx + 1]; -#else - // __bang_max need align num on mlu200 series - if (sizeof(T) == sizeof(float)) { - __bang_max((float *)one_pooled_feature, - (float *)nram_pts_feature_in_voxel + - channel_idx * align_max_pts_each_voxel, - align_max_pts_each_voxel); - float max_val = ((float *)one_pooled_feature)[0]; - __bang_write_value((void *)nram_max_pts_feature_tmp, - align_max_pts_each_voxel, (float)max_val); - __bang_eq((float *)nram_max_pts_feature_tmp, - (float *)nram_pts_feature_in_voxel + - channel_idx * align_max_pts_each_voxel, - (float *)nram_max_pts_feature_tmp, - align_max_pts_each_voxel); - int max_idx = (int)__bang_findfirst1( - (float *)nram_max_pts_feature_tmp, align_max_pts_each_voxel); - nram_pooled_features_cur_voxel[channel_idx] = - (max_val == -INFINITY) ? 0 : max_val; - nram_argmax_cur_voxel[channel_idx] = - (max_val == -INFINITY) ? -1 - : nram_pts_idx_cur_voxel[max_idx + 1]; - } else { - int max_idx = -1; - float max_val = -INFINITY; - for (int k = 0; k < pts_num_cur_voxel; k++) { - float pts_feature_cur_channel = __half2float_rd( - *((half *)nram_pts_feature_in_voxel + - channel_idx * align_max_pts_each_voxel + k)); - if (pts_feature_cur_channel > max_val) { - max_val = pts_feature_cur_channel; - max_idx = k; - } - } - nram_pooled_features_cur_voxel[channel_idx] = - (max_idx == -1) ? 0 : max_val; - nram_argmax_cur_voxel[channel_idx] = - (max_idx == -1) ? 
-1 : nram_pts_idx_cur_voxel[max_idx + 1]; - } -#endif - } else if (pool_method == 1) { - float sum_val_cur_channel = 0; - for (int k = 0; k < pts_num_cur_voxel; k++) { - sum_val_cur_channel += static_cast( - ((T *)nram_pts_feature_in_voxel)[channel_idx * - align_max_pts_each_voxel + - k]); - } - nram_pooled_features_cur_voxel[channel_idx] = - (T)(sum_val_cur_channel / pts_num_cur_voxel); - } - } - // store - __memcpy((T *)pooled_features + voxel_index * channels + channels_offset, - (void *)nram_pooled_features_cur_voxel, - actual_channels_num * sizeof(T), NRAM2GDRAM); - if (pool_method == 0) { - __memcpy((int *)argmax + voxel_index * channels + channels_offset, - (void *)nram_argmax_cur_voxel, - actual_channels_num * sizeof(int), NRAM2GDRAM); - } - } - } -} - -void KernelPtsIdxOfVoxels(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, const cnrtDataType_t d_type, - const int pool_method, const int boxes_num, - const int pts_num, const int max_pts_each_voxel, - const int out_x, const int out_y, const int out_z, - const void *rois, const void *pts, - int *pts_idx_of_voxels) { - switch (d_type) { - case CNRT_FLOAT32: { - MLUUnion1KernelPtsIdxOfVoxels<<>>( - pool_method, boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, - out_z, (float *)rois, (float *)pts, (int *)pts_idx_of_voxels); - }; break; - case CNRT_FLOAT16: { - MLUUnion1KernelPtsIdxOfVoxels<<>>( - pool_method, boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, - out_z, (half *)rois, (half *)pts, (int *)pts_idx_of_voxels); - }; break; - default: { - break; - } - } -} - -void KernelRoiawarePool3dForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const cnrtDataType_t d_type, const int pool_method, const int boxes_num, - const int pts_num, const int channels, const int max_pts_each_voxel, - const int out_x, const int out_y, const int out_z, const void *pts_feature, - const int *pts_idx_of_voxels, void *pooled_features, int *argmax) { - switch (d_type) { - case 
CNRT_FLOAT32: { - MLUUnion1KernelRoiawarePool3dForward<<>>( - pool_method, boxes_num, pts_num, channels, max_pts_each_voxel, out_x, - out_y, out_z, (float *)pts_feature, (int *)pts_idx_of_voxels, - (float *)pooled_features, (int *)argmax); - }; break; - case CNRT_FLOAT16: { - MLUUnion1KernelRoiawarePool3dForward<<>>( - pool_method, boxes_num, pts_num, channels, max_pts_each_voxel, out_x, - out_y, out_z, (half *)pts_feature, (int *)pts_idx_of_voxels, - (half *)pooled_features, (int *)argmax); - }; break; - default: { - break; - } - } -} - -template -__mlu_global__ void MLUUnion1KernelRoiawareMaxPool3dBackward( - const int boxes_num, const int out_x, const int out_y, const int out_z, - const int channels, const int *argmax, const T *grad_out, T *grad_in) { - // params (int)argmax: (boxes_num, out_x, out_y, out_z, channels) - // params (T)grad_out: (boxes_num, out_x, out_y, out_z, channels) - // params (T)grad_in: (pts_num, channels) - - // make sure that memcore is not used - if (coreId == 0x80) { - return; - } - int nram_channels_limit = - (MAX_NRAM_SIZE - sizeof(T) * 1) / (sizeof(T) + sizeof(int)); - int *nram_argmax_cur_loop = (int *)data_nram; - // nram_argmax_cur_loop [nram_channels_limit] - T *nram_grad_out_cur_loop = - (T *)((int *)nram_argmax_cur_loop + nram_channels_limit); - // nram_grad_out_cur_loop [nram_channels_limit] - T *nram_grad_in_cur_channel = - (T *)nram_grad_out_cur_loop + nram_channels_limit; - // nram_grad_in_cur_channel [1] - int channels_loop_times = channels / nram_channels_limit; - int rem_channels = channels % nram_channels_limit; - int voxels_num = boxes_num * out_x * out_y * out_z; - - for (int voxel_index = taskId; voxel_index < voxels_num; - voxel_index += taskDim) { - const int *argmax_cur_voxel = argmax + voxel_index * channels; - const T *grad_out_cur_voxel = grad_out + voxel_index * channels; - - for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times; - channels_loop_idx++) { - int actual_channels_num = 
(channels_loop_idx == channels_loop_times) - ? rem_channels - : nram_channels_limit; - if (actual_channels_num == 0) { - break; - } - const int *argmax_cur_loop = - argmax_cur_voxel + nram_channels_limit * channels_loop_idx; - const T *grad_out_cur_loop = - grad_out_cur_voxel + nram_channels_limit * channels_loop_idx; - __memcpy((void *)nram_argmax_cur_loop, (void *)argmax_cur_loop, - actual_channels_num * sizeof(int), GDRAM2NRAM); - __memcpy((void *)nram_grad_out_cur_loop, (void *)grad_out_cur_loop, - actual_channels_num * sizeof(T), GDRAM2NRAM); - - for (int channel_idx = 0; channel_idx < actual_channels_num; - channel_idx++) { - int *nram_argmax_cur_channel = nram_argmax_cur_loop + channel_idx; - T *nram_grad_out_cur_channel = nram_grad_out_cur_loop + channel_idx; - if (nram_argmax_cur_channel[0] == -1) { - continue; - } - T *grad_in_cur_channel = - grad_in + nram_argmax_cur_channel[0] * channels + - nram_channels_limit * channels_loop_idx + channel_idx; - __bang_atomic_add((T *)nram_grad_in_cur_channel, - (T *)grad_in_cur_channel, - (T *)(nram_grad_out_cur_channel), 1); - } - } - } -} - -template -__mlu_global__ void MLUUnion1KernelRoiawareAvgPool3dBackward( - const int boxes_num, const int out_x, const int out_y, const int out_z, - const int channels, const int max_pts_each_voxel, - const int *pts_idx_of_voxels, const T *grad_out, T *grad_in) { - // params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z, - // max_pts_each_voxel) params (T)grad_out: (boxes_num, out_x, out_y, out_z, - // channels) params (T)grad_in: (pts_num, channels) - - // make sure that memcore is not used - if (coreId == 0x80) { - return; - } - int align_num = NFU_ALIGN_SIZE / sizeof(T); - int align_max_pts_each_voxel = PAD_UP(max_pts_each_voxel, align_num); - int nram_channels_limit = PAD_DOWN( - (MAX_NRAM_SIZE - align_max_pts_each_voxel * sizeof(int)) / 2 / sizeof(T), - align_num); - int *nram_pts_idx_cur_voxel = (int *)data_nram; - // nram_pts_idx_cur_voxel 
[align_max_pts_each_voxel] - T *nram_grad_out_cur_loop = - (T *)((int *)nram_pts_idx_cur_voxel + align_max_pts_each_voxel); - // nram_grad_out_cur_loop [nram_channels_limit] - T *nram_grad_in_cur_loop = (T *)nram_grad_out_cur_loop + nram_channels_limit; - // nram_grad_in_cur_loop [nram_channels_limit] - int channels_loop_times = channels / nram_channels_limit; - int rem_channels = channels % nram_channels_limit; - int voxels_num = boxes_num * out_x * out_y * out_z; - - for (int voxel_index = taskId; voxel_index < voxels_num; - voxel_index += taskDim) { - const T *grad_out_cur_voxel = grad_out + voxel_index * channels; - const int *pts_idx_cur_voxel = - pts_idx_of_voxels + voxel_index * max_pts_each_voxel; - __memcpy((void *)nram_pts_idx_cur_voxel, (void *)pts_idx_cur_voxel, - max_pts_each_voxel * sizeof(int), GDRAM2NRAM); - int total_pts_of_voxel = nram_pts_idx_cur_voxel[0]; - if (total_pts_of_voxel <= 0) { - continue; - } - float cur_grad = 1.0 / ((float)total_pts_of_voxel); - - for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times; - channels_loop_idx++) { - int actual_channels_num = (channels_loop_idx == channels_loop_times) - ? 
rem_channels - : nram_channels_limit; - if (actual_channels_num == 0) { - break; - } - const T *grad_out_cur_loop = - grad_out_cur_voxel + nram_channels_limit * channels_loop_idx; - __memcpy((void *)nram_grad_in_cur_loop, (void *)grad_out_cur_loop, - actual_channels_num * sizeof(T), GDRAM2NRAM); - - int align_actual_channels_num = PAD_UP(actual_channels_num, align_num); - - if (sizeof(T) == sizeof(half)) { - __bang_half2float((float *)nram_grad_out_cur_loop, - (half *)nram_grad_in_cur_loop, - align_actual_channels_num); - __bang_mul_scalar((float *)nram_grad_out_cur_loop, - (float *)nram_grad_out_cur_loop, (float)cur_grad, - align_actual_channels_num); - convertFloat2half((half *)nram_grad_out_cur_loop, - (float *)nram_grad_out_cur_loop, - align_actual_channels_num); - } else { - __bang_mul_scalar((float *)nram_grad_out_cur_loop, - (float *)nram_grad_in_cur_loop, (float)cur_grad, - align_actual_channels_num); - } - for (int k = 1; k <= total_pts_of_voxel; k++) { - T *grad_in_cur_loop = grad_in + nram_pts_idx_cur_voxel[k] * channels + - nram_channels_limit * channels_loop_idx; - __bang_atomic_add((T *)nram_grad_in_cur_loop, (T *)grad_in_cur_loop, - (T *)nram_grad_out_cur_loop, actual_channels_num); - } - } - } -} - -void KernelRoiawarePool3dBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const cnrtDataType_t d_type, const int pool_method, const int boxes_num, - const int out_x, const int out_y, const int out_z, const int channels, - const int max_pts_each_voxel, const int *pts_idx_of_voxels, - const int *argmax, const void *grad_out, void *grad_in) { - if (pool_method == 0) { - switch (d_type) { - case CNRT_FLOAT32: { - MLUUnion1KernelRoiawareMaxPool3dBackward - <<>>(boxes_num, out_x, out_y, out_z, channels, - (int *)argmax, (float *)grad_out, - (float *)grad_in); - }; break; - case CNRT_FLOAT16: { - MLUUnion1KernelRoiawareMaxPool3dBackward - <<>>(boxes_num, out_x, out_y, out_z, channels, - (int *)argmax, (half *)grad_out, - (half 
*)grad_in); - }; break; - default: { - break; - } - } - } else { - switch (d_type) { - case CNRT_FLOAT32: { - MLUUnion1KernelRoiawareAvgPool3dBackward - <<>>( - boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel, - (int *)pts_idx_of_voxels, (float *)grad_out, (float *)grad_in); - }; break; - case CNRT_FLOAT16: { - MLUUnion1KernelRoiawareAvgPool3dBackward - <<>>( - boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel, - (int *)pts_idx_of_voxels, (half *)grad_out, (half *)grad_in); - }; break; - default: { - break; - } - } - } -} diff --git a/mmcv/ops/csrc/common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu deleted file mode 100644 index 58a15d8..0000000 --- a/mmcv/ops/csrc/common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu +++ /dev/null @@ -1,536 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * OR IMPLIED, INCLUDING BUvoid NOKType LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENvoid SHALL THE AUTHORS OR COPYRIGHKType HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORvoid OR OTHERWISE, ARISING FROM, OUKType OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ - -#include "common_mlu_helper.hpp" - -/************************************************************************* - * - * NRAM partition: - * | boxes3d | ping points + pong points | aux_a ~ aux_f | - * | 7 * sizeof(T) | 6 * deal_num * sizeof(T) | 6 * deal_num * sizeof(T) | - * - *************************************************************************/ -#define TWELVE_SPLIT 12 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template -__mlu_func__ void checkPointsInBox3d(const T *boxes3d, - const size_t deal_num, - T *x, - T *y, - T *z, - T *auxiliary_a, - T *auxiliary_b, - T *auxiliary_c, - T *auxiliary_d, - T *auxiliary_e, - T *auxiliary_f, - T *pts_assign) { - // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate - T cx = boxes3d[0]; - T cy = boxes3d[1]; - T cz = boxes3d[2]; - T dx = boxes3d[3]; - T dy = boxes3d[4]; - T dz = boxes3d[5]; - T rz = boxes3d[6]; - // shift to the center since cz in box3d is the bottom center - cz += 0.5 * dz; - - T cosa = (T)std::cos(-rz); - T sina = (T)std::sin(-rz); - - // x - cx - __bang_sub_scalar((T *)auxiliary_a, (T *)x, (T)cx, deal_num); - // y - cy - __bang_sub_scalar((T *)auxiliary_b, (T *)y, (T)cy, deal_num); - // z - cz - __bang_sub_scalar((T *)auxiliary_c, (T *)z, (T)cz, deal_num); - // |z - cz| - __bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num); - // |z - cz| > dz / 2.0 -#if __BANG_ARCH__ >= 322 - __bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num); -#else - __bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz)); - __bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num); -#endif - // !(|z - cz| > dz / 2.0) - __bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num); - // (x - cx) * cos(-rz) - __bang_mul_scalar((T *)auxiliary_d, (T *)auxiliary_a, (T)cosa, deal_num); - // (y - cy) * sin(-rz) - __bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_b, (T)sina, deal_num); - // 
local_x = (x - cx) * cos(-rz) + (y - cy) * -sin(-rz) - __bang_sub((T *)auxiliary_d, (T *)auxiliary_d, (T *)auxiliary_e, deal_num); - // |local_x| - __bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num); - // |local_x| < dx / 2.0 -#if __BANG_ARCH__ >= 322 - __bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num); -#else - __bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx)); - __bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num); -#endif - // (x - cx) * sin(-rz) - __bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num); - // (y - cy) * cos(-rz) - __bang_mul_scalar((T *)auxiliary_f, (T *)auxiliary_b, (T)cosa, deal_num); - // local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz) - __bang_add((T *)auxiliary_e, (T *)auxiliary_e, (T *)auxiliary_f, deal_num); - // |local_y| - __bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num); - // |local_y| < dy / 2.0 -#if __BANG_ARCH__ >= 322 - __bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num); -#else - __bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy)); - __bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num); -#endif - // pts_assign = |x - cx| < dx / 2.0 && |y - cy| < dy / 2.0 && |z - cz| <= dz / 2.0 - __bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num); - __bang_mul((T *)pts_assign, (T *)pts_assign, (T *)auxiliary_e, deal_num); -} - -template -__mlu_func__ void computeStoreRoipointPool3d(char *boxes3d, - int *cnt, - char *points_x, - char *points_y, - char *points_z, - const char *point_features, - char *auxiliary_a, - char *auxiliary_b, - char *auxiliary_c, - char *auxiliary_d, - char *auxiliary_e, - char *auxiliary_f, - const int box_idx, - const int pts_num, - const int feature_in_len, - const int sampled_pts_num, - const size_t span_num_deal, - char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { - char *pts_assign = auxiliary_a; - if (*cnt >= sampled_pts_num) { - return; 
- } - checkPointsInBox3d((T *)boxes3d, span_num_deal, (T *)points_x, (T *)points_y, (T *)points_z, - (T *)auxiliary_a, (T *)auxiliary_b, (T *)auxiliary_c, (T *)auxiliary_d, - (T *)auxiliary_e, (T *)auxiliary_f, (T *)pts_assign); - - // __bang_select returns selected elements vector and the number of selected elements - __bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal); - uint32_t select_num = *((uint32_t *)auxiliary_b); - - if (select_num == 0) { - return; - } - int sampled_pts_num_rem = sampled_pts_num - *cnt; - int segnum = min((int)select_num, sampled_pts_num_rem) - 1; - - // copy x to pooled_features_gdram - // The result of __bang_select is composed of three parts: - // The first 4-byte is the number of selected element, whose data type is unsigned int. - // The next 124-byte is zero. The rest bytes are the selected elements. - int select_num_size = 128; - __memcpy( - pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T), - (T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM, - (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); - - // copy y to pooled_features_gdram - __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) + - 1 * sizeof(T), - (T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - - // copy z to pooled_features_gdram - __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) + - 2 * sizeof(T), - (T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - - // copy features to pooled_features_gdram - for (int c_idx = 0; c_idx < feature_in_len; c_idx++) { - __memcpy(auxiliary_d, point_features + c_idx * pts_num * 
sizeof(T), span_num_deal * sizeof(T), - GDRAM2NRAM); - __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) + - (3 + c_idx) * sizeof(T), - auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - } - - *cnt += select_num; -} - -template -__mlu_func__ void computeStoreLastBlockRoipointPool3d(char *boxes3d, - int *cnt, - char *points_x, - char *points_y, - char *points_z, - const char *point_features, - char *auxiliary_a, - char *auxiliary_b, - char *auxiliary_c, - char *auxiliary_d, - char *auxiliary_e, - char *auxiliary_f, - const int box_idx, - const int pts_num, - const int feature_in_len, - const int sampled_pts_num, - const size_t span_num_deal, - const size_t auxiliary_num_deal, - char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { - char *pts_assign = auxiliary_a; - if (*cnt >= sampled_pts_num) { - // pooled_empty_flag_gdram set 0 - *((int *)auxiliary_a) = 0; - __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM); - return; - } - checkPointsInBox3d((T *)boxes3d, span_num_deal, (T *)points_x, (T *)points_y, (T *)points_z, - (T *)auxiliary_a, (T *)auxiliary_b, (T *)auxiliary_c, (T *)auxiliary_d, - (T *)auxiliary_e, (T *)auxiliary_f, (T *)pts_assign); - - // __bang_select returns selected elements vector and the number of selected elements - __bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal); - uint32_t select_num = *((uint32_t *)auxiliary_b); - - if (*cnt + select_num == 0) { - // pooled_empty_flag_gdram set 1 - *((int *)auxiliary_a) = 1; - __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM); - - // pooled_features_gdram set 0 - int repeat = (sampled_pts_num * (3 + feature_in_len)) / (auxiliary_num_deal * 6); - int rem = (sampled_pts_num * (3 + feature_in_len)) % 
(auxiliary_num_deal * 6); - // use auxiliary_a to auxiliary_f - __bang_write_zero((T *)auxiliary_a, PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE)); - if (repeat > 0) { - __memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T), - auxiliary_a, auxiliary_num_deal * 6 * sizeof(T), NRAM2GDRAM, - auxiliary_num_deal * 6 * sizeof(T), 0, repeat - 1); - } - if (rem > 0) { - __memcpy(pooled_features_gdram + - box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T) + - repeat * auxiliary_num_deal * 6 * sizeof(T), - auxiliary_a, rem * sizeof(T), NRAM2GDRAM); - } - return; - } - - if (select_num > 0) { - int sampled_pts_num_rem = sampled_pts_num - *cnt; - int segnum = min((int)select_num, sampled_pts_num_rem) - 1; - - // copy x to pooled_features_gdram - // The result of __bang_select is composed of three parts: - // The first 4-byte is the number of selected element, whose data type is unsigned int. - // The next 124-byte is zero. The rest bytes are the selected elements. 
- int select_num_size = 128; - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T), - (T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM, - (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); - - // copy y to pooled_features_gdram - __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) + - 1 * sizeof(T), - (T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - - // copy z to pooled_features_gdram - __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) + - 2 * sizeof(T), - (T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - - // copy features to pooled_features_gdram - for (int c_idx = 0; c_idx < feature_in_len; c_idx++) { - __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T), - GDRAM2NRAM); - __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) + - (3 + c_idx) * sizeof(T), - auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - } - } - - // pooled_empty_flag_gdram set 0 - *((int *)auxiliary_a) = 0; - __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM); - - *cnt += select_num; - if (*cnt < sampled_pts_num) { - // duplicate same points for sampling - int repeat = sampled_pts_num / (*cnt) - 1; - int rem = sampled_pts_num % (*cnt); - if (repeat > 0) { - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T), - pooled_features_gdram + 
box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T), - (*cnt) * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM, - (*cnt) * (3 + feature_in_len) * sizeof(T), 0, repeat - 1); - } - if (rem > 0) { - __memcpy( - pooled_features_gdram + - (box_idx * sampled_pts_num + (repeat + 1) * (*cnt)) * (3 + feature_in_len) * - sizeof(T), - pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T), - rem * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM); - } - } -} - -template -__mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward( - const int batch_size, - const int pts_num, - const int boxes_num, - const int feature_in_len, - const int sampled_pts_num, - const char *points_xyz_gdram, - const char *point_features_gdram, - const char *boxes3d_gdram, - char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { - if (coreId == 0x80) { - return; - } - size_t boxes_per_core = (batch_size * boxes_num) / taskDim; - size_t boxes_rem = (batch_size * boxes_num) % taskDim; - // calc batch_start, batch_end, first_batch_box_start, last batch_box_end for each core - int32_t batch_start = taskId < (boxes_rem + 1) ? - (taskId * (boxes_per_core + 1)) / boxes_num : - (taskId * boxes_per_core + boxes_rem) / boxes_num; - int32_t batch_end = taskId < boxes_rem ? - ((taskId + 1) * (boxes_per_core + 1) - 1) / boxes_num : - ((taskId + 1) * boxes_per_core + boxes_rem - 1) / boxes_num; - size_t first_batch_box_start = taskId < (boxes_rem + 1) ? - (taskId * (boxes_per_core + 1)) - batch_start * boxes_num : - taskId * boxes_per_core + boxes_rem - batch_start * boxes_num; - size_t last_batch_box_end = taskId < boxes_rem ? 
- (taskId + 1) * (boxes_per_core + 1) - batch_end * boxes_num : - ((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num; - - // points_xyz : [3, B, N] - const char *points_x_gdram = points_xyz_gdram; - const char *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T); - const char *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T); - - size_t boxes3d_size = PAD_UP(7, NFU_ALIGN_SIZE) * sizeof(T); - size_t span_num_deal = PAD_DOWN(MAX_NRAM_SIZE / TWELVE_SPLIT / sizeof(T), NFU_ALIGN_SIZE); - size_t align_num = NFU_ALIGN_SIZE; - int32_t repeat = pts_num / span_num_deal; - size_t rem = pts_num % span_num_deal; - size_t align_rem = CEIL_ALIGN(rem, align_num); - char *boxes3d = nram_buffer; - char *ping_points_x = nram_buffer + boxes3d_size; - char *ping_points_y = ping_points_x + span_num_deal * sizeof(T); - char *ping_points_z = ping_points_y + span_num_deal * sizeof(T); - size_t ping_pong_gap = 3 * span_num_deal * sizeof(T); - char *auxiliary_a = ping_points_x + 2 * ping_pong_gap; - char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); - char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T); - char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T); - char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T); - char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T); - size_t span_load_input1_size = span_num_deal * sizeof(T); - size_t span_load_input2_size = span_num_deal * sizeof(T); - size_t span_load_input3_size = span_num_deal * sizeof(T); - size_t span_load_input4_size = span_num_deal * sizeof(T); - int cnt = 0; - - for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) { - const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T); - const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T); - const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T); - const char *point_features_start = - point_features_gdram + bs_idx * 
feature_in_len * pts_num * sizeof(T); - char *pooled_features_start = - pooled_features_gdram + - (bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T); - char *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int); - size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0; - size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num; - - for (int box_idx = box_start; box_idx < box_end; box_idx++) { - __memcpy_async(boxes3d, - boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T) + box_idx * 7 * sizeof(T), - 7 * sizeof(T), GDRAM2NRAM); - cnt = 0; - if (repeat > 0) { - __memcpy_async(ping_points_x, points_x_start, span_load_input1_size, GDRAM2NRAM); - __memcpy_async(ping_points_y, points_y_start, span_load_input2_size, GDRAM2NRAM); - __memcpy_async(ping_points_z, points_z_start, span_load_input3_size, GDRAM2NRAM); - __asm__ volatile("sync;"); - } - - for (int i = 0; i < repeat - 1; i++) { - __memcpy_async(ping_points_x + ((i + 1) % 2) * ping_pong_gap, - points_x_start + (i + 1) * span_load_input1_size, span_load_input1_size, - GDRAM2NRAM); - __memcpy_async(ping_points_y + ((i + 1) % 2) * ping_pong_gap, - points_y_start + (i + 1) * span_load_input2_size, span_load_input2_size, - GDRAM2NRAM); - __memcpy_async(ping_points_z + ((i + 1) % 2) * ping_pong_gap, - points_z_start + (i + 1) * span_load_input3_size, span_load_input3_size, - GDRAM2NRAM); - computeStoreRoipointPool3d( - boxes3d, &cnt, ping_points_x + (i % 2) * ping_pong_gap, - ping_points_y + (i % 2) * ping_pong_gap, ping_points_z + (i % 2) * ping_pong_gap, - point_features_start + i * span_load_input4_size, auxiliary_a, auxiliary_b, auxiliary_c, - auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len, - sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start); - __asm__ volatile("sync;"); - } - - if (rem > 0) { - if (sizeof(T) == sizeof(float)) { - __bang_write_value((T *)(ping_points_x + (repeat % 2) 
* ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - __bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - __bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - } else { - __bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - __bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - __bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - } - __memcpy_async(ping_points_x + (repeat % 2) * ping_pong_gap, - points_x_start + repeat * span_load_input1_size, rem * sizeof(T), - GDRAM2NRAM); - __memcpy_async(ping_points_y + (repeat % 2) * ping_pong_gap, - points_y_start + repeat * span_load_input2_size, rem * sizeof(T), - GDRAM2NRAM); - __memcpy_async(ping_points_z + (repeat % 2) * ping_pong_gap, - points_z_start + repeat * span_load_input3_size, rem * sizeof(T), - GDRAM2NRAM); - } - - if (repeat > 0 && rem > 0) { - computeStoreRoipointPool3d( - boxes3d, &cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap, - ping_points_y + ((repeat - 1) % 2) * ping_pong_gap, - ping_points_z + ((repeat - 1) % 2) * ping_pong_gap, - point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b, - auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len, - sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start); - } else if (repeat > 0 && rem == 0) { - computeStoreLastBlockRoipointPool3d( - boxes3d, &cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap, - ping_points_y + ((repeat - 1) % 2) * ping_pong_gap, - ping_points_z 
+ ((repeat - 1) % 2) * ping_pong_gap, - point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b, - auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len, - sampled_pts_num, span_num_deal, span_num_deal, pooled_features_start, - pooled_empty_flag_start); - } - - if (rem > 0) { - __asm__ volatile("sync;"); - computeStoreLastBlockRoipointPool3d( - boxes3d, &cnt, ping_points_x + (repeat % 2) * ping_pong_gap, - ping_points_y + (repeat % 2) * ping_pong_gap, - ping_points_z + (repeat % 2) * ping_pong_gap, - point_features_start + repeat * span_load_input4_size, auxiliary_a, auxiliary_b, - auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len, - sampled_pts_num, align_rem, span_num_deal, pooled_features_start, - pooled_empty_flag_start); - } - } - } -} - -template __mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward( - const int batch_size, - const int pts_num, - const int boxes_num, - const int feature_in_len, - const int sampled_pts_num, - const char *points_xyz_gdram, - const char *point_features_gdram, - const char *boxes3d_gdram, - char *pooled_features_gdram, - char *pooled_empty_flag_gdram); - -template __mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward( - const int batch_size, - const int pts_num, - const int boxes_num, - const int feature_in_len, - const int sampled_pts_num, - const char *points_xyz_gdram, - const char *point_features_gdram, - const char *boxes3d_gdram, - char *pooled_features_gdram, - char *pooled_empty_flag_gdram); - -void KernelRoiPointPool3dLargeBoxesNumForward(cnrtDim3_t k_dim, - cnrtFunctionType_t k_type, - cnrtQueue_t queue, - const cnrtDataType_t d_type, - const int batch_size, - const int pts_num, - const int boxes_num, - const int feature_in_len, - const int sampled_pts_num, - const void *points_xyz, - const void *boxes3d, - const void *point_features, - void *pooled_features, - int *pooled_empty_flag) { - 
switch (d_type) { - default: { break; } - case CNRT_FLOAT32: { - MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<<>>( - batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, - (char *)points_xyz, (char *)point_features, (char *)boxes3d, - (char *)pooled_features, (char *)pooled_empty_flag); - }; break; - case CNRT_FLOAT16: { - MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<<>>( - batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, - (char *)points_xyz, (char *)point_features, (char *)boxes3d, - (char *)pooled_features, (char *)pooled_empty_flag); - }; break; - } -} diff --git a/mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu deleted file mode 100644 index f16d840..0000000 --- a/mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu +++ /dev/null @@ -1,544 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * OR IMPLIED, INCLUDING BUvoid NOKType LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENvoid SHALL THE AUTHORS OR COPYRIGHKType HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORvoid OR OTHERWISE, ARISING FROM, OUKType OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ - -#include "common_mlu_helper.hpp" - -/************************************************************************************** - * - * NRAM partition: - * | boxes3d | cnt | - * | boxes_num * 7 * sizeof(T) | boxes_num * sizeof(int) | - * - * | ping points | pong points | aux_a ~ aux_f | - * | 3 * deal_num * sizeof(T) | 3 * deal_num * sizeof(T) | 6 * deal_num * sizeof(T) | - * - ***************************************************************************************/ -#define TWELVE_SPLIT 12 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template -__mlu_func__ void checkPointsInBox3d(const T *boxes3d, - const size_t deal_num, - T *x, - T *y, - T *z, - T *auxiliary_a, - T *auxiliary_b, - T *auxiliary_c, - T *auxiliary_d, - T *auxiliary_e, - T *auxiliary_f, - T *pts_assign) { - // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate - T cx = boxes3d[0]; - T cy = boxes3d[1]; - T cz = boxes3d[2]; - T dx = boxes3d[3]; - T dy = boxes3d[4]; - T dz = boxes3d[5]; - T rz = boxes3d[6]; - // shift to the center since cz in box3d is the bottom center - cz += 0.5 * dz; - - T cosa = (T)std::cos(-rz); - T sina = (T)std::sin(-rz); - - // x - cx - __bang_sub_scalar((T *)auxiliary_a, (T *)x, (T)cx, deal_num); - // y - cy - __bang_sub_scalar((T *)auxiliary_b, (T *)y, (T)cy, deal_num); - // z - cz - __bang_sub_scalar((T *)auxiliary_c, (T *)z, (T)cz, deal_num); - // |z - cz| - __bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num); - // |z - cz| > dz / 2.0 -#if __BANG_ARCH__ >= 322 - __bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num); -#else - __bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz)); - __bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num); -#endif - // !(|z - cz| > dz / 2.0) - __bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num); - // (x - cx) * cos(-rz) - __bang_mul_scalar((T *)auxiliary_d, (T *)auxiliary_a, (T)cosa, 
deal_num); - // (y - cy) * sin(-rz) - __bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_b, (T)sina, deal_num); - // local_x = (x - cx) * cos(-rz) + (y - cy) * -sin(-rz) - __bang_sub((T *)auxiliary_d, (T *)auxiliary_d, (T *)auxiliary_e, deal_num); - // |local_x| - __bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num); - // |local_x| < dx / 2.0 -#if __BANG_ARCH__ >= 322 - __bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num); -#else - __bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx)); - __bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num); -#endif - // (x - cx) * sin(-rz) - __bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num); - // (y - cy) * cos(-rz) - __bang_mul_scalar((T *)auxiliary_f, (T *)auxiliary_b, (T)cosa, deal_num); - // local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz) - __bang_add((T *)auxiliary_e, (T *)auxiliary_e, (T *)auxiliary_f, deal_num); - // |local_y| - __bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num); - // |local_y| < dy / 2.0 -#if __BANG_ARCH__ >= 322 - __bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num); -#else - __bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy)); - __bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num); -#endif - // pts_assign = |x - cx| < dx / 2.0 && |y - cy| < dy / 2.0 && |z - cz| <= dz / 2.0 - __bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num); - __bang_mul((T *)pts_assign, (T *)pts_assign, (T *)auxiliary_e, deal_num); -} - -template -__mlu_func__ void computeStoreRoipointPool3d(char *boxes3d, - int *cnt, - char *points_x, - char *points_y, - char *points_z, - const char *point_features, - char *auxiliary_a, - char *auxiliary_b, - char *auxiliary_c, - char *auxiliary_d, - char *auxiliary_e, - char *auxiliary_f, - const int box_idx, - const int pts_num, - const int feature_in_len, - const int sampled_pts_num, - const size_t span_num_deal, - char 
*pooled_features_gdram, - char *pooled_empty_flag_gdram) { - char *pts_assign = auxiliary_a; - if (cnt[box_idx] >= sampled_pts_num) { - return; - } - checkPointsInBox3d((T *)(boxes3d + box_idx * 7 * sizeof(T)), span_num_deal, (T *)points_x, - (T *)points_y, (T *)points_z, (T *)auxiliary_a, (T *)auxiliary_b, - (T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_f, - (T *)pts_assign); - - // __bang_select returns selected elements vector and the number of selected elements - __bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal); - uint32_t select_num = *((uint32_t *)auxiliary_b); - - if (select_num == 0) { - return; - } - int sampled_pts_num_rem = sampled_pts_num - cnt[box_idx]; - int segnum = min((int)select_num, sampled_pts_num_rem) - 1; - - // copy x to pooled_features_gdram - // The result of __bang_select is composed of three parts: - // The first 4-byte is the number of selected element, whose data type is unsigned int. - // The next 124-byte is zero. The rest bytes are the selected elements. 
- int select_num_size = 128; - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T), - (T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM, - (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); - - // copy y to pooled_features_gdram - __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) + - 1 * sizeof(T), - (T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - - // copy z to pooled_features_gdram - __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) + - 2 * sizeof(T), - (T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - - // copy features to pooled_features_gdram - for (int c_idx = 0; c_idx < feature_in_len; c_idx++) { - __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T), - GDRAM2NRAM); - __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) + - (3 + c_idx) * sizeof(T), - auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - } - - cnt[box_idx] += select_num; -} - -template -__mlu_func__ void computeStoreLastBlockRoipointPool3d(char *boxes3d, - int *cnt, - char *points_x, - char *points_y, - char *points_z, - const char *point_features, - char *auxiliary_a, - char *auxiliary_b, - char *auxiliary_c, - char *auxiliary_d, - char *auxiliary_e, - char *auxiliary_f, - const int box_idx, - const int pts_num, - const int feature_in_len, - const int sampled_pts_num, - const size_t span_num_deal, - 
const size_t auxiliary_num_deal, - char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { - char *pts_assign = auxiliary_a; - if (cnt[box_idx] >= sampled_pts_num) { - // pooled_empty_flag_gdram set 0 - *((int *)auxiliary_a) = 0; - __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM); - return; - } - checkPointsInBox3d((T *)(boxes3d + box_idx * 7 * sizeof(T)), span_num_deal, (T *)points_x, - (T *)points_y, (T *)points_z, (T *)auxiliary_a, (T *)auxiliary_b, - (T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_f, - (T *)pts_assign); - - // __bang_select returns selected elements vector and the number of selected elements - __bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal); - uint32_t select_num = *((uint32_t *)auxiliary_b); - - if (cnt[box_idx] + select_num == 0) { - // pooled_empty_flag_gdram set 1 - *((int *)auxiliary_a) = 1; - __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM); - - // pooled_features_gdram set 0 - int repeat = (sampled_pts_num * (3 + feature_in_len)) / (auxiliary_num_deal * 6); - int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6); - // use auxiliary_a to auxiliary_f - __bang_write_zero((T *)auxiliary_a, PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE)); - if (repeat > 0) { - __memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T), - auxiliary_a, auxiliary_num_deal * 6 * sizeof(T), NRAM2GDRAM, - auxiliary_num_deal * 6 * sizeof(T), 0, repeat - 1); - } - if (rem > 0) { - __memcpy(pooled_features_gdram + - box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T) + - repeat * auxiliary_num_deal * 6 * sizeof(T), - auxiliary_a, rem * sizeof(T), NRAM2GDRAM); - } - return; - } - - if (select_num > 0) { - int sampled_pts_num_rem = sampled_pts_num - cnt[box_idx]; - int segnum = min((int)select_num, sampled_pts_num_rem) - 1; - - // copy x to 
pooled_features_gdram - // The result of __bang_select is composed of three parts: - // The first 4-byte is the number of selected element, whose data type is unsigned int. - // The next 124-byte is zero. The rest bytes are the selected elements. - int select_num_size = 128; - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T), - (T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM, - (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); - - // copy y to pooled_features_gdram - __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) + - 1 * sizeof(T), - (T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - - // copy z to pooled_features_gdram - __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) + - 2 * sizeof(T), - (T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - - // copy features to pooled_features_gdram - for (int c_idx = 0; c_idx < feature_in_len; c_idx++) { - __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T), - GDRAM2NRAM); - __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal); - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) + - (3 + c_idx) * sizeof(T), - auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), - segnum); - } - } - - // pooled_empty_flag_gdram set 0 - *((int *)auxiliary_a) = 0; - __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM); - - cnt[box_idx] += select_num; - if (cnt[box_idx] < 
sampled_pts_num) { - // duplicate same points for sampling - int repeat = sampled_pts_num / cnt[box_idx] - 1; - int rem = sampled_pts_num % cnt[box_idx]; - if (repeat > 0) { - __memcpy(pooled_features_gdram + - (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T), - pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T), - cnt[box_idx] * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM, - cnt[box_idx] * (3 + feature_in_len) * sizeof(T), 0, repeat - 1); - } - if (rem > 0) { - __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + (repeat + 1) * cnt[box_idx]) * - (3 + feature_in_len) * sizeof(T), - pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T), - rem * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM); - } - } -} - -template -__mlu_global__ void MLUUnion1KernelRoiPointPool3dForward( - const int batch_size, - const int pts_num, - const int boxes_num, - const int feature_in_len, - const int sampled_pts_num, - const char *points_xyz_gdram, - const char *point_features_gdram, - const char *boxes3d_gdram, - char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { - if (coreId == 0x80) { - return; - } - size_t boxes_per_core = (batch_size * boxes_num) / taskDim; - size_t boxes_rem = (batch_size * boxes_num) % taskDim; - // calc batch_start, batch_end, first_batch_box_start, last batch_box_end for each core - int32_t batch_start = taskId < (boxes_rem + 1) ? - (taskId * (boxes_per_core + 1)) / boxes_num : - (taskId * boxes_per_core + boxes_rem) / boxes_num; - int32_t batch_end = taskId < boxes_rem ? - ((taskId + 1) * (boxes_per_core + 1) - 1) / boxes_num : - ((taskId + 1) * boxes_per_core + boxes_rem - 1) / boxes_num; - size_t first_batch_box_start = taskId < (boxes_rem + 1) ? - (taskId * (boxes_per_core + 1)) - batch_start * boxes_num : - taskId * boxes_per_core + boxes_rem - batch_start * boxes_num; - size_t last_batch_box_end = taskId < boxes_rem ? 
- (taskId + 1) * (boxes_per_core + 1) - batch_end * boxes_num : - ((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num; - - // points_xyz : [3, B, N] - const char *points_x_gdram = points_xyz_gdram; - const char *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T); - const char *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T); - - size_t boxes3d_size = PAD_UP(boxes_num * 7, NFU_ALIGN_SIZE) * sizeof(T); - size_t cnt_size = PAD_UP(boxes_num, NFU_ALIGN_SIZE) * sizeof(int); - size_t span_num_deal = PAD_DOWN( - (MAX_NRAM_SIZE - boxes3d_size - cnt_size) / TWELVE_SPLIT / sizeof(T), NFU_ALIGN_SIZE); - size_t align_num = NFU_ALIGN_SIZE; - int32_t repeat = pts_num / span_num_deal; - size_t rem = pts_num % span_num_deal; - size_t align_rem = CEIL_ALIGN(rem, align_num); - char *boxes3d = nram_buffer; - char *cnt = nram_buffer + boxes3d_size; - char *ping_points_x = cnt + cnt_size; - char *ping_points_y = ping_points_x + span_num_deal * sizeof(T); - char *ping_points_z = ping_points_y + span_num_deal * sizeof(T); - size_t ping_pong_gap = 3 * span_num_deal * sizeof(T); - char *auxiliary_a = ping_points_x + 2 * ping_pong_gap; - char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); - char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T); - char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T); - char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T); - char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T); - size_t span_load_input1_size = span_num_deal * sizeof(T); - size_t span_load_input2_size = span_num_deal * sizeof(T); - size_t span_load_input3_size = span_num_deal * sizeof(T); - size_t span_load_input4_size = span_num_deal * sizeof(T); - - for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) { - __memcpy_async(boxes3d, boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T), - boxes_num * 7 * sizeof(T), GDRAM2NRAM); - __bang_write_zero((int *)cnt, PAD_UP(boxes_num, 
NFU_ALIGN_SIZE)); - - const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T); - const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T); - const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T); - const char *point_features_start = - point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T); - char *pooled_features_start = - pooled_features_gdram + - (bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T); - char *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int); - size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0; - size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num; - - if (repeat > 0) { - __memcpy_async(ping_points_x, points_x_start, span_load_input1_size, GDRAM2NRAM); - __memcpy_async(ping_points_y, points_y_start, span_load_input2_size, GDRAM2NRAM); - __memcpy_async(ping_points_z, points_z_start, span_load_input3_size, GDRAM2NRAM); - __asm__ volatile("sync;"); - } - - for (int i = 0; i < repeat - 1; i++) { - __memcpy_async(ping_points_x + ((i + 1) % 2) * ping_pong_gap, - points_x_start + (i + 1) * span_load_input1_size, span_load_input1_size, - GDRAM2NRAM); - __memcpy_async(ping_points_y + ((i + 1) % 2) * ping_pong_gap, - points_y_start + (i + 1) * span_load_input2_size, span_load_input2_size, - GDRAM2NRAM); - __memcpy_async(ping_points_z + ((i + 1) % 2) * ping_pong_gap, - points_z_start + (i + 1) * span_load_input3_size, span_load_input3_size, - GDRAM2NRAM); - for (int box_idx = box_start; box_idx < box_end; box_idx++) { - computeStoreRoipointPool3d( - boxes3d, (int *)cnt, ping_points_x + (i % 2) * ping_pong_gap, - ping_points_y + (i % 2) * ping_pong_gap, ping_points_z + (i % 2) * ping_pong_gap, - point_features_start + i * span_load_input4_size, auxiliary_a, auxiliary_b, auxiliary_c, - auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len, - sampled_pts_num, span_num_deal, 
pooled_features_start, pooled_empty_flag_start); - } - __asm__ volatile("sync;"); - } - - if (rem > 0) { - if (sizeof(T) == sizeof(float)) { - __bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - __bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - __bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - } else { - __bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - __bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - __bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap + - PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)), - NFU_ALIGN_SIZE, (T)NAN); - } - __memcpy_async(ping_points_x + (repeat % 2) * ping_pong_gap, - points_x_start + repeat * span_load_input1_size, rem * sizeof(T), GDRAM2NRAM); - __memcpy_async(ping_points_y + (repeat % 2) * ping_pong_gap, - points_y_start + repeat * span_load_input2_size, rem * sizeof(T), GDRAM2NRAM); - __memcpy_async(ping_points_z + (repeat % 2) * ping_pong_gap, - points_z_start + repeat * span_load_input3_size, rem * sizeof(T), GDRAM2NRAM); - } - - if (repeat > 0 && rem > 0) { - for (int box_idx = box_start; box_idx < box_end; box_idx++) { - computeStoreRoipointPool3d( - boxes3d, (int *)cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap, - ping_points_y + ((repeat - 1) % 2) * ping_pong_gap, - ping_points_z + ((repeat - 1) % 2) * ping_pong_gap, - point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b, - auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len, - sampled_pts_num, span_num_deal, 
pooled_features_start, pooled_empty_flag_start); - } - } else if (repeat > 0 && rem == 0) { - for (int box_idx = box_start; box_idx < box_end; box_idx++) { - computeStoreLastBlockRoipointPool3d( - boxes3d, (int *)cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap, - ping_points_y + ((repeat - 1) % 2) * ping_pong_gap, - ping_points_z + ((repeat - 1) % 2) * ping_pong_gap, - point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b, - auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len, - sampled_pts_num, span_num_deal, span_num_deal, pooled_features_start, - pooled_empty_flag_start); - } - } - - if (rem > 0) { - __asm__ volatile("sync;"); - for (int box_idx = box_start; box_idx < box_end; box_idx++) { - computeStoreLastBlockRoipointPool3d( - boxes3d, (int *)cnt, ping_points_x + (repeat % 2) * ping_pong_gap, - ping_points_y + (repeat % 2) * ping_pong_gap, - ping_points_z + (repeat % 2) * ping_pong_gap, - point_features_start + repeat * span_load_input4_size, auxiliary_a, auxiliary_b, - auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len, - sampled_pts_num, align_rem, span_num_deal, pooled_features_start, - pooled_empty_flag_start); - } - } - } -} - -template __mlu_global__ void MLUUnion1KernelRoiPointPool3dForward( - const int batch_size, - const int pts_num, - const int boxes_num, - const int feature_in_len, - const int sampled_pts_num, - const char *points_xyz_gdram, - const char *point_features_gdram, - const char *boxes3d_gdram, - char *pooled_features_gdram, - char *pooled_empty_flag_gdram); - -template __mlu_global__ void MLUUnion1KernelRoiPointPool3dForward( - const int batch_size, - const int pts_num, - const int boxes_num, - const int feature_in_len, - const int sampled_pts_num, - const char *points_xyz_gdram, - const char *point_features_gdram, - const char *boxes3d_gdram, - char *pooled_features_gdram, - char *pooled_empty_flag_gdram); - -void 
KernelRoiPointPool3dForward(cnrtDim3_t k_dim, - cnrtFunctionType_t k_type, - cnrtQueue_t queue, - const cnrtDataType_t d_type, - const int batch_size, - const int pts_num, - const int boxes_num, - const int feature_in_len, - const int sampled_pts_num, - const void *points_xyz, - const void *boxes3d, - const void *point_features, - void *pooled_features, - int *pooled_empty_flag) { - switch (d_type) { - default: { break; } - case CNRT_FLOAT32: { - MLUUnion1KernelRoiPointPool3dForward<<>>( - batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, - (char *)points_xyz, (char *)point_features, (char *)boxes3d, - (char *)pooled_features, (char *)pooled_empty_flag); - }; break; - case CNRT_FLOAT16: { - MLUUnion1KernelRoiPointPool3dForward<<>>( - batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, - (char *)points_xyz, (char *)point_features, (char *)boxes3d, - (char *)pooled_features, (char *)pooled_empty_flag); - }; break; - } -} diff --git a/mmcv/ops/csrc/common/mlu/three_nn_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/three_nn_mlu_kernel.mlu deleted file mode 100644 index 7927385..0000000 --- a/mmcv/ops/csrc/common/mlu/three_nn_mlu_kernel.mlu +++ /dev/null @@ -1,466 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "common_mlu_helper.hpp" -#include - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -#if __BANG_ARCH__ >= 322 -/** - * returns the index of ret, which is stored at the 1st position of the `ret`, - * used after bang_min - */ -__mlu_func__ uint32_t getIndice(half *ret) { - uint32_t indice = *((uint32_t *)((uint16_t *)ret + 1)); - return indice; -} - -/** - * returns the index of ret, which is stored at the 1st position of the `ret`, - * used after bang_min - */ -__mlu_func__ uint32_t getIndice(float *ret) { - uint32_t indice = ((uint32_t *)ret)[1]; - return indice; -} -#endif - -template -__mlu_func__ void auxArgmin(T *nram_dst, T *nram_src, const int num_deal, - T *value, int *index) { - __bang_min(nram_dst, nram_src, num_deal); - *value = nram_dst[0]; - __bang_write_value(nram_dst, num_deal, *value); - __bang_eq(nram_dst, nram_src, nram_dst, num_deal); - __bang_findfirst1((uint32_t *)nram_dst, nram_dst, num_deal); - *index = *((int *)nram_dst); -} - -template -__mlu_func__ void auxFuncFind3Min(T *nram_aux_a, const int auxa_offset, - int *nram_aux_b, const int auxb_offset, - T *nram_dest, T *nram_aux_sort_a, - int *nram_aux_sort_b, const int deal_offset) { - __bang_write_value(nram_aux_sort_a, auxa_offset, (T)(INFINITY)); - __bang_write_value(nram_aux_sort_b, auxb_offset, (int)0); - int index = 0; - for (int i = 0; i < 3; i++) { -#if __BANG_ARCH__ >= 322 - __bang_argmin(nram_dest, nram_aux_a, auxa_offset); - nram_aux_sort_a[i] = nram_dest[0]; - index = getIndice(nram_dest); -#else - T value = 0; - auxArgmin(nram_dest, nram_aux_a, auxa_offset, &value, &index); - nram_aux_sort_a[i] = value; -#endif - nram_aux_sort_b[i] = nram_aux_b[index]; - __memset_nram(nram_aux_a + index, 1, (T)(INFINITY)); - } - __memcpy((char *)nram_aux_a, (char *)nram_aux_sort_a, auxa_offset * sizeof(T), - NRAM2NRAM); - __memcpy((char *)nram_aux_b, (char *)nram_aux_sort_b, - auxb_offset * sizeof(int), NRAM2NRAM); 
-} - -template -__mlu_func__ void auxFuncSort(T *nram_aux_a, const int auxa_offset, - int *nram_aux_b, const int auxb_offset, - T *nram_dest, T *nram_help_value, - int *nram_help_idx, const int num_deal, - const int deal_offset) { - for (int k = 0; k < num_deal; ++k) { - auxFuncFind3Min(nram_aux_a + k * auxa_offset, auxa_offset, - nram_aux_b + k * auxb_offset, auxb_offset, nram_dest, - nram_help_value, nram_help_idx, deal_offset); - } -} - -template -__mlu_func__ void auxFuncNN( - size_t *output_aux_sort_a_gap, size_t *output_aux_sort_b_gap, - size_t *output_aux_dest_gap, size_t *output_unknown_gap, - size_t *output_known_gap, size_t *output_dist_gap, size_t *auxillary_a_gap, - size_t *auxillary_b_gap, size_t *known_num_deal, size_t *unknown_num_deal, - size_t *align_num, size_t *auxa_offset, size_t *auxb_offset) { - /* - * nram partition: - * |-NFU_ALIGN_SIZE-|-2*NFU_ALIGN_SIZE-|-X*3*sizeof(T)-| - * space: | aux_sort_a | aux_sort_b | nram_unknown | - * - * | ------ (Y * 7 *sizeof(T)) ---------------- | - * | nram_known | nram_dist | nram_dest | - * - * | -X * NFU_ALIGN_SIZE ---|---X * 2 * NFU_ALIGN_SIZE-| - * | output_dist(aux_a) | output_dist(aux_b) | - * 200 series - * X = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (2/3) / (3 * sizeof(T) + 3 * - * NFU_ALIGN_SIZE) - * Y = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (1/3) / (7 * sizeof(T)) - * 300 series - * X = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (4/5) / (3 * - * sizeof(T) + 3 * NFU_ALIGN_SIZE) - * Y = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * - * (1/5) / (7 * sizeof(T)) - * - */ - - *align_num = NFU_ALIGN_SIZE / sizeof(T); - *auxa_offset = NFU_ALIGN_SIZE / sizeof(T); - *auxb_offset = 2 * NFU_ALIGN_SIZE / sizeof(int); -#if __BANG_ARCH__ >= 322 - *known_num_deal = PAD_DOWN( - (MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 / (7 * sizeof(T)), *align_num); - *unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 * 4 / - (3 * sizeof(T) + 3 * NFU_ALIGN_SIZE), - *align_num); -#else - *known_num_deal = PAD_DOWN( - (MAX_NRAM_SIZE - 3 * 
NFU_ALIGN_SIZE) / 3 / (7 * sizeof(T)), *align_num); - *unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 3 * 2 / - (3 * sizeof(T) + 3 * NFU_ALIGN_SIZE), - *align_num); -#endif - - *output_aux_sort_a_gap = 0; - *output_aux_sort_b_gap = *output_aux_sort_a_gap + NFU_ALIGN_SIZE; - *output_aux_dest_gap = *output_aux_sort_b_gap + 2 * NFU_ALIGN_SIZE; - - *output_unknown_gap = *output_aux_dest_gap + *known_num_deal * sizeof(T); - *output_known_gap = *output_unknown_gap + *unknown_num_deal * 3 * sizeof(T); - *output_dist_gap = *output_known_gap + *known_num_deal * 3 * sizeof(T); - *auxillary_a_gap = *output_dist_gap + *known_num_deal * 3 * sizeof(T); - *auxillary_b_gap = *auxillary_a_gap + *unknown_num_deal * NFU_ALIGN_SIZE; -} - -#if __BANG_ARCH__ >= 322 -template -__mlu_func__ bool containNanInf(T *nram_unknown) { - if (std::isnan(nram_unknown[0]) || std::isnan(nram_unknown[1]) || - std::isnan(nram_unknown[2]) || std::isinf(nram_unknown[0]) || - std::isinf(nram_unknown[1]) || std::isinf(nram_unknown[2])) - return true; - else - return false; -} -#endif - -template -__mlu_func__ void computeThreeNN(T *nram_unknown, T *nram_known, T *nram_dist, - T *nram_dest, T *nram_aux_a, - T *nram_aux_sort_a, int *nram_aux_b, - int *nram_aux_sort_b, const int known_num_deal, - const int known_seg_num, const int deal_offset, - const int known_count, - const int known_count_align) { - __bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY)); -#if __BANG_ARCH__ >= 322 - if (!containNanInf(nram_unknown)) { -#endif - // x1 - x2 - __bang_sub_scalar(nram_dist, nram_known, nram_unknown[0], - known_count_align); - // y1 - y2 - __bang_sub_scalar(nram_dist + known_count_align, - nram_known + known_count_align, nram_unknown[1], - known_count_align); - // z1 - z2 - __bang_sub_scalar(nram_dist + 2 * known_count_align, - nram_known + 2 * known_count_align, nram_unknown[2], - known_count_align); - __bang_square(nram_dist, nram_dist, 3 * known_count_align); - 
__bang_add(nram_dist, nram_dist, nram_dist + known_count_align, - known_count_align); - __bang_add(nram_dist, nram_dist, nram_dist + 2 * known_count_align, - known_count_align); -#if __BANG_ARCH__ >= 322 - } -#endif - - int index = 0; - for (int i = 0; i < 3; i++) { -#if __BANG_ARCH__ >= 322 - __bang_argmin(nram_dest, nram_dist, known_count_align); - nram_aux_a[i + deal_offset] = nram_dest[0]; - index = getIndice(nram_dest); -#else - T value = 0; - auxArgmin(nram_dest, nram_dist, known_count_align, &value, &index); - nram_aux_a[i + deal_offset] = value; -#endif - nram_aux_b[i + deal_offset] = index + known_seg_num * known_num_deal; - __memset_nram(nram_dist + index, 1, (T)(INFINITY)); - } -} - -template -__mlu_func__ void loadTransposedKnownTensor( - char *nram_known, char *nram_dist, const char *known_gdram, - const int known_num_deal, const int batch_id, const int m, - const int known_seg_num, const int count, const int count_align_num) { - __bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY)); -#if __BANG_ARCH__ >= 322 - __bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY)); - __memcpy(nram_dist, - known_gdram + - (batch_id * m * 3 + known_seg_num * known_num_deal) * sizeof(T), - count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T), - m * sizeof(T), 2); - __bang_minequal((T *)nram_known, (T *)nram_known, (T *)nram_dist, - 3 * count_align_num); -#else - __memcpy(nram_known, - known_gdram + - (batch_id * m * 3 + known_seg_num * known_num_deal) * sizeof(T), - count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T), - m * sizeof(T), 2); -#endif -} - -template -__mlu_func__ void loadUnknownTensor(char *nram_unknown, - const char *unknown_gdram, - const int unknown_num_deal, - const int unknown_seg_num, const int count, - const int count_align_num) { - __memcpy(nram_unknown, - unknown_gdram + unknown_seg_num * unknown_num_deal * 3 * sizeof(T), - count * 3 * sizeof(T), GDRAM2NRAM); -} - -template -__mlu_func__ void auxProcessSegment( - 
const int m, const int n, T *nram_unknown, T *nram_known, T *nram_dist, - T *nram_dest, T *known_gdram, T *nram_aux_a, const int auxa_offset, - int *nram_aux_b, const int auxb_offset, T *nram_aux_sort_a, - int *nram_aux_sort_b, const int unknown_num_deal, const int known_num_deal, - const int known_seg_num, const int unknown_seg_num, const int unknown_count, - const int known_count, const int known_count_align, const int start_idx, - int *deal_offset) { - int pre_batch_id = -1; - int cur_batch_id = -1; - pre_batch_id = start_idx / n; - - // if aux_a space is not enough, get the first 3 min among aux_a and clear. - if (*deal_offset >= PAD_DOWN(auxa_offset, 3)) { - auxFuncSort(nram_aux_a, auxa_offset, nram_aux_b, auxb_offset, nram_dest, - nram_aux_sort_a, nram_aux_sort_b, unknown_count, *deal_offset); - *deal_offset = 3; - } - - // load i'th segment of known batch data. - loadTransposedKnownTensor((char *)nram_known, (char *)nram_dist, - (char *)known_gdram, known_num_deal, - pre_batch_id, m, known_seg_num, known_count, - known_count_align); - - for (int k = 0; k < unknown_count; ++k) { - cur_batch_id = (start_idx + k) / n; - if (cur_batch_id != pre_batch_id) { // if batch id of unknown data changed, - // load corresponding known batch data - pre_batch_id = cur_batch_id; - loadTransposedKnownTensor((char *)nram_known, (char *)nram_dist, - (char *)known_gdram, known_num_deal, - pre_batch_id, m, known_seg_num, known_count, - known_count_align); - } - computeThreeNN(nram_unknown + 3 * k, nram_known, nram_dist, nram_dest, - nram_aux_a + k * auxa_offset, nram_aux_sort_a, - nram_aux_b + k * auxb_offset, nram_aux_sort_b, - known_num_deal, known_seg_num, *deal_offset, known_count, - known_count_align); - } -} - -template -__mlu_global__ void MLUUnion1KernelThreeNN(const int b, const int n, - const int m, char *unknown_gdram, - char *known_gdram, char *dist2_gdram, - int *idx_gdram) { - if (coreId == 0x80) { - return; - } - - size_t output_aux_sort_a_gap = 0, 
output_aux_sort_b_gap = 0, - output_dest_gap = 0, output_unknown_gap = 0, output_known_gap = 0, - output_dist_gap = 0, auxillary_a_gap = 0, auxillary_b_gap = 0, - known_num_deal = 0, unknown_num_deal = 0, align_num = 0, - auxa_offset = 0, auxb_offset = 0; - auxFuncNN(&output_aux_sort_a_gap, &output_aux_sort_b_gap, &output_dest_gap, - &output_unknown_gap, &output_known_gap, &output_dist_gap, - &auxillary_a_gap, &auxillary_b_gap, &known_num_deal, - &unknown_num_deal, &align_num, &auxa_offset, &auxb_offset); - - int num_per_core = b * n / taskDim; - const int core_offset = num_per_core; - - char *unknown_gdram_start = - unknown_gdram + taskId * 3 * core_offset * sizeof(T); - char *known_gdram_start = known_gdram; - char *output_dist_start = dist2_gdram + taskId * 3 * core_offset * sizeof(T); - int *output_idx_start = idx_gdram + taskId * 3 * core_offset; - - const int rem = (b * n) % taskDim; - if (taskId == taskDim - 1) { - num_per_core += rem; - } - - const int unknown_repeat = - num_per_core / unknown_num_deal; // if unknown number is big, process it - // by unknown_repeat times. - const int unknown_rem = num_per_core % unknown_num_deal; // unknown reminder - const int unknown_rem_align = PAD_UP(unknown_rem, align_num); - - const int known_repeat = - m / known_num_deal; // if known number is big, process it by - // unknown_repeat times. 
- const int known_rem = m % known_num_deal; // known reminder - const int known_rem_align = PAD_UP(known_rem, align_num); - - char *nram_aux_sort_a = nram_buffer; - int *nram_aux_sort_b = (int *)(nram_buffer + output_aux_sort_b_gap); - char *nram_dest = nram_buffer + output_dest_gap; - char *nram_unknown = nram_buffer + output_unknown_gap; - char *nram_known = nram_buffer + output_known_gap; - char *nram_dist = nram_buffer + output_dist_gap; - char *nram_aux_a = nram_buffer + auxillary_a_gap; - int *nram_aux_b = (int *)(nram_buffer + auxillary_b_gap); - int deal_offset = 0; - int start_idx = -1; - - for (int j = 0; j < unknown_repeat; - ++j) { // process data within a unknown_repeat - // if unknown need to be process segmentally, use a aux_a and aux_b - // space to find first 3 minimum dist. - __bang_write_value(nram_aux_a, unknown_num_deal * auxa_offset, - (T)(INFINITY)); - __bang_write_value(nram_aux_b, unknown_num_deal * auxb_offset, (int)0); - loadUnknownTensor(nram_unknown, unknown_gdram_start, unknown_num_deal, j, - unknown_num_deal, unknown_num_deal); - - deal_offset = 0; - start_idx = taskId * core_offset + j * unknown_num_deal; - - for (int i = 0; i < known_repeat; - ++i) { // process known data in segmentally. 
- auxProcessSegment( - m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist, - (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset, - nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b, - unknown_num_deal, known_num_deal, i, j, unknown_num_deal, - known_num_deal, known_num_deal, start_idx, &deal_offset); - deal_offset += 3; - } - - if (known_rem > 0) { // process known rem - __bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY)); - auxProcessSegment( - m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist, - (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset, - nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b, - unknown_num_deal, known_num_deal, known_repeat, j, unknown_num_deal, - known_rem, known_rem_align, start_idx, &deal_offset); - } - - deal_offset += 3; - - if (deal_offset > 3) { - auxFuncSort((T *)nram_aux_a, auxa_offset, nram_aux_b, auxb_offset, - (T *)nram_dest, (T *)nram_aux_sort_a, nram_aux_sort_b, - unknown_num_deal, deal_offset); - deal_offset = 0; - } - - __memcpy((char *)output_dist_start + j * unknown_num_deal * 3 * sizeof(T), - (char *)nram_aux_a, 3 * sizeof(T), NRAM2GDRAM, 3 * sizeof(T), - auxa_offset * sizeof(T), unknown_num_deal - 1); - __memcpy((char *)output_idx_start + j * unknown_num_deal * 3 * sizeof(int), - (char *)nram_aux_b, 3 * sizeof(int), NRAM2GDRAM, 3 * sizeof(int), - auxb_offset * sizeof(int), unknown_num_deal - 1); - } - - if (unknown_rem > 0) { // process unknown rem - deal_offset = 0; - __bang_write_value(nram_aux_a, unknown_num_deal * auxa_offset, - (T)(INFINITY)); - __bang_write_value(nram_aux_b, unknown_num_deal * auxb_offset, (int)0); - loadUnknownTensor(nram_unknown, unknown_gdram_start, unknown_num_deal, - unknown_repeat, unknown_rem, unknown_rem_align); - start_idx = taskId * core_offset + unknown_repeat * unknown_num_deal; - - for (int i = 0; i < known_repeat; ++i) { - auxProcessSegment( - m, n, (T *)nram_unknown, (T *)nram_known, (T 
*)nram_dist, - (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset, - nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b, - unknown_num_deal, known_num_deal, i, unknown_repeat, unknown_rem, - known_num_deal, known_num_deal, start_idx, &deal_offset); - deal_offset += 3; - } - - if (known_rem > 0) { - __bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY)); - start_idx = taskId * core_offset + unknown_repeat * unknown_num_deal; - - auxProcessSegment( - m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist, - (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset, - nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b, - unknown_num_deal, known_num_deal, known_repeat, unknown_repeat, - unknown_rem, known_rem, known_rem_align, start_idx, &deal_offset); - - deal_offset += 3; - } - if (deal_offset > 3) { - auxFuncSort((T *)nram_aux_a, auxa_offset, nram_aux_b, auxb_offset, - (T *)nram_dest, (T *)nram_aux_sort_a, nram_aux_sort_b, - unknown_rem, deal_offset); - deal_offset = 0; - } - - __memcpy((char *)output_dist_start + - unknown_repeat * unknown_num_deal * 3 * sizeof(T), - (char *)nram_aux_a, 3 * sizeof(T), NRAM2GDRAM, 3 * sizeof(T), - auxa_offset * sizeof(T), unknown_rem - 1); - __memcpy((char *)output_idx_start + - unknown_repeat * unknown_num_deal * 3 * sizeof(int), - (char *)nram_aux_b, 3 * sizeof(int), NRAM2GDRAM, 3 * sizeof(int), - auxb_offset * sizeof(int), unknown_rem - 1); - } -} - -template __mlu_global__ void MLUUnion1KernelThreeNN( - const int b, const int n, const int m, char *unknown_gdram, - char *known_gdram, char *dist2_gdram, int *idx_gdram); - -template __mlu_global__ void MLUUnion1KernelThreeNN( - const int b, const int n, const int m, char *unknown_gdram, - char *known_gdram, char *dist2_gdram, int *idx_gdram); - -void KernelThreeNNForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, cnrtDataType_t data_type, - const void *unknown, const void *known, void *dist2, - 
int *idx, const int b, const int n, const int m) { - switch (data_type) { - case CNRT_FLOAT16: { - MLUUnion1KernelThreeNN<<>>( - b, n, m, (char *)unknown, (char *)known, (char *)dist2, idx); - }; break; - case CNRT_FLOAT32: { - MLUUnion1KernelThreeNN<<>>( - b, n, m, (char *)unknown, (char *)known, (char *)dist2, idx); - }; break; - default: { - break; - } - } -} diff --git a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu deleted file mode 100644 index ed64c2b..0000000 --- a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu +++ /dev/null @@ -1,307 +0,0 @@ -/************************************************************************* - * Copyright (C) 2022 Cambricon. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "common_mlu_helper.hpp" - -__nram__ char data_nram[MAX_NRAM_SIZE]; - -template -__mlu_func__ void mluMultiKernelTinShift( - const T *input, const int *shifts, T *output, const int batch_size, - const int time_size, const int channel_size, const int hw_size, - const int group_size, const int group_channel) { - for (int cur_channel_index = taskId; - cur_channel_index < batch_size * channel_size; - cur_channel_index += taskDim) { - int n_index = cur_channel_index / channel_size; - int group_id = cur_channel_index % channel_size / group_channel; - int t_shift = shifts[n_index * group_size + group_id]; - int index = cur_channel_index % channel_size * hw_size + - n_index * time_size * channel_size * hw_size; - __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0); - __asm__ volatile("sync;"); - if (abs(t_shift) >= time_size) { - __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, - channel_size * hw_size * sizeof(T), hw_size * sizeof(T), - time_size - 1); - } else { - if (t_shift > 0) { - __memcpy(data_nram + t_shift * hw_size * sizeof(T), input + index, - hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T), - channel_size * hw_size * sizeof(T), time_size - 1 - t_shift); - __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, - channel_size * hw_size * sizeof(T), hw_size * sizeof(T), - time_size - 1); - } else { - __memcpy(data_nram, input + (index - t_shift * channel_size * hw_size), - hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T), - channel_size * hw_size * sizeof(T), time_size - 1 + t_shift); - __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, - channel_size * hw_size * sizeof(T), hw_size * sizeof(T), - time_size - 1); - } - } - __asm__ volatile("sync;"); - } -} - -template -__mlu_func__ void mluHwSplit(const T *input, const int t_shift, - const int time_size, const int hw_size, - const int channel_size, const int index, - 
const int cur_sequence_index, - const int max_length_per_core, T *output) { - for (int cur_index = index; cur_index < index + hw_size; - cur_index += max_length_per_core) { - int memcpy_size = max_length_per_core; - if (cur_index + max_length_per_core > index + hw_size) { - memcpy_size = index + hw_size - cur_index; - } - if (cur_sequence_index - t_shift < 0 || - cur_sequence_index - t_shift >= time_size) { - __memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T), - NRAM2GDRAM); - } else { - __memcpy(data_nram, input + cur_index - t_shift * channel_size * hw_size, - memcpy_size * sizeof(T), GDRAM2NRAM); - __memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T), - NRAM2GDRAM); - } - __asm__ volatile("sync;"); - } -} - -template -__mlu_func__ void mluMultiKernelTinShiftSplitSequence( - const T *input, const int *shifts, T *output, const int batch_size, - const int time_size, const int channel_size, const int hw_size, - const int group_size, const int group_channel, - const int max_number_hw_per_core, const int max_length_per_core) { - const int tmp_max_number_hw_per_core = - max_number_hw_per_core > 0 ? max_number_hw_per_core : 1; - const int loop_time = time_size / tmp_max_number_hw_per_core + - ((time_size % tmp_max_number_hw_per_core) > 0 ? 
1 : 0); - int segmentime_size = tmp_max_number_hw_per_core; - int res_segment = time_size % tmp_max_number_hw_per_core; - - for (int cur_segment_index = taskId; - cur_segment_index < loop_time * batch_size * channel_size; - cur_segment_index += taskDim) { - int n_index = cur_segment_index / loop_time / channel_size; - int group_id = cur_segment_index / loop_time % channel_size / group_channel; - int t_shift = shifts[n_index * group_size + group_id]; - int index = n_index * time_size * channel_size * hw_size + - (cur_segment_index / loop_time % channel_size) * hw_size + - cur_segment_index % loop_time * segmentime_size * hw_size * - channel_size; - char *dst_gdram2nram = data_nram; - const T *src_gdram2nram = input + index; - int count_gdram2nram = -1; - int count_nram2gdram = -1; - int next_sequence_index = - index / hw_size / channel_size % time_size + segmentime_size; - int cur_sequence_index = index / hw_size / channel_size % time_size; - __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0); - __asm__ volatile("sync;"); - if (max_number_hw_per_core == 0) { - mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index, - cur_sequence_index, max_length_per_core, output); - continue; - } - if (abs(t_shift) >= time_size) { - if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { - __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, - channel_size * hw_size * sizeof(T), hw_size * sizeof(T), - res_segment - 1); - } else { - __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, - channel_size * hw_size * sizeof(T), hw_size * sizeof(T), - segmentime_size - 1); - } - continue; - } - if (t_shift == 0) { - if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { - dst_gdram2nram = data_nram; - src_gdram2nram = input + index; - count_gdram2nram = res_segment - 1; - count_nram2gdram = res_segment - 1; - } else { - dst_gdram2nram = data_nram; - src_gdram2nram = input + index; - count_gdram2nram = segmentime_size 
- 1; - count_nram2gdram = segmentime_size - 1; - } - } else if (t_shift > 0) { - int first_index_cur_channel = - n_index * time_size * channel_size * hw_size + - (cur_segment_index / loop_time % channel_size) * hw_size; - if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { - dst_gdram2nram = data_nram; - src_gdram2nram = - input + - (index - t_shift * channel_size * hw_size < first_index_cur_channel - ? first_index_cur_channel - : index - t_shift * channel_size * hw_size); - count_gdram2nram = res_segment - 1; - count_nram2gdram = res_segment - 1; - if (cur_sequence_index < t_shift && t_shift < next_sequence_index) { - dst_gdram2nram = - data_nram + t_shift % segmentime_size * hw_size * sizeof(T); - count_gdram2nram = res_segment - (t_shift - cur_sequence_index) - 1; - } - } else { - if (t_shift >= next_sequence_index) { - __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, - channel_size * hw_size * sizeof(T), hw_size * sizeof(T), - segmentime_size - 1); - continue; - } else if (cur_sequence_index < t_shift && - t_shift < next_sequence_index) { - dst_gdram2nram = - data_nram + t_shift % segmentime_size * hw_size * sizeof(T); - src_gdram2nram = input + first_index_cur_channel; - count_gdram2nram = segmentime_size - (t_shift % segmentime_size) - 1; - count_nram2gdram = segmentime_size - 1; - } else { - dst_gdram2nram = data_nram; - src_gdram2nram = input + index - t_shift * channel_size * hw_size; - count_gdram2nram = segmentime_size - 1; - count_nram2gdram = segmentime_size - 1; - } - } - } else { - int offset_index = time_size + t_shift; - if (cur_sequence_index >= offset_index) { - if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { - __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, - channel_size * hw_size * sizeof(T), hw_size * sizeof(T), - res_segment - 1); - continue; - } else { - __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, - channel_size * hw_size * sizeof(T), 
hw_size * sizeof(T), - segmentime_size - 1); - continue; - } - } else { - dst_gdram2nram = data_nram; - src_gdram2nram = input + index - t_shift * channel_size * hw_size; - if (cur_sequence_index - t_shift + segmentime_size < time_size) { - count_gdram2nram = segmentime_size - 1; - count_nram2gdram = segmentime_size - 1; - } else { - count_gdram2nram = time_size - (cur_sequence_index - t_shift) - 1; - count_nram2gdram = - (segmentime_size - 1) < (time_size - cur_sequence_index - 1) - ? (segmentime_size - 1) - : (time_size - cur_sequence_index - 1); - } - } - } - __memcpy(dst_gdram2nram, src_gdram2nram, hw_size * sizeof(T), GDRAM2NRAM, - hw_size * sizeof(T), channel_size * hw_size * sizeof(T), - count_gdram2nram); - __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, - channel_size * hw_size * sizeof(T), hw_size * sizeof(T), - count_nram2gdram); - __asm__ volatile("sync;"); - } -} - -__mlu_entry__ void MLUUnion1KernelTinShift( - const void *input, const void *shifts, void *output, const int batch_size, - const int time_size, const int channel_size, const int hw_size, - const int group_size, const int group_channel, - const cnrtDataType_t data_dtype) { - // make sure that memcore is not used - if (coreId == 0x80) { - return; - } - switch (data_dtype) { - case CNRT_FLOAT16: { - mluMultiKernelTinShift((half *)input, (const int *)shifts, (half *)output, - batch_size, time_size, channel_size, hw_size, - group_size, group_channel); - }; break; - case CNRT_FLOAT32: { - mluMultiKernelTinShift((float *)input, (const int *)shifts, - (float *)output, batch_size, time_size, - channel_size, hw_size, group_size, group_channel); - }; break; - default: { return; } - } -} - -__mlu_entry__ void MLUUnion1KernelTinShiftSplitSequence( - const void *input, const void *shifts, void *output, const int batch_size, - const int time_size, const int channel_size, const int hw_size, - const int group_size, const int group_channel, - const int max_number_hw_per_core, const int 
max_length_per_core, - const cnrtDataType_t data_dtype) { - // make sure that memcore is not used - if (coreId == 0x80) { - return; - } - switch (data_dtype) { - case CNRT_FLOAT16: { - mluMultiKernelTinShiftSplitSequence( - (half *)input, (const int *)shifts, (half *)output, batch_size, - time_size, channel_size, hw_size, group_size, group_channel, - max_number_hw_per_core, max_length_per_core); - }; break; - case CNRT_FLOAT32: { - mluMultiKernelTinShiftSplitSequence( - (float *)input, (const int *)shifts, (float *)output, batch_size, - time_size, channel_size, hw_size, group_size, group_channel, - max_number_hw_per_core, max_length_per_core); - }; break; - default: { return; } - } -} - -void KernelTinShiftForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *input, const void *shifts, void *output, const int batch_size, - const int time_size, const int channel_size, const int hw_size, - const int group_size, const int group_channel, - const cnrtDataType_t data_dtype, const int channel_per_core, - const int max_number_hw_per_core, const int max_length_per_core) { - if (channel_per_core >= 1) { - MLUUnion1KernelTinShift<<>>( - input, shifts, output, batch_size, time_size, channel_size, hw_size, - group_size, group_channel, data_dtype); - } else { - MLUUnion1KernelTinShiftSplitSequence<<>>( - input, shifts, output, batch_size, time_size, channel_size, hw_size, - group_size, group_channel, max_number_hw_per_core, max_length_per_core, - data_dtype); - } -} - -void KernelTinShiftBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *grad_output, const void *shifts, void *grad_input, - const int batch_size, const int time_size, const int channel_size, - const int hw_size, const int group_size, const int group_channel, - const cnrtDataType_t data_dtype, const int channel_per_core, - const int max_number_hw_per_core, const int max_length_per_core) { - if (channel_per_core >= 1) { - 
MLUUnion1KernelTinShift<<>>( - grad_output, shifts, grad_input, batch_size, time_size, channel_size, - hw_size, group_size, group_channel, data_dtype); - } else { - MLUUnion1KernelTinShiftSplitSequence<<>>( - grad_output, shifts, grad_input, batch_size, time_size, channel_size, - hw_size, group_size, group_channel, max_number_hw_per_core, - max_length_per_core, data_dtype); - } -} diff --git a/mmcv/ops/csrc/common/mps/MPSDevice.h b/mmcv/ops/csrc/common/mps/MPSDevice.h deleted file mode 100644 index e1d9d49..0000000 --- a/mmcv/ops/csrc/common/mps/MPSDevice.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright © 2022 Apple Inc. - -// This file is modify from: -// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h - -#pragma once -#include -#include -#include - -#ifdef __OBJC__ -#include -#include -#include -typedef id MTLDevice_t; -#else -typedef void* MTLDevice; -typedef void* MTLDevice_t; -#endif - -using namespace std; - -namespace at { -namespace mps { - -//----------------------------------------------------------------- -// MPSDevice -// -// MPSDevice is a singleton class that returns the default device -//----------------------------------------------------------------- - -class TORCH_API MPSDevice { - public: - /** - * MPSDevice should not be cloneable. - */ - MPSDevice(MPSDevice& other) = delete; - /** - * MPSDevice should not be assignable. - */ - void operator=(const MPSDevice&) = delete; - /** - * Gets single instance of the Device. - */ - static MPSDevice* getInstance(); - /** - * Returns the single device. 
- */ - MTLDevice_t device() { return _mtl_device; } - - ~MPSDevice(); - - private: - static MPSDevice* _device; - MTLDevice_t _mtl_device; - MPSDevice(); -}; - -TORCH_API bool is_available(); - -TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); - -} // namespace mps -} // namespace at diff --git a/mmcv/ops/csrc/common/mps/MPSLibrary.h b/mmcv/ops/csrc/common/mps/MPSLibrary.h deleted file mode 100644 index 41c33fb..0000000 --- a/mmcv/ops/csrc/common/mps/MPSLibrary.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef _MPS_LIBRARY_H_ -#define _MPS_LIBRARY_H_ - -#include -#include - -#ifdef __OBJC__ -#include -#include -#include - -typedef id MTLComputePipelineState_t; -typedef id MTLLibrary_t; -#else -typedef void* MTLComputePipelineState; -typedef void* MTLComputePipelineState_t; -typedef void* MTLLibrary; -typedef void* MTLLibrary_t; -#endif - -class MPSLibrary { - public: - // disable constructor for singleton - static MPSLibrary* createFromUrl(const std::string& library_url); - static MPSLibrary* createFromSource(const std::string& source); - ~MPSLibrary(); - - MTLLibrary_t library() { return _library; } - - MTLComputePipelineState_t getComputePipelineState( - const std::string& function_name); - - private: - MTLLibrary_t _library; - std::unordered_map _pso_map; -}; - -class MPSLibraryManager { - public: - // disable constructor for singleton - MPSLibraryManager(const MPSLibraryManager&) = delete; - MPSLibraryManager& operator=(const MPSLibraryManager&) = delete; - MPSLibraryManager(MPSLibraryManager&&) = delete; - MPSLibraryManager& operator=(MPSLibraryManager&&) = delete; - - static MPSLibraryManager* getInstance(); - - bool hasLibrary(const std::string& name); - - MPSLibrary* getLibrary(const std::string& library_url); - - MPSLibrary* createLibraryFromSouce(const std::string& name, - const std::string& sources); - - ~MPSLibraryManager(); - - private: - MPSLibraryManager(); - std::unordered_map> _library_map; -}; -#endif diff --git 
a/mmcv/ops/csrc/common/mps/MPSLibrary.mm b/mmcv/ops/csrc/common/mps/MPSLibrary.mm deleted file mode 100644 index 99addc7..0000000 --- a/mmcv/ops/csrc/common/mps/MPSLibrary.mm +++ /dev/null @@ -1,107 +0,0 @@ -#include "MPSLibrary.h" -#include "MPSDevice.h" - -static std::unique_ptr mps_library_manager=nullptr; - -MPSLibraryManager* MPSLibraryManager::getInstance() { - if(!mps_library_manager) - mps_library_manager = std::unique_ptr(new MPSLibraryManager()); - return mps_library_manager.get(); -} - -MPSLibraryManager::~MPSLibraryManager() {} - -MPSLibraryManager::MPSLibraryManager() {} - -bool MPSLibraryManager::hasLibrary(const std::string& name) { - return _library_map.find(name) != _library_map.end(); -} - -MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) { - if (_library_map.find(library_url) != _library_map.end()) { - return _library_map[library_url].get(); - } - _library_map.emplace(std::make_pair( - library_url, std::unique_ptr(MPSLibrary::createFromUrl(library_url)))); - return _library_map[library_url].get(); -} - -MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name, - const std::string& source) { - NSString* ns_name = [NSString stringWithCString:name.c_str()]; - if (_library_map.find(name) != _library_map.end()) { - NSLog(@"Library %@ already exist.", ns_name); - return nullptr; - } - - _library_map.emplace( - std::make_pair(name, std::unique_ptr(MPSLibrary::createFromSource(source)))); - return _library_map[name].get(); -} - -MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) { - MPSLibrary* library = new MPSLibrary(); - @autoreleasepool { - NSError* error = nil; - - // load library and func - NSString* utl_str = [NSString stringWithCString:library_url.c_str()]; - NSURL* metal_url = [NSURL fileURLWithPath:utl_str]; - library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url - error:&error]; - if (library->_library == nil) { - NSLog(@"Failed to find 
library, error %@.", error); - exit(1); - } - } - - return library; -} - -MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) { - MPSLibrary* library = new MPSLibrary(); - @autoreleasepool { - NSError* error = nil; - - // load library and func - NSString* code_str = [NSString stringWithCString:sources.c_str()]; - library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str - options:nil - error:&error]; - if (library->_library == nil) { - NSLog(@"Failed to find library, error %@.", error); - exit(1); - } - } - - return library; -} - -MPSLibrary::~MPSLibrary() { - [_library release]; - _library = nil; -} - -MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) { - if (_pso_map.find(function_name) != _pso_map.end()) { - return _pso_map[function_name]; - } - - MTLComputePipelineState_t pso; - @autoreleasepool { - NSError* error = nil; - - // create function - NSString* function_name_str = [NSString stringWithCString:function_name.c_str()]; - id func = [_library newFunctionWithName:function_name_str]; - if (func == nil) { - NSLog(@"Failed to created pipeline state object, error %@.", error); - exit(1); - } - // create pipeline - pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func - error:&error]; - _pso_map.emplace(std::make_pair(function_name, pso)); - } - return _pso_map[function_name]; -} diff --git a/mmcv/ops/csrc/common/mps/MPSStream.h b/mmcv/ops/csrc/common/mps/MPSStream.h deleted file mode 100644 index 54cd388..0000000 --- a/mmcv/ops/csrc/common/mps/MPSStream.h +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright © 2022 Apple Inc. 
- -// This file is modify from: -// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h - -#pragma once - -#include -#include - -#include -#include -#include -#include "MPSDevice.h" - -#ifdef __OBJC__ -#include -#include -#include -#include -typedef id MTLCommandQueue_t; -typedef id MTLCommandBuffer_t; -typedef id MTLSharedEvent_t; -typedef id MTLDevice_t; -#else -typedef void* MTLCommandQueue_t; -typedef void* MTLCommandQueue; -typedef void* MTLCommandBuffer_t; -typedef void* MTLCommandBuffer; -typedef void* MTLSharedEvent_t; -typedef void* dispatch_queue_t; -typedef void* MTLDevice_t; -#define nil NULL; -#endif - -namespace at { -namespace mps { - -//----------------------------------------------------------------- -// MPSStream -//----------------------------------------------------------------- - -class TORCH_API MPSStream { - public: - enum Unchecked { UNCHECKED }; - /// Construct a MPSStream from a Stream. This construction is checked, - /// and will raise an error if the Stream is not, in fact, a MPS stream. - explicit MPSStream(Stream stream); - - ~MPSStream(); - MTLCommandQueue_t commandQueue() const { return _commandQueue; }; - dispatch_queue_t queue() const { return _serialQueue; } - - MTLCommandBuffer_t commandBuffer(); - void commit(bool flush); - void commitAndWait(); - void synchronize(); - - void flush(); - - /// Get the MPS device index that this stream is associated with. - c10::DeviceIndex device_index() const { return _stream.device_index(); } - - MTLCommandQueue_t stream() const { return _commandQueue; }; - - MTLDevice_t device() const { return [_commandQueue device]; } - - /// Explicit conversion to Stream. 
- Stream unwrap() const { return _stream; } - - private: - Stream _stream; - MTLCommandQueue_t _commandQueue = nil; - MTLCommandBuffer_t _commandBuffer = nil; - void _flush(bool commitAndWait) const; - - dispatch_queue_t _serialQueue = nullptr; -}; - -/** - * Get the current MPS stream - */ -TORCH_API MPSStream* getCurrentMPSStream(); - -/** - * Get the default MPS stream - */ -TORCH_API MPSStream* getDefaultMPSStream(); - -//----------------------------------------------------------------- -// MPSStreamImpl -//----------------------------------------------------------------- - -class TORCH_API MPSStreamImpl { - public: - /** - * Gets single instance of the MPSStream. - */ - static MPSStream* getInstance(); - - private: - static MPSStream* _stream; - MPSStreamImpl(); -}; - -//----------------------------------------------------------------- -// MPSEvent -//----------------------------------------------------------------- - -struct TORCH_API MPSEvent { - MPSEvent(); - // MPSEvent(id device); - - ~MPSEvent(); - MTLSharedEvent_t event() const { return _event; } - - void recordEvent(MPSStream* stream); - void waitForEvent(MPSStream* queue); // waits on the cpu - bool queryEvent(); - uint64_t getCurrentValue() { return _currentValue; } - void setCurrentValue(uint64_t currValue) { _currentValue = currValue; } - - private: - bool _isRecorded = false; - uint64_t _currentValue = 0; - MTLSharedEvent_t _event; -}; - -typedef MPSEvent* mpsEvent_t; - -} // namespace mps -} // namespace at diff --git a/mmcv/ops/csrc/common/mps/MPSUtils.h b/mmcv/ops/csrc/common/mps/MPSUtils.h deleted file mode 100644 index 2a4ce6d..0000000 --- a/mmcv/ops/csrc/common/mps/MPSUtils.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef _MPS_UTILS_H_ -#define _MPS_UTILS_H_ -#include -#ifdef __OBJC__ -#include -#include -#include - -typedef id MTLBuffer_t; -typedef id MTLComputeCommandEncoder_t; -#else -typedef void* MTLBuffer; -typedef void* MTLBuffer_t; -typedef void* MTLComputeCommandEncoder; -typedef void* 
MTLComputeCommandEncoder_t; -#endif - -// utils -static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) { - return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data()); -} - -template , at::Tensor>::value, bool> = true> -void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t); - -template , at::Tensor>::value, bool> = true> -void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { - [encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index]; -} - -template , at::Tensor>::value, bool>> -void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { - [encoder setBytes:&t length:sizeof(t) atIndex:index]; -} - -inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {} - -template -void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) { - setMTLArg(encoder, index, std::forward(t)); - setMTLArgsImpl(encoder, index + 1, std::forward(args)...); -} - -template -void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... 
args) { - [encoder setComputePipelineState:pso]; - setMTLArgsImpl(encoder, 0, std::forward(args)...); -} -#endif diff --git a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp index f68e874..c7f9f35 100644 --- a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp +++ b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp @@ -1,25 +1,22 @@ #ifndef PYTORCH_CPP_HELPER #define PYTORCH_CPP_HELPER -#include +#include #include using namespace at; +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + #define CHECK_CUDA(x) \ TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_MLU(x) \ - TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor") #define CHECK_CPU(x) \ - TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor") + TORCH_CHECK(!x.device().is_cuda(), #x " must be a CPU tensor") #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_CUDA_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) -#define CHECK_MLU_INPUT(x) \ - CHECK_MLU(x); \ - CHECK_CONTIGUOUS(x) #define CHECK_CPU_INPUT(x) \ CHECK_CPU(x); \ CHECK_CONTIGUOUS(x) diff --git a/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp b/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp index 52e5126..9869b53 100644 --- a/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp +++ b/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp @@ -15,6 +15,5 @@ using at::Tensor; using phalf = at::Half; #define __PHALF(x) (x) -#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) #endif // PYTORCH_CUDA_HELPER diff --git a/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp b/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp deleted file mode 100644 index e49572c..0000000 --- a/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (C) 2021 Cambricon. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef PYTORCH_MLU_HELPER_HPP_ -#define PYTORCH_MLU_HELPER_HPP_ - -#ifdef MMCV_WITH_MLU -#include "aten.h" - -#define NFU_ALIGN_SIZE 128 - -#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) - -#define PAD_DOWN(x, y) (((x) / (y)) * (y)) - -#define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) - -#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y)) - -inline int32_t getJobLimitCapability() { - CNcontext drv_ctx; - TORCH_CHECK(CN_SUCCESS == cnCtxGetCurrent(&drv_ctx), "cnCtxGetCurrent fails"); - CNctxConfigParam ctx_conf_param; - TORCH_CHECK( - CN_SUCCESS == cnGetCtxConfigParam(drv_ctx, CN_CTX_CONFIG_UNION_LIMIT, - &ctx_conf_param), - "cnGetCtxConfigParam fails."); - return (int32_t)ctx_conf_param.unionLimit; -} - -inline int32_t getCoreNumOfJobLimitCapability() { - switch (getJobLimitCapability()) { - default: - return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * - getJobLimitCapability(); - case CN_KERNEL_CLASS_BLOCK: - return 1; - case CN_KERNEL_CLASS_UNION: - return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); - case CN_KERNEL_CLASS_UNION2: - return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 2; - case CN_KERNEL_CLASS_UNION4: - return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 4; - case CN_KERNEL_CLASS_UNION8: - return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 8; - case CN_KERNEL_CLASS_UNION16: - return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 16; - 
} -} - -#endif // MMCV_WITH_MLU - -#endif // PYTORCH_MLU_HELPER_HPP_ diff --git a/mmcv/ops/csrc/common/pytorch_npu_helper.hpp b/mmcv/ops/csrc/common/pytorch_npu_helper.hpp deleted file mode 100644 index 88607d2..0000000 --- a/mmcv/ops/csrc/common/pytorch_npu_helper.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2022 Huawei Technologies Co., Ltd - * All rights reserved. - * - * Licensed under the BSD 3-Clause License (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://opensource.org/licenses/BSD-3-Clause - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#ifndef PYTORCH_NPU_HELPER_HPP_ -#define PYTORCH_NPU_HELPER_HPP_ - -#include -#include -#include - -#include "pytorch_cpp_helper.hpp" -#include "pytorch_device_registry.hpp" - -#define NPU_NAME_SPACE at_npu::native - -#define REGISTER_NPU_IMPL(key, value) REGISTER_DEVICE_IMPL(key, XLA, value) - -#define CHECK_NPU(x) \ - TORCH_CHECK(x.device().type() == at::kXLA, #x " must be a NPU tensor") - -#endif // PYTORCH_NPU_HELPER_HPP_ diff --git a/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h b/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h deleted file mode 100644 index f23ff44..0000000 --- a/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2019 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef PARAMS_GRID_H_ -#define PARAMS_GRID_H_ -#include -#include - -namespace detail { -template -int getTotalSize(std::vector arg) { - return arg.size(); -} - -template -int getTotalSize(std::vector arg, std::vector... args) { - return arg.size() * getTotalSize(args...); -} - -template -int getSize(std::vector arg) { - return arg.size(); -} - -template -void assigner(TT &src, std::vector counter, std::vector &arg) { - std::get(src) = arg[counter[Idx]]; -} - -template -void assigner(TT &src, std::vector counter, std::vector &arg, - std::vector &... args) { - std::get(src) = arg[counter[Idx]]; - assigner(src, counter, args...); -} -} // namespace detail - -template -std::vector> paramsGrid(std::vector... args) { - int length = detail::getTotalSize(args...); - std::vector sizes = {detail::getSize(args)...}; - int size = sizes.size(); - - std::vector> params(length); - std::vector counter(size); - for (int i = 0; i < length; ++i) { - detail::assigner<0>(params[i], counter, args...); - counter[size - 1] += 1; - for (int c = size - 1; c >= 0; --c) { - if (counter[c] == sizes[c] && c > 0) { - counter[c - 1] += 1; - counter[c] = 0; - } - } - } - return params; -} - -#endif diff --git a/mmcv/ops/csrc/common/utils/spconv/prettyprint.h b/mmcv/ops/csrc/common/utils/spconv/prettyprint.h deleted file mode 100644 index 0a6bdc3..0000000 --- a/mmcv/ops/csrc/common/utils/spconv/prettyprint.h +++ /dev/null @@ -1,493 +0,0 @@ -// Copyright Louis Delacroix 2010 - 2014. -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) -// -// A pretty printing library for C++ -// -// Usage: -// Include this header, and operator<< will "just work". - -#ifndef H_PRETTY_PRINT -#define H_PRETTY_PRINT - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace pretty_print { -namespace detail { -// SFINAE type trait to detect whether T::const_iterator exists. - -struct sfinae_base { - using yes = char; - using no = yes[2]; -}; - -template -struct has_const_iterator : private sfinae_base { - private: - template - static yes &test(typename C::const_iterator *); - template - static no &test(...); - - public: - static const bool value = sizeof(test(nullptr)) == sizeof(yes); - using type = T; -}; - -template -struct has_begin_end : private sfinae_base { - private: - template - static yes & - f(typename std::enable_if< - std::is_same(&C::begin)), - typename C::const_iterator (C::*)() const>::value>::type *); - - template - static no &f(...); - - template - static yes &g(typename std::enable_if< - std::is_same(&C::end)), - typename C::const_iterator (C::*)() const>::value, - void>::type *); - - template - static no &g(...); - - public: - static bool const beg_value = sizeof(f(nullptr)) == sizeof(yes); - static bool const end_value = sizeof(g(nullptr)) == sizeof(yes); -}; - -} // namespace detail - -// Holds the delimiter values for a specific character type - -template -struct delimiters_values { - using char_type = TChar; - const char_type *prefix; - const char_type *delimiter; - const char_type *postfix; -}; - -// Defines the delimiter values for a specific container and character type - -template -struct delimiters { - using type = delimiters_values; - static const type values; -}; - -// Functor to print containers. You can use this directly if you want -// to specify a non-default delimiters type. 
The printing logic can -// be customized by specializing the nested template. - -template , - typename TDelimiters = delimiters> -struct print_container_helper { - using delimiters_type = TDelimiters; - using ostream_type = std::basic_ostream; - - template - struct printer { - static void print_body(const U &c, ostream_type &stream) { - using std::begin; - using std::end; - - auto it = begin(c); - const auto the_end = end(c); - - if (it != the_end) { - for (;;) { - stream << *it; - - if (++it == the_end) break; - - if (delimiters_type::values.delimiter != NULL) - stream << delimiters_type::values.delimiter; - } - } - } - }; - - print_container_helper(const T &container) : container_(container) {} - - inline void operator()(ostream_type &stream) const { - if (delimiters_type::values.prefix != NULL) - stream << delimiters_type::values.prefix; - - printer::print_body(container_, stream); - - if (delimiters_type::values.postfix != NULL) - stream << delimiters_type::values.postfix; - } - - private: - const T &container_; -}; - -// Specialization for pairs - -template -template -struct print_container_helper::printer> { - using ostream_type = - typename print_container_helper::ostream_type; - - static void print_body(const std::pair &c, ostream_type &stream) { - stream << c.first; - if (print_container_helper::delimiters_type::values - .delimiter != NULL) - stream << print_container_helper::delimiters_type::values - .delimiter; - stream << c.second; - } -}; - -// Specialization for tuples - -template -template -struct print_container_helper::printer> { - using ostream_type = - typename print_container_helper::ostream_type; - using element_type = std::tuple; - - template - struct Int {}; - - static void print_body(const element_type &c, ostream_type &stream) { - tuple_print(c, stream, Int<0>()); - } - - static void tuple_print(const element_type &, ostream_type &, - Int) {} - - static void tuple_print( - const element_type &c, ostream_type &stream, - typename 
std::conditional, - std::nullptr_t>::type) { - stream << std::get<0>(c); - tuple_print(c, stream, Int<1>()); - } - - template - static void tuple_print(const element_type &c, ostream_type &stream, Int) { - if (print_container_helper::delimiters_type::values - .delimiter != NULL) - stream << print_container_helper::delimiters_type::values - .delimiter; - - stream << std::get(c); - - tuple_print(c, stream, Int()); - } -}; - -// Prints a print_container_helper to the specified stream. - -template -inline std::basic_ostream &operator<<( - std::basic_ostream &stream, - const print_container_helper &helper) { - helper(stream); - return stream; -} - -// Basic is_container template; specialize to derive from std::true_type for all -// desired container types - -template -struct is_container - : public std::integral_constant::value && - detail::has_begin_end::beg_value && - detail::has_begin_end::end_value> {}; - -template -struct is_container : std::true_type {}; - -template -struct is_container : std::false_type {}; - -template -struct is_container> : std::true_type {}; - -template -struct is_container> : std::true_type {}; - -template -struct is_container> : std::true_type {}; - -// Default delimiters - -template -struct delimiters { - static const delimiters_values values; -}; -template -const delimiters_values delimiters::values = {"[", ", ", "]"}; -template -struct delimiters { - static const delimiters_values values; -}; -template -const delimiters_values delimiters::values = {L"[", L", ", - L"]"}; - -// Delimiters for (multi)set and unordered_(multi)set - -template -struct delimiters<::std::set, char> { - static const delimiters_values values; -}; - -template -const delimiters_values - delimiters<::std::set, char>::values = {"{", ", ", - "}"}; - -template -struct delimiters<::std::set, wchar_t> { - static const delimiters_values values; -}; - -template -const delimiters_values - delimiters<::std::set, wchar_t>::values = { - L"{", L", ", L"}"}; - -template -struct 
delimiters<::std::multiset, char> { - static const delimiters_values values; -}; - -template -const delimiters_values - delimiters<::std::multiset, char>::values = { - "{", ", ", "}"}; - -template -struct delimiters<::std::multiset, wchar_t> { - static const delimiters_values values; -}; - -template -const delimiters_values - delimiters<::std::multiset, wchar_t>::values = { - L"{", L", ", L"}"}; - -template -struct delimiters<::std::unordered_set, char> { - static const delimiters_values values; -}; - -template -const delimiters_values delimiters< - ::std::unordered_set, char>::values = { - "{", ", ", "}"}; - -template -struct delimiters<::std::unordered_set, wchar_t> { - static const delimiters_values values; -}; - -template -const delimiters_values delimiters< - ::std::unordered_set, wchar_t>::values = { - L"{", L", ", L"}"}; - -template -struct delimiters<::std::unordered_multiset, - char> { - static const delimiters_values values; -}; - -template -const delimiters_values delimiters< - ::std::unordered_multiset, char>::values = { - "{", ", ", "}"}; - -template -struct delimiters<::std::unordered_multiset, - wchar_t> { - static const delimiters_values values; -}; - -template -const delimiters_values - delimiters<::std::unordered_multiset, - wchar_t>::values = {L"{", L", ", L"}"}; - -// Delimiters for pair and tuple - -template -struct delimiters, char> { - static const delimiters_values values; -}; -template -const delimiters_values delimiters, char>::values = { - "(", ", ", ")"}; -template -struct delimiters<::std::pair, wchar_t> { - static const delimiters_values values; -}; -template -const delimiters_values - delimiters<::std::pair, wchar_t>::values = {L"(", L", ", L")"}; - -template -struct delimiters, char> { - static const delimiters_values values; -}; -template -const delimiters_values delimiters, char>::values = { - "(", ", ", ")"}; -template -struct delimiters<::std::tuple, wchar_t> { - static const delimiters_values values; -}; -template -const 
delimiters_values - delimiters<::std::tuple, wchar_t>::values = {L"(", L", ", L")"}; - -// Type-erasing helper class for easy use of custom delimiters. -// Requires TCharTraits = std::char_traits and TChar = char or wchar_t, -// and MyDelims needs to be defined for TChar. Usage: "cout << -// pretty_print::custom_delims(x)". - -struct custom_delims_base { - virtual ~custom_delims_base() {} - virtual std::ostream &stream(::std::ostream &) = 0; - virtual std::wostream &stream(::std::wostream &) = 0; -}; - -template -struct custom_delims_wrapper : custom_delims_base { - custom_delims_wrapper(const T &t_) : t(t_) {} - - std::ostream &stream(std::ostream &s) { - return s << print_container_helper, Delims>( - t); - } - - std::wostream &stream(std::wostream &s) { - return s << print_container_helper, - Delims>(t); - } - - private: - const T &t; -}; - -template -struct custom_delims { - template - custom_delims(const Container &c) - : base(new custom_delims_wrapper(c)) {} - - std::unique_ptr base; -}; - -template -inline std::basic_ostream &operator<<( - std::basic_ostream &s, const custom_delims &p) { - return p.base->stream(s); -} - -// A wrapper for a C-style array given as pointer-plus-size. -// Usage: std::cout << pretty_print_array(arr, n) << std::endl; - -template -struct array_wrapper_n { - typedef const T *const_iterator; - typedef T value_type; - - array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {} - inline const_iterator begin() const { return _array; } - inline const_iterator end() const { return _array + _n; } - - private: - const T *const _array; - size_t _n; -}; - -// A wrapper for hash-table based containers that offer local iterators to each -// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket -// 5 of container m.) 
- -template -struct bucket_print_wrapper { - typedef typename T::const_local_iterator const_iterator; - typedef typename T::size_type size_type; - - const_iterator begin() const { return m_map.cbegin(n); } - - const_iterator end() const { return m_map.cend(n); } - - bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {} - - private: - const T &m_map; - const size_type n; -}; - -} // namespace pretty_print - -// Global accessor functions for the convenience wrappers - -template -inline pretty_print::array_wrapper_n pretty_print_array(const T *const a, - size_t n) { - return pretty_print::array_wrapper_n(a, n); -} - -template -pretty_print::bucket_print_wrapper bucket_print(const T &m, - typename T::size_type n) { - return pretty_print::bucket_print_wrapper(m, n); -} - -// Main magic entry point: An overload snuck into namespace std. -// Can we do better? - -namespace std { -// Prints a container to the stream using default delimiters - -template -inline typename enable_if<::pretty_print::is_container::value, - basic_ostream &>::type -operator<<(basic_ostream &stream, const T &container) { - return stream - << ::pretty_print::print_container_helper( - container); -} -} // namespace std - -#endif // H_PRETTY_PRINT diff --git a/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h b/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h deleted file mode 100644 index 026e35b..0000000 --- a/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2019 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace py = pybind11; - -template -std::vector array2Vector(TPyObject arr) { - py::array arr_np = arr; - size_t size = arr.attr("size").template cast(); - py::array_t arr_cc = arr_np; - std::vector data(arr_cc.data(), arr_cc.data() + size); - return data; -} - -template -std::vector arrayT2Vector(py::array_t arr) { - std::vector data(arr.data(), arr.data() + arr.size()); - return data; -} - -template -tv::TensorView array2TensorView(TPyObject arr) { - py::array arr_np = arr; - py::array_t arr_cc = arr_np; - tv::Shape shape; - for (int i = 0; i < arr_cc.ndim(); ++i) { - shape.push_back(arr_cc.shape(i)); - } - return tv::TensorView(arr_cc.mutable_data(), shape); -} -template -tv::TensorView arrayT2TensorView(py::array_t arr) { - tv::Shape shape; - for (int i = 0; i < arr.ndim(); ++i) { - shape.push_back(arr.shape(i)); - } - return tv::TensorView(arr.mutable_data(), shape); -} diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h b/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h deleted file mode 100644 index def6fe5..0000000 --- a/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h +++ /dev/null @@ -1,295 +0,0 @@ -// Copyright 2019 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef SPCONV_GEOMETRY_H_ -#define SPCONV_GEOMETRY_H_ - -#include - -#include -#include - -template -TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, - const Index *kernelSize, - const Index *stride, const Index *padding, - const Index *dilation, - const Index *outSpatialShape, Index *out) { - Index lowers[NDim]; - Index uppers[NDim]; - Index counter[NDim]; - Index counterSize[NDim]; - Index pointCounter = 0; - Index val; - Index numPoints = 1; - Index m, offset; - bool valid = false; -#pragma unroll - for (unsigned i = 0; i < NDim; ++i) { - lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + - stride[i] + padding[i]) / - stride[i]; - uppers[i] = (input_pos[i] + padding[i]) / stride[i]; - } - -#pragma unroll - for (unsigned i = 0; i < NDim; ++i) { - counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); - numPoints *= counterSize[i]; - } - -#pragma unroll - for (unsigned i = 0; i < NDim; ++i) { - counter[i] = 0; - } - for (int i = 0; i < numPoints; ++i) { - valid = true; - m = 1; - offset = 0; -#pragma unroll - for (int j = NDim - 1; j >= 0; --j) { - val = uppers[j] - counter[j] * dilation[j]; - out[pointCounter * (NDim + 1) + j] = val; - if (val < 0 || (val > outSpatialShape[j] - 1)) { - valid = false; - // break; - } - offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j]; - m *= kernelSize[j]; - } - - out[pointCounter * (NDim + 1) + NDim] = offset; - if (valid) ++pointCounter; - counter[NDim - 1] += 1; -#pragma unroll - for (int c = NDim - 1; c >= 0; --c) { - if (counter[c] == counterSize[c] && c > 0) { - counter[c - 1] += 1; - counter[c] = 0; - } - } - } - return pointCounter; -} - -template -TV_HOST_DEVICE Index getValidOutPosTranspose( - const Index *input_pos, const Index *kernelSize, const Index *stride, - const Index *padding, const Index *dilation, const Index *outSpatialShape, - Index *out) { - Index lowers[NDim]; - Index uppers[NDim]; - Index counter[NDim]; - Index counterSize[NDim]; - Index 
pointCounter = 0; - Index val; - Index numPoints = 1; - Index m, offset; - bool valid = false; -#pragma unroll - for (unsigned i = 0; i < NDim; ++i) { - lowers[i] = input_pos[i] * stride[i] - padding[i]; - uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i]; - } -#pragma unroll - for (unsigned i = 0; i < NDim; ++i) { - counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); - numPoints *= counterSize[i]; - } -#pragma unroll - for (unsigned i = 0; i < NDim; ++i) { - counter[i] = 0; - } - for (int i = 0; i < numPoints; ++i) { - valid = true; - m = 1; - offset = 0; -#pragma unroll - for (int j = NDim - 1; j >= 0; --j) { - val = uppers[j] - counter[j] * dilation[j]; - out[pointCounter * (NDim + 1) + j] = val; - if (val < 0 || (val > outSpatialShape[j] - 1)) { - valid = false; - } - offset += m * (val - lowers[j]) / dilation[j]; - m *= kernelSize[j]; - } - out[pointCounter * (NDim + 1) + NDim] = offset; - if (valid) ++pointCounter; - counter[NDim - 1] += 1; -#pragma unroll - for (int c = NDim - 1; c >= 0; --c) { - if (counter[c] == counterSize[c] && c > 0) { - counter[c - 1] += 1; - counter[c] = 0; - } - } - } - return pointCounter; -} - -template -Index getIndicePairsConv(tv::TensorView indicesIn, - tv::TensorView indicesOut, - tv::TensorView gridsOut, - tv::TensorView indicePairs, - tv::TensorView indiceNum, - const Index *kernelSize, const Index *stride, - const Index *padding, const Index *dilation, - const Index *outSpatialShape) { - // indicesOut: num_active * kernelVolume * (NDim + 1) - Index numAct = 0; - auto numActIn = indicesIn.dim(0); - Index batchIdx = 0; - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index kernelVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - kernelVolume *= kernelSize[i]; - } - Index numValidPoints = 0; - std::vector validPoints_(kernelVolume * (NDim + 1)); - Index *validPoints = validPoints_.data(); - Index *pointPtr = nullptr; - for 
(int j = 0; j < numActIn; ++j) { - batchIdx = indicesIn(j, 0); - numValidPoints = getValidOutPos( - indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, - dilation, outSpatialShape, validPoints); - for (Index i = 0; i < numValidPoints; ++i) { - pointPtr = validPoints + i * (NDim + 1); - auto offset = pointPtr[NDim]; - auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + - spatialVolume * batchIdx; - if (gridsOut[index] == -1) { - for (unsigned k = 1; k < NDim + 1; ++k) { - indicesOut(numAct, k) = pointPtr[k - 1]; - } - indicesOut(numAct, 0) = batchIdx; - gridsOut[index] = numAct++; - } - // indicePairs: [K, 2, L] - indicePairs(offset, 0, indiceNum[offset]) = j; - indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; - } - } - return numAct; -} - -template -Index getIndicePairsDeConv(tv::TensorView indicesIn, - tv::TensorView indicesOut, - tv::TensorView gridsOut, - tv::TensorView indicePairs, - tv::TensorView indiceNum, - const Index *kernelSize, const Index *stride, - const Index *padding, const Index *dilation, - const Index *outSpatialShape) { - Index numAct = 0; - auto numActIn = indicesIn.dim(0); - Index batchIdx = 0; - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index kernelVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - kernelVolume *= kernelSize[i]; - } - Index numValidPoints = 0; - std::vector validPoints_(kernelVolume * (NDim + 1)); - Index *validPoints = validPoints_.data(); - Index *pointPtr = nullptr; - for (int j = 0; j < numActIn; ++j) { - batchIdx = indicesIn(j, 0); - numValidPoints = getValidOutPosTranspose( - indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, - dilation, outSpatialShape, validPoints); - for (Index i = 0; i < numValidPoints; ++i) { - pointPtr = validPoints + i * (NDim + 1); - auto offset = pointPtr[NDim]; - auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + - spatialVolume * batchIdx; - 
if (gridsOut[index] == -1) { - for (unsigned k = 1; k < NDim + 1; ++k) { - indicesOut(numAct, k) = pointPtr[k - 1]; - } - indicesOut(numAct, 0) = batchIdx; - gridsOut[index] = numAct++; - } - // indicePairs: [K, 2, L] - indicePairs(offset, 0, indiceNum[offset]) = j; - indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; - } - } - return numAct; -} - -template -Index getIndicePairsSubM(tv::TensorView indicesIn, - tv::TensorView gridsOut, - tv::TensorView indicePairs, - tv::TensorView indiceNum, - const Index *const kernelSize, - const Index *const stride, const Index *const padding, - const Index *dilation, - const Index *const outSpatialShape) { - auto numActIn = indicesIn.dim(0); - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index kernelVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - kernelVolume *= kernelSize[i]; - } - Index numValidPoints = 0; - // Index validPoints[kernelVolume * (NDim + 1)]; - std::vector validPoints_(kernelVolume * (NDim + 1)); - Index *validPoints = validPoints_.data(); - Index *pointPtr = nullptr; - Index index = 0; - for (int j = 0; j < numActIn; ++j) { - index = tv::rowArrayIdx(indicesIn.data() + j * (NDim + 1) + 1, - outSpatialShape) + - spatialVolume * indicesIn(j, 0); - gridsOut[index] = j; - } - for (int j = 0; j < numActIn; ++j) { - numValidPoints = getValidOutPos( - indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, - dilation, outSpatialShape, validPoints); - for (Index i = 0; i < numValidPoints; ++i) { - pointPtr = validPoints + i * (NDim + 1); - auto offset = pointPtr[NDim]; - index = tv::rowArrayIdx(pointPtr, outSpatialShape) + - spatialVolume * indicesIn(j, 0); - if (gridsOut[index] > -1) { - indicePairs(offset, 0, indiceNum[offset]) = j; - indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; - } - } - } - return numActIn; -} - -#endif diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h 
b/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h deleted file mode 100644 index 96ce34e..0000000 --- a/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2019 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_ -#define SPARSE_CONV_INDICE_FUNCTOR_H_ -#include - -namespace functor { -template -struct CreateConvIndicePairFunctorP1 { - Index operator()(const Device& d, tv::TensorView indicesIn, - tv::TensorView indicesOut, - tv::TensorView gridsOut, - tv::TensorView indicePairs, - tv::TensorView indiceNum, - tv::TensorView indicePairUnique, - const tv::SimpleVector kernelSize, - const tv::SimpleVector stride, - const tv::SimpleVector padding, - const tv::SimpleVector dilation, - const tv::SimpleVector outSpatialShape, - bool transpose); -}; - -template -struct CreateConvIndicePairFunctorP2 { - Index operator()(const Device& d, tv::TensorView indicesIn, - tv::TensorView indicesOut, - tv::TensorView gridsOut, - tv::TensorView indicePairs, - tv::TensorView indiceNum, - tv::TensorView indicePairUnique, - const tv::SimpleVector outSpatialShape, - bool transpose, bool resetGrid = false); -}; - -template -struct CreateConvIndicePairFunctor { - Index operator()(const Device& d, tv::TensorView indicesIn, - tv::TensorView indicesOut, - tv::TensorView gridsOut, - tv::TensorView indicePairs, - tv::TensorView indiceNum, - const tv::SimpleVector kernelSize, - const tv::SimpleVector 
stride, - const tv::SimpleVector padding, - const tv::SimpleVector dilation, - const tv::SimpleVector outSpatialShape, - bool transpose, bool resetGrid = false); -}; - -template -struct CreateSubMIndicePairFunctor { - Index operator()(const Device& d, tv::TensorView indicesIn, - tv::TensorView gridsOut, - tv::TensorView indicePairs, - tv::TensorView indiceNum, - const tv::SimpleVector kernelSize, - const tv::SimpleVector stride, - const tv::SimpleVector padding, - const tv::SimpleVector dilation, - const tv::SimpleVector outSpatialShape, - bool transpose, bool resetGrid = false); -}; -} // namespace functor - -#endif diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h b/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h deleted file mode 100644 index 78f32ed..0000000 --- a/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2019 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef SPARSE_MAXPOOL_FUNCTOR_H_ -#define SPARSE_MAXPOOL_FUNCTOR_H_ -#include - -namespace functor { -template -struct SparseMaxPoolForwardFunctor { - void operator()(const Device& d, tv::TensorView outFeatures, - tv::TensorView inFeatures, - tv::TensorView indices, int size); -}; - -template -struct SparseMaxPoolBackwardFunctor { - void operator()(const Device& d, tv::TensorView outFeatures, - tv::TensorView inFeatures, - tv::TensorView fout, - tv::TensorView fin, - tv::TensorView indices, int size); -}; -} // namespace functor - -#endif diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h b/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h deleted file mode 100644 index 8262b30..0000000 --- a/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef MP_HELPER_H_ -#define MP_HELPER_H_ -#include -#include - -template -struct mp_list {}; - -template -using mp_list_c = mp_list...>; - -namespace detail { - -template -constexpr F mp_for_each_impl(mp_list, F &&f) { - return std::initializer_list{(f(T()), 0)...}, std::forward(f); -} - -template -constexpr F mp_for_each_impl(mp_list<>, F &&f) { - return std::forward(f); -} - -} // namespace detail - -namespace detail { - -template class B> -struct mp_rename_impl { - // An error "no type named 'type'" here means that the first argument to - // mp_rename is not a list -}; - -template