Unverified commit 2e5628b4 authored by q.yao, committed by GitHub

[Refactor]: Remove deployment for dev-2.x (#2225)

* remove deploy for 2.0

* update onnx ut
parent 961373ad
# Copyright (c) OpenMMLab. All rights reserved.
import glob
import os
import torch
if torch.__version__ == 'parrots':
@@ -22,15 +19,3 @@ else:
def get_compiling_cuda_version():
return ext_module.get_compiling_cuda_version()
def get_onnxruntime_op_path():
wildcard = os.path.join(
os.path.abspath(os.path.dirname(os.path.dirname(__file__))),
'_ext_ort.*.so')
paths = glob.glob(wildcard)
if len(paths) > 0:
return paths[0]
else:
return ''
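# A minimal usage sketch (assumption, not part of this diff; 'model.onnx' is
# a placeholder): the path returned above was typically registered with an
# onnxruntime session so that exported custom ops could be executed.
import onnxruntime as ort

session_options = ort.SessionOptions()
ort_op_path = get_onnxruntime_op_path()
if ort_op_path:
    session_options.register_custom_ops_library(ort_op_path)
session = ort.InferenceSession(
    'model.onnx', session_options, providers=['CPUExecutionProvider'])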
@@ -17,7 +17,7 @@ ext_module = ext_loader.load_ext(
class MaskedConv2dFunction(Function):
@staticmethod
-    def symbolic(g, features, mask, weight, bias, padding, stride):
+    def symbolic(g, features, mask, weight, bias, padding, stride=1):
return g.op(
'mmcv::MMCVMaskedConv2d',
features,
......
import os
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
@@ -37,49 +36,34 @@ class NMSop(torch.autograd.Function):
@staticmethod
def symbolic(g, bboxes, scores, iou_threshold, offset, score_threshold,
max_num):
-        from ..onnx import is_custom_op_loaded
-        has_custom_op = is_custom_op_loaded()
-        # TensorRT nms plugin is aligned with original nms in ONNXRuntime
-        is_trt_backend = os.environ.get('ONNX_BACKEND') == 'MMCVTensorRT'
-        if has_custom_op and (not is_trt_backend):
-            return g.op(
-                'mmcv::NonMaxSuppression',
-                bboxes,
-                scores,
-                iou_threshold_f=float(iou_threshold),
-                offset_i=int(offset))
-        else:
-            from torch.onnx.symbolic_opset9 import select, squeeze, unsqueeze
-            from ..onnx.onnx_utils.symbolic_helper import _size_helper
-            boxes = unsqueeze(g, bboxes, 0)
-            scores = unsqueeze(g, unsqueeze(g, scores, 0), 0)
-            if max_num > 0:
-                max_num = g.op(
-                    'Constant',
-                    value_t=torch.tensor(max_num, dtype=torch.long))
-            else:
-                dim = g.op('Constant', value_t=torch.tensor(0))
-                max_num = _size_helper(g, bboxes, dim)
-            max_output_per_class = max_num
-            iou_threshold = g.op(
-                'Constant',
-                value_t=torch.tensor([iou_threshold], dtype=torch.float))
-            score_threshold = g.op(
-                'Constant',
-                value_t=torch.tensor([score_threshold], dtype=torch.float))
-            nms_out = g.op('NonMaxSuppression', boxes, scores,
-                           max_output_per_class, iou_threshold,
-                           score_threshold)
-            return squeeze(
-                g,
-                select(
-                    g, nms_out, 1,
-                    g.op(
-                        'Constant',
-                        value_t=torch.tensor([2], dtype=torch.long))), 1)
+        from torch.onnx.symbolic_opset9 import select, squeeze, unsqueeze
+        from ..onnx.onnx_utils.symbolic_helper import _size_helper
+        boxes = unsqueeze(g, bboxes, 0)
+        scores = unsqueeze(g, unsqueeze(g, scores, 0), 0)
+        if max_num > 0:
+            max_num = g.op(
+                'Constant', value_t=torch.tensor(max_num, dtype=torch.long))
+        else:
+            dim = g.op('Constant', value_t=torch.tensor(0))
+            max_num = _size_helper(g, bboxes, dim)
+        max_output_per_class = max_num
+        iou_threshold = g.op(
+            'Constant',
+            value_t=torch.tensor([iou_threshold], dtype=torch.float))
+        score_threshold = g.op(
+            'Constant',
+            value_t=torch.tensor([score_threshold], dtype=torch.float))
+        nms_out = g.op('NonMaxSuppression', boxes, scores,
+                       max_output_per_class, iou_threshold, score_threshold)
+        return squeeze(
+            g,
+            select(
+                g, nms_out, 1,
+                g.op('Constant', value_t=torch.tensor([2], dtype=torch.long))),
+            1)
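# A minimal eager-mode sketch (illustrative only; the helper name is
# hypothetical): ONNX NonMaxSuppression returns [num_selected, 3] rows of
# (batch_index, class_index, box_index); the select/squeeze tail above keeps
# only the box indices.
def _nms_out_to_box_inds(nms_out):
    # keep column 2 (box index) of the [num_selected, 3] output
    return nms_out[:, 2]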
class SoftNMSop(torch.autograd.Function):
......
# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend # noqa
from os import path as osp
from typing import Tuple, Union
import torch
@@ -89,13 +88,6 @@ def bilinear_grid_sample(im: Tensor,
return (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(n, c, gh, gw)
def is_in_onnx_export_without_custom_ops() -> bool:
from mmcv.ops import get_onnxruntime_op_path
ort_custom_op_path = get_onnxruntime_op_path()
return torch.onnx.is_in_onnx_export(
) and not osp.exists(ort_custom_op_path)
def normalize(grid: Tensor) -> Tensor:
"""Normalize input grid from [-1, 1] to [0, 1]
@@ -280,7 +272,7 @@ def point_sample(input: Tensor,
if points.dim() == 3:
add_dim = True
points = points.unsqueeze(2)
-    if is_in_onnx_export_without_custom_ops():
+    if torch.onnx.is_in_onnx_export():
# If custom ops for onnx runtime not compiled use python
# implementation of grid_sample function to make onnx graph
# with supported nodes
......
@@ -19,50 +19,35 @@ class RoIAlignFunction(Function):
@staticmethod
def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio,
pool_mode, aligned):
-        from ..onnx import is_custom_op_loaded
-        has_custom_op = is_custom_op_loaded()
-        if has_custom_op:
-            return g.op(
-                'mmcv::MMCVRoiAlign',
-                input,
-                rois,
-                output_height_i=output_size[0],
-                output_width_i=output_size[1],
-                spatial_scale_f=spatial_scale,
-                sampling_ratio_i=sampling_ratio,
-                mode_s=pool_mode,
-                aligned_i=aligned)
-        else:
-            from torch.onnx import TensorProtoDataType
-            from torch.onnx.symbolic_helper import _slice_helper
-            from torch.onnx.symbolic_opset9 import squeeze, sub
-            # batch_indices = rois[:, 0].long()
-            batch_indices = _slice_helper(
-                g, rois, axes=[1], starts=[0], ends=[1])
-            batch_indices = squeeze(g, batch_indices, 1)
-            batch_indices = g.op(
-                'Cast', batch_indices, to_i=TensorProtoDataType.INT64)
-            # rois = rois[:, 1:]
-            rois = _slice_helper(g, rois, axes=[1], starts=[1], ends=[5])
-            if aligned:
-                # rois -= 0.5/spatial_scale
-                aligned_offset = g.op(
-                    'Constant',
-                    value_t=torch.tensor([0.5 / spatial_scale],
-                                         dtype=torch.float32))
-                rois = sub(g, rois, aligned_offset)
-            # roi align
-            return g.op(
-                'RoiAlign',
-                input,
-                rois,
-                batch_indices,
-                output_height_i=output_size[0],
-                output_width_i=output_size[1],
-                spatial_scale_f=spatial_scale,
-                sampling_ratio_i=max(0, sampling_ratio),
-                mode_s=pool_mode)
+        from torch.onnx import TensorProtoDataType
+        from torch.onnx.symbolic_helper import _slice_helper
+        from torch.onnx.symbolic_opset9 import squeeze, sub
+        # batch_indices = rois[:, 0].long()
+        batch_indices = _slice_helper(g, rois, axes=[1], starts=[0], ends=[1])
+        batch_indices = squeeze(g, batch_indices, 1)
+        batch_indices = g.op(
+            'Cast', batch_indices, to_i=TensorProtoDataType.INT64)
+        # rois = rois[:, 1:]
+        rois = _slice_helper(g, rois, axes=[1], starts=[1], ends=[5])
+        if aligned:
+            # rois -= 0.5/spatial_scale
+            aligned_offset = g.op(
+                'Constant',
+                value_t=torch.tensor([0.5 / spatial_scale],
+                                     dtype=torch.float32))
+            rois = sub(g, rois, aligned_offset)
+        # roi align
+        return g.op(
+            'RoiAlign',
+            input,
+            rois,
+            batch_indices,
+            output_height_i=output_size[0],
+            output_width_i=output_size[1],
+            spatial_scale_f=spatial_scale,
+            sampling_ratio_i=max(0, sampling_ratio),
+            mode_s=pool_mode)
@staticmethod
def forward(ctx: Any,
......
# Copyright (c) OpenMMLab. All rights reserved.
# flake8: noqa
from .init_plugins import is_tensorrt_plugin_loaded, load_tensorrt_plugin
from .preprocess import preprocess_onnx
def is_tensorrt_available():
try:
import tensorrt
del tensorrt
return True
except ModuleNotFoundError:
return False
__all__ = []
if is_tensorrt_available():
from .tensorrt_utils import (TRTWraper, TRTWrapper, load_trt_engine,
onnx2trt, save_trt_engine)
# load tensorrt plugin lib
load_tensorrt_plugin()
__all__.extend([
'onnx2trt', 'save_trt_engine', 'load_trt_engine', 'TRTWraper',
'TRTWrapper'
])
__all__.extend(['is_tensorrt_plugin_loaded', 'preprocess_onnx'])
# Copyright (c) OpenMMLab. All rights reserved.
import ctypes
import glob
import os
import warnings
def get_tensorrt_op_path() -> str:
"""Get TensorRT plugins library path."""
# Following strings of text style are from colorama package
bright_style, reset_style = '\x1b[1m', '\x1b[0m'
red_text, blue_text = '\x1b[31m', '\x1b[34m'
white_background = '\x1b[107m'
msg = white_background + bright_style + red_text
msg += 'DeprecationWarning: This function will be deprecated in future. '
msg += blue_text + 'Welcome to use the unified model deployment toolbox '
msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
msg += reset_style
warnings.warn(msg)
wildcard = os.path.join(
os.path.abspath(os.path.dirname(os.path.dirname(__file__))),
'_ext_trt.*.so')
paths = glob.glob(wildcard)
lib_path = paths[0] if len(paths) > 0 else ''
return lib_path
plugin_is_loaded = False
def is_tensorrt_plugin_loaded() -> bool:
"""Check if TensorRT plugins library is loaded or not.
Returns:
bool: plugin_is_loaded flag
"""
# Following strings of text style are from colorama package
bright_style, reset_style = '\x1b[1m', '\x1b[0m'
red_text, blue_text = '\x1b[31m', '\x1b[34m'
white_background = '\x1b[107m'
msg = white_background + bright_style + red_text
msg += 'DeprecationWarning: This function will be deprecated in future. '
msg += blue_text + 'Welcome to use the unified model deployment toolbox '
msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
msg += reset_style
warnings.warn(msg)
global plugin_is_loaded
return plugin_is_loaded
def load_tensorrt_plugin() -> None:
"""load TensorRT plugins library."""
# Following strings of text style are from colorama package
bright_style, reset_style = '\x1b[1m', '\x1b[0m'
red_text, blue_text = '\x1b[31m', '\x1b[34m'
white_background = '\x1b[107m'
msg = white_background + bright_style + red_text
msg += 'DeprecationWarning: This function will be deprecated in future. '
msg += blue_text + 'Welcome to use the unified model deployment toolbox '
msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
msg += reset_style
warnings.warn(msg)
global plugin_is_loaded
lib_path = get_tensorrt_op_path()
if (not plugin_is_loaded) and os.path.exists(lib_path):
ctypes.CDLL(lib_path)
plugin_is_loaded = True
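# A minimal usage sketch (assumption): call once before building or
# deserializing an engine that contains mmcv custom TensorRT ops.
load_tensorrt_plugin()
print('mmcv TensorRT plugins loaded:', is_tensorrt_plugin_loaded())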
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import numpy as np
import onnx
def preprocess_onnx(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
"""Modify onnx model to match with TensorRT plugins in mmcv.
    There are some conflicts between onnx node definitions and TensorRT
    limits. This function performs preprocessing on the onnx model to
    resolve the conflicts.
For example, onnx `attribute` is loaded in TensorRT on host and onnx
`input` is loaded on device. The shape inference is performed on host, so
any `input` related to shape (such as `max_output_boxes_per_class` in
NonMaxSuppression) should be transformed to `attribute` before conversion.
Arguments:
onnx_model (onnx.ModelProto): Input onnx model.
Returns:
onnx.ModelProto: Modified onnx model.
"""
# Following strings of text style are from colorama package
bright_style, reset_style = '\x1b[1m', '\x1b[0m'
red_text, blue_text = '\x1b[31m', '\x1b[34m'
white_background = '\x1b[107m'
msg = white_background + bright_style + red_text
msg += 'DeprecationWarning: This function will be deprecated in future. '
msg += blue_text + 'Welcome to use the unified model deployment toolbox '
msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
msg += reset_style
warnings.warn(msg)
graph = onnx_model.graph
nodes = graph.node
initializers = graph.initializer
node_dict = {}
for node in nodes:
node_outputs = node.output
for output in node_outputs:
if len(output) > 0:
node_dict[output] = node
init_dict = {_.name: _ for _ in initializers}
nodes_name_to_remove = set()
def is_node_without_output(name):
for node_name, node in node_dict.items():
if node_name not in nodes_name_to_remove:
if name in node.input:
return False
return True
def mark_nodes_to_remove(name):
node = node_dict[name]
nodes_name_to_remove.add(name)
for input_node_name in node.input:
if is_node_without_output(input_node_name):
mark_nodes_to_remove(input_node_name)
def parse_data(name, typ, default_value=0):
if name in node_dict:
node = node_dict[name]
if node.op_type == 'Constant':
raw_data = node.attribute[0].t.raw_data
else:
mark_nodes_to_remove(name)
return default_value
elif name in init_dict:
raw_data = init_dict[name].raw_data
else:
            raise ValueError(f'{name} not found in node or initializer.')
return np.frombuffer(raw_data, typ).item()
nrof_node = len(nodes)
for idx in range(nrof_node):
node = nodes[idx]
node_attributes = node.attribute
node_inputs = node.input
node_outputs = node.output
node_name = node.name
# process NonMaxSuppression node
if node.op_type == 'NonMaxSuppression':
center_point_box = 0
max_output_boxes_per_class = 1000000
iou_threshold = 0.3
score_threshold = 0.0
offset = 0
for attribute in node_attributes:
if attribute.name == 'center_point_box':
center_point_box = attribute.i
elif attribute.name == 'offset':
offset = attribute.i
if len(node_inputs) >= 3:
max_output_boxes_per_class = parse_data(
node_inputs[2], np.int64, max_output_boxes_per_class)
mark_nodes_to_remove(node_inputs[2])
if len(node_inputs) >= 4:
iou_threshold = parse_data(node_inputs[3], np.float32,
iou_threshold)
mark_nodes_to_remove(node_inputs[3])
if len(node_inputs) >= 5:
score_threshold = parse_data(node_inputs[4], np.float32)
mark_nodes_to_remove(node_inputs[4])
new_node = onnx.helper.make_node(
'NonMaxSuppression',
node_inputs[:2],
node_outputs,
name=node_name,
center_point_box=center_point_box,
max_output_boxes_per_class=max_output_boxes_per_class,
iou_threshold=iou_threshold,
score_threshold=score_threshold,
offset=offset)
for output in node_outputs:
if output in node_dict:
node_dict[output] = new_node
nodes.insert(idx, new_node)
nodes.remove(node)
elif node.op_type == 'InstanceNormalization':
# directly change op name
node.op_type = 'MMCVInstanceNormalization'
for node_name in nodes_name_to_remove:
nodes.remove(node_dict[node_name])
return onnx_model
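# A minimal usage sketch (assumption; 'model.onnx' is a placeholder): fold
# the shape-related NonMaxSuppression inputs into attributes before handing
# the model to the TensorRT parser.
model = onnx.load('model.onnx')
model = preprocess_onnx(model)
onnx.save(model, 'model_preprocessed.onnx')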
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from typing import Union
import onnx
import tensorrt as trt
import torch
from .preprocess import preprocess_onnx
def onnx2trt(onnx_model: Union[str, onnx.ModelProto],
opt_shape_dict: dict,
log_level: trt.ILogger.Severity = trt.Logger.ERROR,
fp16_mode: bool = False,
max_workspace_size: int = 0,
device_id: int = 0) -> trt.ICudaEngine:
"""Convert onnx model to tensorrt engine.
Arguments:
onnx_model (str or onnx.ModelProto): the onnx model to convert from
opt_shape_dict (dict): the min/opt/max shape of each input
log_level (TensorRT log level): the log level of TensorRT
fp16_mode (bool): enable fp16 mode
max_workspace_size (int): set max workspace size of TensorRT engine.
            Some tactics and layers need a large workspace.
        device_id (int): choose the device on which to create the engine.
Returns:
tensorrt.ICudaEngine: the TensorRT engine created from onnx_model
Example:
>>> engine = onnx2trt(
>>> "onnx_model.onnx",
>>> {'input': [[1, 3, 160, 160],
>>> [1, 3, 320, 320],
>>> [1, 3, 640, 640]]},
>>> log_level=trt.Logger.WARNING,
>>> fp16_mode=True,
>>> max_workspace_size=1 << 30,
>>> device_id=0)
"""
# Following strings of text style are from colorama package
bright_style, reset_style = '\x1b[1m', '\x1b[0m'
red_text, blue_text = '\x1b[31m', '\x1b[34m'
white_background = '\x1b[107m'
msg = white_background + bright_style + red_text
msg += 'DeprecationWarning: This function will be deprecated in future. '
msg += blue_text + 'Welcome to use the unified model deployment toolbox '
msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
msg += reset_style
warnings.warn(msg)
device = torch.device(f'cuda:{device_id}')
# create builder and network
logger = trt.Logger(log_level)
builder = trt.Builder(logger)
EXPLICIT_BATCH = 1 << (int)(
trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(EXPLICIT_BATCH)
# parse onnx
parser = trt.OnnxParser(network, logger)
if isinstance(onnx_model, str):
onnx_model = onnx.load(onnx_model)
onnx_model = preprocess_onnx(onnx_model)
if not parser.parse(onnx_model.SerializeToString()):
error_msgs = ''
for error in range(parser.num_errors):
error_msgs += f'{parser.get_error(error)}\n'
raise RuntimeError(f'parse onnx failed:\n{error_msgs}')
# config builder
builder.max_workspace_size = max_workspace_size
config = builder.create_builder_config()
config.max_workspace_size = max_workspace_size
profile = builder.create_optimization_profile()
for input_name, param in opt_shape_dict.items():
min_shape = tuple(param[0][:])
opt_shape = tuple(param[1][:])
max_shape = tuple(param[2][:])
profile.set_shape(input_name, min_shape, opt_shape, max_shape)
config.add_optimization_profile(profile)
if fp16_mode:
builder.fp16_mode = fp16_mode
config.set_flag(trt.BuilderFlag.FP16)
# create engine
with torch.cuda.device(device):
engine = builder.build_engine(network, config)
return engine
def save_trt_engine(engine: trt.ICudaEngine, path: str) -> None:
"""Serialize TensorRT engine to disk.
Arguments:
engine (tensorrt.ICudaEngine): TensorRT engine to serialize
path (str): disk path to write the engine
"""
# Following strings of text style are from colorama package
bright_style, reset_style = '\x1b[1m', '\x1b[0m'
red_text, blue_text = '\x1b[31m', '\x1b[34m'
white_background = '\x1b[107m'
msg = white_background + bright_style + red_text
msg += 'DeprecationWarning: This function will be deprecated in future. '
msg += blue_text + 'Welcome to use the unified model deployment toolbox '
msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
msg += reset_style
warnings.warn(msg)
with open(path, mode='wb') as f:
f.write(bytearray(engine.serialize()))
def load_trt_engine(path: str) -> trt.ICudaEngine:
"""Deserialize TensorRT engine from disk.
Arguments:
path (str): disk path to read the engine
Returns:
tensorrt.ICudaEngine: the TensorRT engine loaded from disk
"""
# Following strings of text style are from colorama package
bright_style, reset_style = '\x1b[1m', '\x1b[0m'
red_text, blue_text = '\x1b[31m', '\x1b[34m'
white_background = '\x1b[107m'
msg = white_background + bright_style + red_text
msg += 'DeprecationWarning: This function will be deprecated in future. '
msg += blue_text + 'Welcome to use the unified model deployment toolbox '
msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
msg += reset_style
warnings.warn(msg)
with trt.Logger() as logger, trt.Runtime(logger) as runtime:
with open(path, mode='rb') as f:
engine_bytes = f.read()
engine = runtime.deserialize_cuda_engine(engine_bytes)
return engine
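# A minimal round-trip sketch (assumption; file names are placeholders; the
# three shape lists are the min/opt/max profile of the input 'input'):
engine = onnx2trt(
    'model.onnx', {'input': [[1, 3, 224, 224]] * 3},
    max_workspace_size=1 << 30)
save_trt_engine(engine, 'model.engine')
engine = load_trt_engine('model.engine')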
def torch_dtype_from_trt(dtype: trt.DataType) -> torch.dtype:
    """Convert TensorRT dtype to PyTorch dtype."""
if dtype == trt.bool:
return torch.bool
elif dtype == trt.int8:
return torch.int8
elif dtype == trt.int32:
return torch.int32
elif dtype == trt.float16:
return torch.float16
elif dtype == trt.float32:
return torch.float32
else:
raise TypeError('%s is not supported by torch' % dtype)
def torch_device_from_trt(device: trt.TensorLocation) -> torch.device:
    """Convert TensorRT device to PyTorch device."""
if device == trt.TensorLocation.DEVICE:
return torch.device('cuda')
elif device == trt.TensorLocation.HOST:
return torch.device('cpu')
else:
        raise TypeError('%s is not supported by torch' % device)
class TRTWrapper(torch.nn.Module):
"""TensorRT engine Wrapper.
Arguments:
engine (tensorrt.ICudaEngine): TensorRT engine to wrap
input_names (list[str]): names of each inputs
output_names (list[str]): names of each outputs
Note:
        If the engine is converted from an onnx model, the input_names and
        output_names should be the same as in the onnx model.
"""
def __init__(self, engine, input_names=None, output_names=None):
# Following strings of text style are from colorama package
bright_style, reset_style = '\x1b[1m', '\x1b[0m'
red_text, blue_text = '\x1b[31m', '\x1b[34m'
white_background = '\x1b[107m'
msg = white_background + bright_style + red_text
msg += 'DeprecationWarning: This tool will be deprecated in future. '
msg += blue_text + \
'Welcome to use the unified model deployment toolbox '
msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
msg += reset_style
warnings.warn(msg)
super().__init__()
self.engine = engine
if isinstance(self.engine, str):
self.engine = load_trt_engine(engine)
if not isinstance(self.engine, trt.ICudaEngine):
raise TypeError('engine should be str or trt.ICudaEngine')
self._register_state_dict_hook(TRTWrapper._on_state_dict)
self.context = self.engine.create_execution_context()
# get input and output names from engine
if input_names is None or output_names is None:
names = [_ for _ in self.engine]
input_names = list(filter(self.engine.binding_is_input, names))
output_names = list(set(names) - set(input_names))
self.input_names = input_names
self.output_names = output_names
def _on_state_dict(self, state_dict, prefix, local_metadata):
state_dict[prefix + 'engine'] = bytearray(self.engine.serialize())
state_dict[prefix + 'input_names'] = self.input_names
state_dict[prefix + 'output_names'] = self.output_names
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
engine_bytes = state_dict[prefix + 'engine']
with trt.Logger() as logger, trt.Runtime(logger) as runtime:
self.engine = runtime.deserialize_cuda_engine(engine_bytes)
self.context = self.engine.create_execution_context()
self.input_names = state_dict[prefix + 'input_names']
self.output_names = state_dict[prefix + 'output_names']
def forward(self, inputs):
"""
Arguments:
inputs (dict): dict of input name-tensors pair
Return:
dict: dict of output name-tensors pair
"""
assert self.input_names is not None
assert self.output_names is not None
bindings = [None] * (len(self.input_names) + len(self.output_names))
for input_name, input_tensor in inputs.items():
idx = self.engine.get_binding_index(input_name)
if input_tensor.dtype == torch.long:
input_tensor = input_tensor.int()
self.context.set_binding_shape(idx, tuple(input_tensor.shape))
bindings[idx] = input_tensor.contiguous().data_ptr()
# create output tensors
outputs = {}
for i, output_name in enumerate(self.output_names):
idx = self.engine.get_binding_index(output_name)
dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx))
shape = tuple(self.context.get_binding_shape(idx))
device = torch_device_from_trt(self.engine.get_location(idx))
output = torch.empty(size=shape, dtype=dtype, device=device)
outputs[output_name] = output
bindings[idx] = output.data_ptr()
self.context.execute_async_v2(bindings,
torch.cuda.current_stream().cuda_stream)
return outputs
class TRTWraper(TRTWrapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
warnings.warn(
'TRTWraper will be deprecated in'
' future. Please use TRTWrapper instead', DeprecationWarning)
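# A minimal inference sketch (assumption; file and tensor names are
# placeholders that must match the engine bindings):
trt_model = TRTWrapper('model.engine', ['input'], ['output'])
dummy_input = torch.randn(1, 3, 224, 224).cuda()
with torch.no_grad():
    trt_outputs = trt_model({'input': dummy_input})
result = trt_outputs['output']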
@@ -14,7 +14,7 @@ line_length = 79
multi_line_output = 0
extra_standard_library = pkg_resources,setuptools,logging,os,warnings,abc
known_first_party = mmcv
-known_third_party = addict,cv2,matplotlib,numpy,onnx,onnxruntime,packaging,pytest,pytorch_sphinx_theme,scipy,sphinx,tensorrt,torch,torchvision,yaml,yapf
+known_third_party = addict,cv2,matplotlib,numpy,onnx,packaging,pytest,pytorch_sphinx_theme,scipy,sphinx,torch,torchvision,yaml,yapf
no_lines_before = STDLIB,LOCALFOLDER
default_section = THIRDPARTY
......
@@ -2,7 +2,6 @@ import glob
import os
import platform
import re
import warnings
from pkg_resources import DistributionNotFound, get_distribution
from setuptools import find_packages, setup
@@ -138,65 +137,6 @@ except ImportError:
def get_extensions():
extensions = []
if os.getenv('MMCV_WITH_TRT', '0') != '0':
# Following strings of text style are from colorama package
bright_style, reset_style = '\x1b[1m', '\x1b[0m'
red_text, blue_text = '\x1b[31m', '\x1b[34m'
white_background = '\x1b[107m'
msg = white_background + bright_style + red_text
msg += 'DeprecationWarning: ' + \
'Custom TensorRT Ops will be deprecated in future. '
msg += blue_text + \
'Welcome to use the unified model deployment toolbox '
msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
msg += reset_style
warnings.warn(msg)
ext_name = 'mmcv._ext_trt'
from torch.utils.cpp_extension import include_paths, library_paths
library_dirs = []
libraries = []
include_dirs = []
tensorrt_path = os.getenv('TENSORRT_DIR', '0')
tensorrt_lib_path = glob.glob(
os.path.join(tensorrt_path, 'targets', '*', 'lib'))[0]
library_dirs += [tensorrt_lib_path]
libraries += ['nvinfer', 'nvparsers', 'nvinfer_plugin']
libraries += ['cudart']
define_macros = []
extra_compile_args = {'cxx': []}
include_path = os.path.abspath('./mmcv/ops/csrc/common/cuda')
include_trt_path = os.path.abspath('./mmcv/ops/csrc/tensorrt')
include_dirs.append(include_path)
include_dirs.append(include_trt_path)
include_dirs.append(os.path.join(tensorrt_path, 'include'))
include_dirs += include_paths(cuda=True)
op_files = glob.glob('./mmcv/ops/csrc/tensorrt/plugins/*')
define_macros += [('MMCV_WITH_CUDA', None)]
define_macros += [('MMCV_WITH_TRT', None)]
cuda_args = os.getenv('MMCV_CUDA_ARGS')
extra_compile_args['nvcc'] = [cuda_args] if cuda_args else []
# prevent cub/thrust conflict with other python library
# More context See issues #1454
extra_compile_args['nvcc'] += ['-Xcompiler=-fno-gnu-unique']
library_dirs += library_paths(cuda=True)
from setuptools import Extension
ext_ops = Extension(
name=ext_name,
sources=op_files,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
language='c++',
library_dirs=library_dirs,
libraries=libraries)
extensions.append(ext_ops)
if os.getenv('MMCV_WITH_OPS', '0') == '0':
return extensions
@@ -353,63 +293,6 @@ def get_extensions():
define_macros=define_macros,
extra_compile_args=extra_compile_args)
extensions.append(ext_ops)
if EXT_TYPE == 'pytorch' and os.getenv('MMCV_WITH_ORT', '0') != '0':
# Following strings of text style are from colorama package
bright_style, reset_style = '\x1b[1m', '\x1b[0m'
red_text, blue_text = '\x1b[31m', '\x1b[34m'
white_background = '\x1b[107m'
msg = white_background + bright_style + red_text
msg += 'DeprecationWarning: ' + \
'Custom ONNXRuntime Ops will be deprecated in future. '
msg += blue_text + \
'Welcome to use the unified model deployment toolbox '
msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
msg += reset_style
warnings.warn(msg)
ext_name = 'mmcv._ext_ort'
import onnxruntime
from torch.utils.cpp_extension import include_paths, library_paths
library_dirs = []
libraries = []
include_dirs = []
ort_path = os.getenv('ONNXRUNTIME_DIR', '0')
library_dirs += [os.path.join(ort_path, 'lib')]
libraries.append('onnxruntime')
define_macros = []
extra_compile_args = {'cxx': []}
include_path = os.path.abspath('./mmcv/ops/csrc/onnxruntime')
include_dirs.append(include_path)
include_dirs.append(os.path.join(ort_path, 'include'))
op_files = glob.glob('./mmcv/ops/csrc/onnxruntime/cpu/*')
if onnxruntime.get_device() == 'GPU' or os.getenv('FORCE_CUDA',
'0') == '1':
define_macros += [('MMCV_WITH_CUDA', None)]
cuda_args = os.getenv('MMCV_CUDA_ARGS')
extra_compile_args['nvcc'] = [cuda_args] if cuda_args else []
op_files += glob.glob('./mmcv/ops/csrc/onnxruntime/gpu/*')
include_dirs += include_paths(cuda=True)
library_dirs += library_paths(cuda=True)
else:
include_dirs += include_paths(cuda=False)
library_dirs += library_paths(cuda=False)
from setuptools import Extension
ext_ops = Extension(
name=ext_name,
sources=op_files,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
language='c++',
library_dirs=library_dirs,
libraries=libraries)
extensions.append(ext_ops)
return extensions
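# Build-time sketch (assumption): the optional extensions above were selected
# via environment variables when invoking setup.py, e.g.
#   MMCV_WITH_TRT=1 TENSORRT_DIR=/path/to/TensorRT pip install -e .
#   MMCV_WITH_ORT=1 ONNXRUNTIME_DIR=/path/to/onnxruntime pip install -e .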
......
# Copyright (c) OpenMMLab. All rights reserved.
import os
import warnings
from functools import partial
import numpy as np
@@ -10,7 +9,6 @@ import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
from packaging import version
onnx_file = 'tmp.onnx'
if torch.__version__ == 'parrots':
@@ -40,93 +38,8 @@ class WrapFunction(nn.Module):
return self.wrapped_function(*args, **kwargs)
def process_grid_sample(func, input, grid, ort_custom_op_path=''):
wrapped_model = WrapFunction(func).eval()
input_names = ['input', 'grid']
output_names = ['output']
with torch.no_grad():
torch.onnx.export(
wrapped_model, (input, grid),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=input_names,
output_names=output_names,
opset_version=11)
onnx_model = onnx.load(onnx_file)
session_options = rt.SessionOptions()
if ort_custom_op_path:
session_options.register_custom_ops_library(ort_custom_op_path)
# get onnx output
input_all = [node.name for node in onnx_model.graph.input]
input_initializer = [node.name for node in onnx_model.graph.initializer]
net_feed_input = list(set(input_all) - set(input_initializer))
assert (len(net_feed_input) == 2)
sess = rt.InferenceSession(
onnx_file, session_options, providers=['CPUExecutionProvider'])
ort_result = sess.run(None, {
'input': input.detach().numpy(),
'grid': grid.detach().numpy()
})
pytorch_results = wrapped_model(input.clone(), grid.clone())
assert np.allclose(pytorch_results, ort_result, atol=1e-3)
@pytest.mark.parametrize('mode', ['bilinear', 'nearest'])
@pytest.mark.parametrize('padding_mode', ['zeros', 'border', 'reflection'])
@pytest.mark.parametrize('align_corners', [True, False])
def test_grid_sample(mode, padding_mode, align_corners):
from mmcv.onnx.symbolic import register_extra_symbolics
opset_version = 11
register_extra_symbolics(opset_version)
from mmcv.ops import get_onnxruntime_op_path
ort_custom_op_path = get_onnxruntime_op_path()
if not os.path.exists(ort_custom_op_path):
pytest.skip('custom ops for onnxruntime are not compiled.')
input = torch.rand(1, 1, 10, 10)
grid = torch.Tensor([[[1, 0, 0], [0, 1, 0]]])
grid = F.affine_grid(
grid, (1, 1, 15, 15), align_corners=align_corners).type_as(input)
def func(input, grid):
return F.grid_sample(
input,
grid,
mode=mode,
padding_mode=padding_mode,
align_corners=align_corners)
return process_grid_sample(func, input, grid, ort_custom_op_path)
@pytest.mark.parametrize('align_corners', [True, False])
def test_bilinear_grid_sample(align_corners):
from mmcv.ops.point_sample import bilinear_grid_sample
# only support pytorch >= 1.5.0
if version.parse(torch.__version__) < version.parse('1.5.0'):
pytest.skip('Only support PyTorch >= 1.5.0')
input = torch.rand(1, 1, 10, 10)
grid = torch.Tensor([[[1, 0, 0], [0, 1, 0]]])
grid = F.affine_grid(
grid, (1, 1, 15, 15), align_corners=align_corners).type_as(input)
def func(input, grid):
return bilinear_grid_sample(input, grid, align_corners=align_corners)
return process_grid_sample(func, input, grid)
def test_nms():
-    from mmcv.ops import get_onnxruntime_op_path, nms
+    from mmcv.ops import nms
np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
[3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
dtype=np.float32)
@@ -151,10 +64,7 @@ def test_nms():
opset_version=11)
onnx_model = onnx.load(onnx_file)
ort_custom_op_path = get_onnxruntime_op_path()
session_options = rt.SessionOptions()
if os.path.exists(ort_custom_op_path):
session_options.register_custom_ops_library(ort_custom_op_path)
# get onnx output
input_all = [node.name for node in onnx_model.graph.input]
@@ -171,89 +81,12 @@ def test_nms():
assert np.allclose(pytorch_score, onnx_score, atol=1e-3)
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_softnms():
from mmcv.ops import get_onnxruntime_op_path, soft_nms
# only support pytorch >= 1.7.0
if version.parse(torch.__version__) < version.parse('1.7.0'):
        warnings.warn('test_softnms should be run with pytorch >= 1.7.0')
return
# only support onnxruntime >= 1.5.1
assert version.parse(rt.__version__) >= version.parse(
        '1.5.1'), 'test_softnms should be run with onnxruntime >= 1.5.1'
ort_custom_op_path = get_onnxruntime_op_path()
if not os.path.exists(ort_custom_op_path):
pytest.skip('softnms for onnxruntime is not compiled.')
np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
[3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
dtype=np.float32)
np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)
boxes = torch.from_numpy(np_boxes)
scores = torch.from_numpy(np_scores)
configs = [[0.3, 0.5, 0.01, 'linear'], [0.3, 0.5, 0.01, 'gaussian'],
[0.3, 0.5, 0.01, 'naive']]
session_options = rt.SessionOptions()
session_options.register_custom_ops_library(ort_custom_op_path)
for _iou_threshold, _sigma, _min_score, _method in configs:
pytorch_dets, pytorch_inds = soft_nms(
boxes,
scores,
iou_threshold=_iou_threshold,
sigma=_sigma,
min_score=_min_score,
method=_method)
nms = partial(
soft_nms,
iou_threshold=_iou_threshold,
sigma=_sigma,
min_score=_min_score,
method=_method)
wrapped_model = WrapFunction(nms)
wrapped_model.cpu().eval()
with torch.no_grad():
torch.onnx.export(
wrapped_model, (boxes, scores),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=['boxes', 'scores'],
opset_version=11)
onnx_model = onnx.load(onnx_file)
# get onnx output
input_all = [node.name for node in onnx_model.graph.input]
input_initializer = [
node.name for node in onnx_model.graph.initializer
]
net_feed_input = list(set(input_all) - set(input_initializer))
assert (len(net_feed_input) == 2)
sess = rt.InferenceSession(
onnx_file, session_options, providers=['CPUExecutionProvider'])
onnx_dets, onnx_inds = sess.run(None, {
'scores': scores.detach().numpy(),
'boxes': boxes.detach().numpy()
})
assert np.allclose(pytorch_dets, onnx_dets, atol=1e-3)
        assert np.allclose(pytorch_inds, onnx_inds, atol=1e-3)
def test_roialign():
try:
-        from mmcv.ops import get_onnxruntime_op_path, roi_align
+        from mmcv.ops import roi_align
except (ImportError, ModuleNotFoundError):
pytest.skip('roi_align op is not successfully compiled')
ort_custom_op_path = get_onnxruntime_op_path()
# roi align config
pool_h = 2
pool_w = 2
@@ -295,8 +128,6 @@ def test_roialign():
onnx_model = onnx.load(onnx_file)
session_options = rt.SessionOptions()
if os.path.exists(ort_custom_op_path):
session_options.register_custom_ops_library(ort_custom_op_path)
# compute onnx_output
input_all = [node.name for node in onnx_model.graph.input]
@@ -318,83 +149,6 @@ def test_roialign():
assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
def test_roialign_rotated():
try:
from mmcv.ops import get_onnxruntime_op_path, roi_align_rotated
except (ImportError, ModuleNotFoundError):
        pytest.skip('roi_align_rotated op is not successfully compiled')
ort_custom_op_path = get_onnxruntime_op_path()
if not os.path.exists(ort_custom_op_path):
pytest.skip('custom ops for onnxruntime are not compiled.')
# roi align config
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2
inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0.5, 0.5, 1., 1., 0]]),
([[[[1., 2.], [3., 4.]]]], [[0., 0.5, 0.5, 1., 1., np.pi / 2]]),
([[[[1., 2.], [3., 4.]],
[[4., 3.], [2., 1.]]]], [[0., 0.5, 0.5, 1., 1., 0]]),
([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
[11., 12., 15., 16.]]]], [[0., 1.5, 1.5, 3., 3., 0]]),
([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
[11., 12., 15., 16.]]]], [[0., 1.5, 1.5, 3., 3.,
np.pi / 2]])]
    def wrapped_function(torch_input, torch_rois):
return roi_align_rotated(torch_input, torch_rois, (pool_w, pool_h),
spatial_scale, sampling_ratio, True, False)
for case in inputs:
np_input = np.array(case[0], dtype=np.float32)
np_rois = np.array(case[1], dtype=np.float32)
input = torch.from_numpy(np_input)
rois = torch.from_numpy(np_rois)
# compute pytorch_output
with torch.no_grad():
pytorch_output = roi_align_rotated(input, rois, (pool_w, pool_h),
spatial_scale, sampling_ratio,
True, False)
# export and load onnx model
        wrapped_model = WrapFunction(wrapped_function)
with torch.no_grad():
torch.onnx.export(
wrapped_model, (input, rois),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=['features', 'rois'],
opset_version=11)
onnx_model = onnx.load(onnx_file)
session_options = rt.SessionOptions()
if os.path.exists(ort_custom_op_path):
session_options.register_custom_ops_library(ort_custom_op_path)
# compute onnx_output
input_all = [node.name for node in onnx_model.graph.input]
input_initializer = [
node.name for node in onnx_model.graph.initializer
]
net_feed_input = list(set(input_all) - set(input_initializer))
assert (len(net_feed_input) == 2)
sess = rt.InferenceSession(
onnx_file, session_options, providers=['CPUExecutionProvider'])
onnx_output = sess.run(None, {
'features': input.detach().numpy(),
'rois': rois.detach().numpy()
})
onnx_output = onnx_output[0]
# allclose
assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_roipool():
from mmcv.ops import roi_pool
@@ -483,240 +237,6 @@ def test_interpolate():
assert np.allclose(pytorch_result, onnx_result, atol=1e-3)
def test_rotated_feature_align():
if torch.__version__ == 'parrots':
pytest.skip('onnx is not supported in parrots directly')
try:
from mmcv.ops import get_onnxruntime_op_path, rotated_feature_align
except (ImportError, ModuleNotFoundError):
pytest.skip('rotated_feature_align op is not successfully compiled')
ort_custom_op_path = get_onnxruntime_op_path()
if not os.path.exists(ort_custom_op_path):
pytest.skip('custom ops for onnxruntime are not compiled.')
spatial_scale = 1.0 / 8
points = 1
    def wrapped_function(feature, bbox):
return rotated_feature_align(
feature, bbox, spatial_scale=spatial_scale, points=points)
feature = torch.tensor([[[[1.2924, -0.2172, -0.5222, 0.1172],
[0.9144, 1.2248, 1.3115, -0.9690],
[-0.8949, -1.1797, -0.9093, -0.3961],
[-0.4586, 0.5062, -0.7947, -0.7397]],
[[-1.0943, -0.7495, 1.3461, -1.1652],
[0.2034, 0.6763, -1.2357, 0.5231],
[-1.0062, 1.2592, 1.4225, -0.3951],
[-0.1242, -1.6240, 0.1932, 2.7181]],
[[-1.6271, -1.0276, 0.0578, -0.2997],
[-0.9684, -1.6946, -1.3188, -1.1938],
[-1.6744, -0.8917, -0.6556, 1.0073],
[-0.1205, 0.3671, -0.3731, -0.5347]]],
[[[0.7035, 0.2089, -0.1774, 3.4670],
[-0.8505, -0.9278, 1.4714, 0.1644],
[0.0898, 0.3531, -0.4007, 0.1927],
[1.2569, -0.2636, -0.5223, 0.0616]],
[[0.1760, -0.7639, -0.4600, -1.3260],
[-0.9921, -0.2970, -0.8955, 1.0508],
[1.3515, -0.1641, 1.9679, 1.1986],
[-0.3616, 0.6287, 0.4933, 0.3360]],
[[-0.5860, 0.2124, -0.8700, 2.4200],
[-0.0551, -1.5103, -1.6779, 0.8399],
[0.8431, 1.2414, -1.1243, -0.3887],
[-2.1254, 0.6047, -0.3515, 0.7254]]]])
bbox = torch.tensor(
[[[[1.3080e+01, 1.2688e+01, 1.1214e+01, 9.3944e+01, -9.1905e-01],
[3.8104e+01, 1.0134e+01, 1.4659e+02, 9.0306e+01, -9.8211e-01],
[-5.3213e+01, 4.9508e+01, 5.1513e+01, 3.2055e+01, -3.1954e-01],
[2.6974e+01, 2.5248e+01, 5.4495e+01, 3.1083e+00, -6.2127e-01]],
[[-1.5604e+01, -5.1908e+01, 2.3998e+02, 1.5008e+01, -1.2546e+00],
[3.1354e+01, -7.3635e+00, 6.7879e+01, 3.5081e+01, -3.3851e-01],
[-5.3292e+00, 9.1946e+00, 1.2834e+01, 1.0485e+01, -1.3039e+00],
[-2.3925e+01, 3.6623e+01, 3.9875e+01, 7.2009e+01, -6.5934e-01]],
[[7.2114e+01, -2.3781e+01, 2.9106e+01, 8.4501e+01, -1.1340e+00],
[2.6258e+01, -7.7034e+00, 1.7629e+02, 1.0615e+02, -1.2156e+00],
[3.8057e+01, 4.6016e+01, 1.2965e+01, 6.9384e+00, -1.0855e+00],
[2.4428e+01, -1.6189e+01, 2.0572e+02, 3.1622e+01, -1.5719e-01]],
[[3.8226e+00, 2.9608e+01, 1.4457e+01, 6.8179e+01, -9.1997e-01],
[2.5003e+01, -4.2490e+01, 9.6007e+01, 4.9086e+01, -1.4786e+00],
[8.5983e+01, 5.4980e+01, 7.8080e+01, 1.0003e+02, -1.0926e+00],
[9.9065e+00, 4.1457e+01, 5.9799e+00, 1.7973e+01, -5.6313e-01]]],
[[[-1.8244e+01, 4.6309e+00, 5.3010e+01, 2.4310e+01, -7.0345e-01],
[1.9419e+01, 3.6704e+01, 5.2390e+01, 5.4133e+01, -3.7730e-01],
[5.6387e+01, 2.3752e+01, 9.0441e+00, 1.7792e+01, -1.5583e+00],
[3.6303e+01, 1.6396e+01, 2.0283e+01, 1.9148e+01, -8.3419e-01]],
[[3.2169e+01, 3.0521e+01, 2.6283e+01, 1.9680e+02, -3.0454e-01],
[2.5788e+01, -3.2189e+01, 8.8882e+01, 1.0207e+02, -1.5328e+00],
[8.4676e+00, -1.6668e+01, 2.4657e+01, 1.1275e+02, -4.0388e-01],
[-1.0799e+01, 6.0422e+00, 9.5807e+00, 3.3677e+01, -3.5438e-01]],
[[6.9363e+01, 1.0850e+01, 2.5968e+01, 2.2311e+01, -1.6408e-01],
[2.8140e+00, 4.6843e+00, 3.1289e+00, 2.1480e+01, -6.7583e-01],
[2.6661e+01, 4.5290e+01, 6.1679e+00, 3.0005e+01, -8.9806e-01],
[5.0871e+00, 1.3234e+01, 9.2087e+01, 4.9622e+01, -2.8020e-01]],
[[-1.2643e+01, 2.5176e+01, 5.0488e+01, 5.4246e+01, -4.4840e-01],
[-3.4521e+01, 9.8435e-01, 5.2413e+01, 9.7996e+00, -8.4218e-01],
[4.9829e+01, -1.0808e+01, 2.9848e+01, 7.3579e+01, -6.2672e-01],
[8.0446e+01, 2.8064e+01, 4.5273e+01, 5.3809e+01, -1.2359e+00]]]])
# compute pytorch_output
with torch.no_grad():
pytorch_output = rotated_feature_align(
feature, bbox, spatial_scale=spatial_scale, points=points)
# export and load onnx model
    wrapped_model = WrapFunction(wrapped_function)
with torch.no_grad():
torch.onnx.export(
wrapped_model, (feature, bbox),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=['feature', 'bbox'],
opset_version=11)
onnx_model = onnx.load(onnx_file)
session_options = rt.SessionOptions()
if os.path.exists(ort_custom_op_path):
session_options.register_custom_ops_library(ort_custom_op_path)
# compute onnx_output
input_all = [node.name for node in onnx_model.graph.input]
input_initializer = [node.name for node in onnx_model.graph.initializer]
net_feed_input = list(set(input_all) - set(input_initializer))
assert (len(net_feed_input) == 2)
sess = rt.InferenceSession(
onnx_file, session_options, providers=['CPUExecutionProvider'])
onnx_output = sess.run(None, {
'feature': feature.detach().numpy(),
'bbox': bbox.detach().numpy()
})
onnx_output = onnx_output[0]
# allclose
assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
@pytest.mark.parametrize('mode', ['top', 'bottom', 'left', 'right'])
def test_corner_pool(mode, opset=11):
from mmcv.ops import get_onnxruntime_op_path
ort_custom_op_path = get_onnxruntime_op_path()
if not os.path.exists(ort_custom_op_path):
pytest.skip('custom ops for onnxruntime are not compiled.')
from mmcv.ops.corner_pool import CornerPool
def corner_pool_func(input):
corner_pool_module = CornerPool(mode)
return corner_pool_module.corner_pool.apply(input)
wrapped_model = WrapFunction(corner_pool_func).eval()
input = torch.rand((2, 3, 9, 12)) # (n,c,h,w)
with torch.no_grad():
torch.onnx.export(
wrapped_model,
input,
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=['input'],
output_names=['output'],
opset_version=opset)
onnx_model = onnx.load(onnx_file)
input_all = [node.name for node in onnx_model.graph.input]
input_initializer = [node.name for node in onnx_model.graph.initializer]
net_feed_input = list(set(input_all) - set(input_initializer))
assert (len(net_feed_input) == 1)
session_options = rt.SessionOptions()
session_options.register_custom_ops_library(ort_custom_op_path)
sess = rt.InferenceSession(
onnx_file, session_options, providers=['CPUExecutionProvider'])
ort_result = sess.run(None, {'input': input.detach().numpy()})
pytorch_results = wrapped_model(input.clone())
assert np.allclose(pytorch_results, ort_result, atol=1e-5)
@pytest.mark.parametrize('key', ['cummax', 'cummin'])
def test_cummax_cummin(key, opset=11):
    # Note: generally `cummax` or `cummin` is exportable to ONNX
    # as long as the pytorch version >= 1.5.0, since `torch.cummax`
    # is only supported with torch >= 1.5.0.
    # But when `cummax` or `cummin` serves as an intermediate component
    # whose outputs are used as inputs for other modules, the pytorch
    # version must be >= 1.7.0. Otherwise an error appears like:
    # `RuntimeError: tuple appears in op that does not forward tuples,
    # unsupported kind: prim::PythonOp`.
if version.parse(torch.__version__) < version.parse('1.7.0'):
        pytest.skip('test_cummax_cummin should be run with pytorch >= 1.7.0')
# register custom op `mmcv::cummax` and `mmcv::cummin`
from mmcv.onnx.symbolic import register_extra_symbolics
register_extra_symbolics(opset)
from mmcv.ops import get_onnxruntime_op_path
ort_custom_op_path = get_onnxruntime_op_path()
if not os.path.exists(ort_custom_op_path):
pytest.skip('custom ops for onnxruntime are not compiled.')
input_list = [
# arbitrary shape, e.g. 1-D, 2-D, 3-D, ...
torch.rand((2, 3, 4, 1, 5)),
torch.rand(1),
torch.rand((2, 0, 1)), # tensor.numel() is 0
torch.FloatTensor(), # empty tensor
]
cummax_cummin_funcs = {'cummax': torch.cummax, 'cummin': torch.cummin}
for input in input_list:
ndims = input.dim()
# valid dim range is [-ndims, ndims-1]
# test for all `dim` value which is valid
for dim in range(-ndims, ndims):
cummax_func = partial(cummax_cummin_funcs[key], dim=dim)
wrapped_model = WrapFunction(cummax_func).eval()
with torch.no_grad():
torch.onnx.export(
wrapped_model,
input,
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=['input'],
output_names=['output', 'indices'],
opset_version=opset)
onnx_model = onnx.load(onnx_file)
input_all = [node.name for node in onnx_model.graph.input]
input_initializer = [
node.name for node in onnx_model.graph.initializer
]
net_feed_input = list(set(input_all) - set(input_initializer))
assert (len(net_feed_input) == 1)
session_options = rt.SessionOptions()
session_options.register_custom_ops_library(ort_custom_op_path)
sess = rt.InferenceSession(
onnx_file, session_options, providers=['CPUExecutionProvider'])
ort_output, ort_inds = sess.run(None,
{'input': input.detach().numpy()})
pytorch_output, pytorch_inds = wrapped_model(input.clone())
pytorch_output = pytorch_output.detach().numpy()
pytorch_inds = pytorch_inds.detach().numpy()
assert np.allclose(pytorch_output, ort_output, atol=1e-5)
assert np.all(pytorch_inds == ort_inds)
@pytest.mark.parametrize('shifts_dims_pair', [([-3, 5], [2, 0]), (5, None)])
def test_roll(shifts_dims_pair):
opset = 11
@@ -755,173 +275,120 @@ def test_roll(shifts_dims_pair):
torch.testing.assert_allclose(ort_output, pytorch_output)
@pytest.mark.skipif(
not torch.cuda.is_available(),
reason='modulated_deform_conv2d only supports in GPU')
def test_modulated_deform_conv2d():
try:
from mmcv.ops import ModulatedDeformConv2d, get_onnxruntime_op_path
except (ImportError, ModuleNotFoundError):
pytest.skip('modulated_deform_conv op is not successfully compiled')
ort_custom_op_path = get_onnxruntime_op_path()
if not os.path.exists(ort_custom_op_path):
pytest.skip('custom ops for onnxruntime are not compiled.')
# modulated deform conv config
in_channels = 3
out_channels = 64
stride = 1
padding = 0
dilation = 1
groups = 1
deform_groups = 1
kernel_size = 3
input = torch.rand(1, in_channels, 28, 28).cuda() # (n, c, h, w)
conv_offset = nn.Conv2d(
in_channels=3,
out_channels=deform_groups * 3 * kernel_size * kernel_size,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=True).cuda()
conv_offset.cuda()
out = conv_offset(input)
o1, o2, mask = torch.chunk(out, 3, dim=1)
offset = torch.cat((o1, o2), dim=1)
mask = torch.sigmoid(mask)
model_with_bias = ModulatedDeformConv2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
deform_groups,
bias=True)
model_without_bias = ModulatedDeformConv2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
deform_groups,
bias=False)
models = [model_with_bias.cuda(), model_without_bias.cuda()]
for model in models:
# export and load onnx model
with torch.no_grad():
torch.onnx.export(
model, (input, offset, mask),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=['input', 'offset', 'mask'],
opset_version=11)
-        session_options = rt.SessionOptions()
-        if os.path.exists(ort_custom_op_path):
-            session_options.register_custom_ops_library(ort_custom_op_path)
-        # compute onnx_output
-        sess = rt.InferenceSession(
-            onnx_file, session_options, providers=['CPUExecutionProvider'])
-        onnx_output = sess.run(
-            None, {
-                'input': input.cpu().detach().numpy(),
-                'offset': offset.cpu().detach().numpy(),
-                'mask': mask.cpu().detach().numpy()
-            })[0]
-        # compute pytorch_output
-        with torch.no_grad():
-            pytorch_output = model(input, offset, mask).cpu()
-        # allclose
-        assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
+def _test_symbolic(model, inputs, symbol_name):
+    with torch.no_grad():
+        torch.onnx.export(model, inputs, onnx_file, opset_version=11)
+    import onnx
+    model = onnx.load(onnx_file)
+    nodes = model.graph.node
+    symbol_exist = False
+    for n in nodes:
+        if n.op_type == symbol_name:
+            symbol_exist = True
+    assert symbol_exist
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_border_align():
from mmcv.ops import BorderAlign
model = BorderAlign(2)
input = torch.rand(1, 8, 2, 2).cuda()
boxes = torch.rand(1, 4, 4).cuda()
_test_symbolic(model, (input, boxes), 'MMCVBorderAlign')
def test_deform_conv2d(threshold=1e-3):
try:
from mmcv.ops import DeformConv2d, get_onnxruntime_op_path
except (ImportError, ModuleNotFoundError):
pytest.skip('deform_conv op is not successfully compiled')
ort_custom_op_path = get_onnxruntime_op_path()
if not os.path.exists(ort_custom_op_path):
pytest.skip('custom ops for onnxruntime are not compiled.')
    # deform conv config
in_channels = 1
out_channels = 64
stride = 1
padding = 0
dilation = 1
groups = 1
deform_groups = 1
kernel_size = 2
input = [[[[1., 2., 3.], [0., 1., 2.], [3., 5., 2.]]]]
offset_weight = [[[0.1, 0.4, 0.6, 0.1]], [[0.3, 0.2, 0.1, 0.3]],
[[0.5, 0.5, 0.2, 0.8]], [[0.8, 0.3, 0.9, 0.1]],
[[0.3, 0.1, 0.2, 0.5]], [[0.3, 0.7, 0.5, 0.3]],
[[0.6, 0.2, 0.5, 0.3]], [[0.4, 0.1, 0.8, 0.4]]]
offset_bias = [0.7, 0.1, 0.8, 0.5, 0.6, 0.5, 0.4, 0.7]
deform_weight = [[[0.4, 0.2, 0.1, 0.9]]]
x = torch.tensor(input)
conv_offset = nn.Conv2d(
in_channels=in_channels,
out_channels=deform_groups * 2 * kernel_size * kernel_size,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=True)
conv_offset.weight.data = torch.nn.Parameter(
torch.Tensor(offset_weight).reshape(8, 1, 2, 2))
conv_offset.bias.data = torch.nn.Parameter(
torch.Tensor(offset_bias).reshape(8))
offset = conv_offset(x)
model = DeformConv2d(in_channels, out_channels, kernel_size, stride,
padding, dilation, groups, deform_groups)
model.weight.data = torch.nn.Parameter(
torch.Tensor(deform_weight).reshape(1, 1, 2, 2))
with torch.no_grad():
torch.onnx.export(
model, (x, offset),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=['input', 'offset'],
opset_version=11)
-    session_options = rt.SessionOptions()
-    if os.path.exists(ort_custom_op_path):
-        session_options.register_custom_ops_library(ort_custom_op_path)
-    # compute onnx_output
-    sess = rt.InferenceSession(
-        onnx_file, session_options, providers=['CPUExecutionProvider'])
-    onnx_output = sess.run(
-        None, {
-            'input': x.cpu().detach().numpy(),
-            'offset': offset.cpu().detach().numpy(),
-        })[0]
-    # compute pytorch_output
-    with torch.no_grad():
-        pytorch_output = model(x, offset).cpu()
-    # allclose
-    assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
+@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
+def test_carafe():
+    from mmcv.ops import CARAFENaive
+    feat = torch.randn(2, 64, 3, 3, device='cuda').double()
+    mask = torch.randn(2, 100, 6, 6, device='cuda').sigmoid().double()
+    _test_symbolic(CARAFENaive(5, 4, 2), (feat, mask), 'MMCVCARAFENaive')
+@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
+def test_deform_conv():
+    from mmcv.ops import DeformConv2dPack
+    x = torch.randn(1, 2, 4, 4, device='cuda')
+    _test_symbolic(
+        DeformConv2dPack(2, 4, 3, 1, 1).cuda(), x, 'MMCVDeformConv2d')
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_modulated_deform_conv():
from mmcv.ops import ModulatedDeformConv2dPack
x = torch.randn(1, 2, 4, 4, device='cuda')
_test_symbolic(
ModulatedDeformConv2dPack(2, 4, 3, 1, 1).cuda(), x,
'MMCVModulatedDeformConv2d')
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_deform_roi_pool():
from mmcv.ops import DeformRoIPoolPack
x = torch.tensor([[[[1., 2.], [3., 4.]]]], device='cuda')
rois = torch.tensor([[0., 0., 0., 1., 1.]], device='cuda')
output_c = x.size(1)
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2
model = DeformRoIPoolPack((pool_h, pool_w),
output_c,
spatial_scale=spatial_scale,
sampling_ratio=sampling_ratio).cuda()
_test_symbolic(model, (x, rois), 'MMCVDeformRoIPool')
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_masked_conv():
from mmcv.ops import MaskedConv2d
x = torch.rand(1, 2, 4, 4, device='cuda')
mask = torch.rand(1, 4, 4, device='cuda')
_test_symbolic(
MaskedConv2d(2, 4, 3, 1, 1).cuda(), (x, mask), 'MMCVMaskedConv2d')
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_pr_roi_pool():
from mmcv.ops import PrRoIPool
pool_h = 2
pool_w = 2
spatial_scale = 1.0
x = torch.tensor([[[[1., 2.], [3., 4.]]]], device='cuda')
rois = torch.tensor([[0., 0., 0., 1., 1.]], device='cuda')
model = PrRoIPool((pool_h, pool_w), spatial_scale).cuda()
_test_symbolic(model, (x, rois), 'PrRoIPool')
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_psa_mask():
from mmcv.ops import PSAMask
input = torch.rand(4, 16, 8, 8).cuda()
model = PSAMask('collect', (4, 4)).cuda()
_test_symbolic(model, input, 'MMCVPSAMask')
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_roi_align_rotated():
from mmcv.ops import RoIAlignRotated
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2
x = torch.tensor([[[[1., 2.], [3., 4.]]]], device='cuda')
rois = torch.tensor([[0., 0.5, 0.5, 1., 1., 0]], device='cuda')
model = RoIAlignRotated((pool_h, pool_w), spatial_scale,
sampling_ratio).cuda()
_test_symbolic(model, (x, rois), 'MMCVRoIAlignRotated')
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_roi_feature_align():
from mmcv.ops import rotated_feature_align
wrapped_model = WrapFunction(rotated_feature_align)
feature = torch.rand(1, 1, 2, 2, device='cuda')
bbox = torch.rand(1, 2, 2, 5, device='cuda')
_test_symbolic(wrapped_model, (feature, bbox), 'MMCVRotatedFeatureAlign')
# Copyright (c) OpenMMLab. All rights reserved.
import os
from functools import partial
from typing import Callable
import mmengine
import numpy as np
import onnx
import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
try:
from mmcv.tensorrt import (TRTWrapper, is_tensorrt_plugin_loaded, onnx2trt,
save_trt_engine)
except ImportError:
pytest.skip(
'TensorRT should be installed from source.', allow_module_level=True)
if not torch.cuda.is_available():
pytest.skip(
'CUDA is required for this test module', allow_module_level=True)
if not is_tensorrt_plugin_loaded():
pytest.skip(
        'Test requires TensorRT plugins to be compiled in mmcv',
allow_module_level=True)
class WrapFunction(nn.Module):
def __init__(self, wrapped_function):
super().__init__()
self.wrapped_function = wrapped_function
def forward(self, *args, **kwargs):
return self.wrapped_function(*args, **kwargs)
onnx_file = 'tmp.onnx'
trt_file = 'tmp.engine'
def test_roialign():
try:
from mmcv.ops import RoIAlign
except (ImportError, ModuleNotFoundError):
pytest.skip('test requires compilation')
# trt config
fp16_mode = False
max_workspace_size = 1 << 30
# roi align config
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2
inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),
([[[[1., 2.], [3., 4.]], [[4., 3.],
[2., 1.]]]], [[0., 0., 0., 1., 1.]]),
([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
[11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]
wrapped_model = RoIAlign((pool_w, pool_h), spatial_scale, sampling_ratio,
'avg', True).cuda()
for case in inputs:
np_input = np.array(case[0], dtype=np.float32)
np_rois = np.array(case[1], dtype=np.float32)
input = torch.from_numpy(np_input).cuda()
rois = torch.from_numpy(np_rois).cuda()
with torch.no_grad():
torch.onnx.export(
wrapped_model, (input, rois),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=['input', 'rois'],
output_names=['roi_feat'],
opset_version=11)
onnx_model = onnx.load(onnx_file)
# create trt engine and wrapper
opt_shape_dict = {
'input': [list(input.shape),
list(input.shape),
list(input.shape)],
'rois': [list(rois.shape),
list(rois.shape),
list(rois.shape)]
}
trt_engine = onnx2trt(
onnx_model,
opt_shape_dict,
fp16_mode=fp16_mode,
max_workspace_size=max_workspace_size)
save_trt_engine(trt_engine, trt_file)
trt_model = TRTWrapper(trt_file, ['input', 'rois'], ['roi_feat'])
with torch.no_grad():
trt_outputs = trt_model({'input': input, 'rois': rois})
trt_roi_feat = trt_outputs['roi_feat']
# compute pytorch_output
with torch.no_grad():
pytorch_roi_feat = wrapped_model(input, rois)
# allclose
if os.path.exists(onnx_file):
os.remove(onnx_file)
if os.path.exists(trt_file):
os.remove(trt_file)
assert torch.allclose(pytorch_roi_feat, trt_roi_feat)
def test_nms():
try:
from mmcv.ops import nms
except (ImportError, ModuleNotFoundError):
pytest.skip('test requires compilation')
os.environ['ONNX_BACKEND'] = 'MMCVTensorRT'
# trt config
fp16_mode = False
max_workspace_size = 1 << 30
data = mmengine.load('./tests/data/batched_nms_data.pkl')
boxes = torch.from_numpy(data['boxes']).cuda()
scores = torch.from_numpy(data['scores']).cuda()
nms = partial(
nms, iou_threshold=0.7, offset=0, score_threshold=0.1, max_num=100)
wrapped_model = WrapFunction(nms)
wrapped_model.cpu().eval()
with torch.no_grad():
torch.onnx.export(
wrapped_model, (boxes.detach().cpu(), scores.detach().cpu()),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=['boxes', 'scores'],
output_names=['dets', 'inds'],
opset_version=11)
onnx_model = onnx.load(onnx_file)
# create trt engine and wrapper
opt_shape_dict = {
'boxes': [list(boxes.shape),
list(boxes.shape),
list(boxes.shape)],
'scores': [list(scores.shape),
list(scores.shape),
list(scores.shape)]
}
trt_engine = onnx2trt(
onnx_model,
opt_shape_dict,
fp16_mode=fp16_mode,
max_workspace_size=max_workspace_size)
save_trt_engine(trt_engine, trt_file)
trt_model = TRTWrapper(trt_file, ['boxes', 'scores'], ['dets', 'inds'])
with torch.no_grad():
trt_outputs = trt_model({'boxes': boxes, 'scores': scores})
trt_dets = trt_outputs['dets']
trt_inds = trt_outputs['inds']
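        # TensorRT emits int32 indices; cast to int64 so they can be
        # compared with the PyTorch indices below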
trt_inds = trt_inds.long()
# compute pytorch_output
with torch.no_grad():
pytorch_outputs = wrapped_model(boxes, scores)
pytorch_dets, pytorch_inds = pytorch_outputs
# allclose
if os.path.exists(onnx_file):
os.remove(onnx_file)
if os.path.exists(trt_file):
os.remove(trt_file)
num_boxes = pytorch_dets.shape[0]
trt_dets = trt_dets[:num_boxes, ...]
trt_inds = trt_inds[:num_boxes]
trt_scores = trt_dets[:, 4]
pytorch_scores = pytorch_dets[:, 4]
os.environ.pop('ONNX_BACKEND')
assert torch.allclose(pytorch_scores, trt_scores, atol=1e-3)
assert torch.equal(pytorch_inds, trt_inds)
def test_batched_nms():
try:
from mmcv.ops import batched_nms
except (ImportError, ModuleNotFoundError):
pytest.skip('test requires compilation')
# trt config
os.environ['ONNX_BACKEND'] = 'MMCVTensorRT'
fp16_mode = False
max_workspace_size = 1 << 30
data = mmengine.load('./tests/data/batched_nms_data.pkl')
nms_cfg = dict(type='nms', iou_threshold=0.7, score_threshold=0.1)
boxes = torch.from_numpy(data['boxes']).cuda()
scores = torch.from_numpy(data['scores']).cuda()
idxs = torch.from_numpy(data['idxs']).cuda()
class_agnostic = False
nms = partial(batched_nms, nms_cfg=nms_cfg, class_agnostic=class_agnostic)
wrapped_model = WrapFunction(nms)
wrapped_model.cpu().eval()
input_data = (boxes.detach().cpu(), scores.detach().cpu(),
idxs.detach().cpu())
input_names = ['boxes', 'scores', 'idxs']
output_names = ['dets', 'inds']
with torch.no_grad():
torch.onnx.export(
wrapped_model,
input_data,
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=input_names,
output_names=output_names,
opset_version=11)
onnx_model = onnx.load(onnx_file)
# create trt engine and wrapper
opt_shape_dict = {
'boxes': [list(boxes.shape),
list(boxes.shape),
list(boxes.shape)],
'scores': [list(scores.shape),
list(scores.shape),
list(scores.shape)],
'idxs': [list(idxs.shape),
list(idxs.shape),
list(idxs.shape)]
}
trt_engine = onnx2trt(
onnx_model,
opt_shape_dict,
fp16_mode=fp16_mode,
max_workspace_size=max_workspace_size)
save_trt_engine(trt_engine, trt_file)
trt_model = TRTWrapper(trt_file, input_names, output_names)
with torch.no_grad():
trt_outputs = trt_model({
'boxes': boxes,
'scores': scores,
'idxs': idxs
})
trt_dets = trt_outputs['dets']
trt_inds = trt_outputs['inds']
trt_inds = trt_inds.long()
# compute pytorch_output
with torch.no_grad():
pytorch_outputs = wrapped_model(boxes, scores, idxs)
pytorch_dets, pytorch_inds = pytorch_outputs
# allclose
if os.path.exists(onnx_file):
os.remove(onnx_file)
if os.path.exists(trt_file):
os.remove(trt_file)
num_boxes = pytorch_dets.shape[0]
trt_dets = trt_dets[:num_boxes, ...]
trt_inds = trt_inds[:num_boxes]
trt_scores = trt_dets[:, 4]
pytorch_scores = pytorch_dets[:, 4]
os.environ.pop('ONNX_BACKEND')
assert torch.allclose(pytorch_scores, trt_scores)
assert torch.equal(pytorch_inds, trt_inds)
def test_scatternd():
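    # the in-place slice assignments below lower to ONNX ScatterND
    # nodes when exported with opset 11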
def func(data):
data[:, :-2] += 1
data[:2, :] -= 1
return data
data = torch.zeros(4, 4).cuda()
wrapped_model = WrapFunction(func).eval().cuda()
input_names = ['input']
output_names = ['output']
with torch.no_grad():
torch.onnx.export(
wrapped_model, (data.clone(), ),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=input_names,
output_names=output_names,
opset_version=11)
onnx_model = onnx.load(onnx_file)
# create trt engine and wrapper
opt_shape_dict = {
'input': [list(data.shape),
list(data.shape),
list(data.shape)],
}
# trt config
fp16_mode = False
max_workspace_size = 1 << 30
trt_engine = onnx2trt(
onnx_model,
opt_shape_dict,
fp16_mode=fp16_mode,
max_workspace_size=max_workspace_size)
save_trt_engine(trt_engine, trt_file)
trt_model = TRTWrapper(trt_file, input_names, output_names)
with torch.no_grad():
trt_outputs = trt_model({'input': data.clone()})
trt_results = trt_outputs['output']
# compute pytorch_output
with torch.no_grad():
pytorch_results = wrapped_model(data.clone())
# allclose
if os.path.exists(onnx_file):
os.remove(onnx_file)
if os.path.exists(trt_file):
os.remove(trt_file)
assert torch.allclose(pytorch_results, trt_results)
def test_deform_conv():
try:
from mmcv.ops import DeformConv2dPack
except (ImportError, ModuleNotFoundError):
pytest.skip('test requires compilation')
input = [[[[1., 2., 3.], [0., 1., 2.], [3., 5., 2.]]]]
offset_weight = [[[0.1, 0.4, 0.6, 0.1]], [[0.3, 0.2, 0.1, 0.3]],
[[0.5, 0.5, 0.2, 0.8]], [[0.8, 0.3, 0.9, 0.1]],
[[0.3, 0.1, 0.2, 0.5]], [[0.3, 0.7, 0.5, 0.3]],
[[0.6, 0.2, 0.5, 0.3]], [[0.4, 0.1, 0.8, 0.4]]]
offset_bias = [0.7, 0.1, 0.8, 0.5, 0.6, 0.5, 0.4, 0.7]
deform_weight = [[[0.4, 0.2, 0.1, 0.9]]]
c_in = 1
c_out = 1
x = torch.Tensor(input).cuda()
x.requires_grad = True
model = DeformConv2dPack(c_in, c_out, 2, stride=1, padding=0)
model.conv_offset.weight.data = torch.nn.Parameter(
torch.Tensor(offset_weight).reshape(8, 1, 2, 2))
model.conv_offset.bias.data = torch.nn.Parameter(
torch.Tensor(offset_bias).reshape(8))
model.weight.data = torch.nn.Parameter(
torch.Tensor(deform_weight).reshape(1, 1, 2, 2))
model.cuda().eval()
input_names = ['input']
output_names = ['output']
with torch.no_grad():
torch.onnx.export(
model, (x.clone(), ),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=input_names,
output_names=output_names,
opset_version=11)
onnx_model = onnx.load(onnx_file)
# create trt engine and wrapper
opt_shape_dict = {
'input': [list(x.shape), list(x.shape),
list(x.shape)],
}
# trt config
fp16_mode = False
max_workspace_size = 1 << 30
trt_engine = onnx2trt(
onnx_model,
opt_shape_dict,
fp16_mode=fp16_mode,
max_workspace_size=max_workspace_size)
save_trt_engine(trt_engine, trt_file)
trt_model = TRTWrapper(trt_file, input_names, output_names)
with torch.no_grad():
trt_outputs = trt_model({'input': x.clone()})
trt_results = trt_outputs['output']
# compute pytorch_output
with torch.no_grad():
pytorch_results = model(x.clone())
# allclose
if os.path.exists(onnx_file):
os.remove(onnx_file)
if os.path.exists(trt_file):
os.remove(trt_file)
assert torch.allclose(pytorch_results, trt_results)
@pytest.mark.parametrize('with_bias', [True, False])
def test_modulated_deform_conv(with_bias):
try:
from mmcv.ops import ModulatedDeformConv2dPack
except (ImportError, ModuleNotFoundError):
pytest.skip('test requires compilation')
input = [[[[1., 2., 3.], [0., 1., 2.], [3., 5., 2.]]]]
x = torch.Tensor(input).cuda()
model = ModulatedDeformConv2dPack(
1,
1,
kernel_size=(2, 2),
stride=1,
padding=1,
deform_groups=1,
bias=with_bias)
model.weight.data.fill_(1.)
model.type(torch.float32)
model = model.cuda().eval()
input_names = ['input']
output_names = ['output']
with torch.no_grad():
torch.onnx.export(
model, (x.clone(), ),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=input_names,
output_names=output_names,
opset_version=11)
onnx_model = onnx.load(onnx_file)
# create trt engine and wrapper
opt_shape_dict = {
'input': [list(x.shape), list(x.shape),
list(x.shape)],
}
# trt config
fp16_mode = False
max_workspace_size = 1 << 30
trt_engine = onnx2trt(
onnx_model,
opt_shape_dict,
fp16_mode=fp16_mode,
max_workspace_size=max_workspace_size)
save_trt_engine(trt_engine, trt_file)
trt_model = TRTWrapper(trt_file, input_names, output_names)
with torch.no_grad():
trt_outputs = trt_model({'input': x.clone()})
trt_results = trt_outputs['output']
# compute pytorch_output
with torch.no_grad():
pytorch_results = model(x.clone())
# allclose
if os.path.exists(onnx_file):
os.remove(onnx_file)
if os.path.exists(trt_file):
os.remove(trt_file)
torch.testing.assert_allclose(pytorch_results, trt_results)
@pytest.mark.parametrize('mode', ['bilinear', 'nearest'])
@pytest.mark.parametrize('padding_mode', ['zeros', 'border', 'reflection'])
@pytest.mark.parametrize('align_corners', [True, False])
def test_grid_sample(mode, padding_mode, align_corners):
from mmcv.onnx.symbolic import register_extra_symbolics
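    # registers mmcv's extra symbolic functions (e.g. `grid_sampler`)
    # so that F.grid_sample can be exported with opset 11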
register_extra_symbolics(11)
input = torch.rand(1, 1, 10, 10).cuda()
grid = torch.Tensor([[[1, 0, 0], [0, 1, 0]]])
grid = F.affine_grid(grid, (1, 1, 15, 15)).type_as(input).cuda()
def func(input, grid):
return F.grid_sample(
input,
grid,
mode=mode,
padding_mode=padding_mode,
align_corners=align_corners)
wrapped_model = WrapFunction(func).eval().cuda()
input_names = ['input', 'grid']
output_names = ['output']
with torch.no_grad():
torch.onnx.export(
wrapped_model, (input.clone(), grid.clone()),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=input_names,
output_names=output_names,
opset_version=11)
onnx_model = onnx.load(onnx_file)
# create trt engine and wrapper
opt_shape_dict = {
'input': [list(input.shape),
list(input.shape),
list(input.shape)],
'grid': [list(grid.shape),
list(grid.shape),
list(grid.shape)],
}
# trt config
fp16_mode = False
max_workspace_size = 1 << 30
trt_engine = onnx2trt(
onnx_model,
opt_shape_dict,
fp16_mode=fp16_mode,
max_workspace_size=max_workspace_size)
save_trt_engine(trt_engine, trt_file)
trt_model = TRTWrapper(trt_file, input_names, output_names)
with torch.no_grad():
trt_outputs = trt_model({'input': input.clone(), 'grid': grid.clone()})
trt_results = trt_outputs['output']
# compute pytorch_output
with torch.no_grad():
pytorch_results = wrapped_model(input.clone(), grid.clone())
# allclose
if os.path.exists(onnx_file):
os.remove(onnx_file)
if os.path.exists(trt_file):
os.remove(trt_file)
assert torch.allclose(pytorch_results, trt_results)
@pytest.mark.parametrize('func', [torch.cummax, torch.cummin])
def test_cummin_cummax(func: Callable):
    # Note: generally `cummax` or `cummin` is exportable to ONNX as
    # long as the PyTorch version >= 1.5.0, since `torch.cummax` is
    # only supported from torch 1.5.0 onwards.
    # But when `cummax` or `cummin` serves as an intermediate component
    # whose outputs are used as inputs to other modules, the PyTorch
    # version must be >= 1.7.0. Otherwise an error appears like:
    # `RuntimeError: tuple appears in op that does not forward tuples,
    # unsupported kind: prim::PythonOp`.
from packaging import version
if version.parse(torch.__version__) < version.parse('1.7.0'):
        pytest.skip('test_cummin_cummax should be run with pytorch >= 1.7.0')
opset = 11
# register custom op `mmcv::cummax` and `mmcv::cummin`
from mmcv.onnx.symbolic import register_extra_symbolics
register_extra_symbolics(opset)
input_list = [
# arbitrary shape, e.g. 1-D, 2-D, 3-D, ...
torch.rand((2, 3, 4, 1, 5)).cuda(),
torch.rand(1).cuda()
]
input_names = ['input']
output_names = ['output', 'indices']
for input in input_list:
ndims = input.dim()
        # the valid range of `dim` is [-ndims, ndims - 1];
        # test every valid value
for dim in range(-ndims, ndims):
cummax_func = partial(func, dim=dim)
wrapped_model = WrapFunction(cummax_func).eval().cuda()
with torch.no_grad():
torch.onnx.export(
wrapped_model,
input,
onnx_file,
export_params=True,
keep_initializers_as_inputs=False,
input_names=input_names,
output_names=output_names,
opset_version=opset)
onnx_model = onnx.load(onnx_file)
# create trt engine and wrapper
opt_shape_dict = {
'input':
[list(input.shape),
list(input.shape),
list(input.shape)]
}
# trt config
fp16_mode = False
max_workspace_size = 1 << 30
trt_engine = onnx2trt(
onnx_model,
opt_shape_dict,
fp16_mode=fp16_mode,
max_workspace_size=max_workspace_size)
# remove ONNX model after conversion
if os.path.exists(onnx_file):
os.remove(onnx_file)
# save TensorRT model
save_trt_engine(trt_engine, trt_file)
# load and wrap TensorRT model
trt_model = TRTWrapper(trt_file)
# remove trt model after loading
if os.path.exists(trt_file):
os.remove(trt_file)
# compute trt output
with torch.no_grad():
trt_results = trt_model({'input': input.contiguous().clone()})
trt_output = trt_results['output']
trt_indices = trt_results['indices']
# compute pytorch output
with torch.no_grad():
pytorch_results = wrapped_model(input.clone())
pytorch_output = pytorch_results[0]
pytorch_indices = pytorch_results[1]
torch.testing.assert_allclose(trt_output, pytorch_output)
torch.testing.assert_allclose(trt_indices, pytorch_indices)
@pytest.mark.parametrize('dynamic_export', [True, False])
@pytest.mark.parametrize('fp16_mode', [True, False])
def test_instance_norm(dynamic_export, fp16_mode):
n, c, h, w = 2, 3, 10, 10
data = torch.randn(n, c, h, w).cuda()
norm = nn.InstanceNorm2d(c, affine=True)
wrapped_model = WrapFunction(norm).eval().cuda()
input_names = ['input']
output_names = ['output']
dynamic_axes = None
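    # when exporting dynamically, mark batch, height and width as
    # symbolic axes so the ONNX model accepts variable input sizes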
if dynamic_export:
dynamic_axes = {
'input': {
0: 'n',
2: 'h',
3: 'w',
},
'output': {
0: 'n',
2: 'h',
3: 'w',
},
}
with torch.no_grad():
torch.onnx.export(
wrapped_model, (data.clone(), ),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=input_names,
output_names=output_names,
dynamic_axes=dynamic_axes,
opset_version=11)
onnx_model = onnx.load(onnx_file)
# create trt engine and wrapper
if dynamic_export:
opt_shape_dict = {
'input':
[list(data.shape),
list(data.shape), [2 * n, c, 2 * h, 2 * w]],
}
else:
opt_shape_dict = {
'input': [list(data.shape),
list(data.shape),
list(data.shape)],
}
# trt config
max_workspace_size = 1 << 30
trt_engine = onnx2trt(
onnx_model,
opt_shape_dict,
fp16_mode=fp16_mode,
max_workspace_size=max_workspace_size)
save_trt_engine(trt_engine, trt_file)
trt_model = TRTWrapper(trt_file, input_names, output_names)
with torch.no_grad():
trt_outputs = trt_model({'input': data.clone()})
trt_results = trt_outputs['output']
# compute pytorch_output
with torch.no_grad():
pytorch_results = wrapped_model(data.clone())
# allclose
if os.path.exists(onnx_file):
os.remove(onnx_file)
if os.path.exists(trt_file):
os.remove(trt_file)
assert torch.allclose(pytorch_results, trt_results)
@pytest.mark.parametrize('mode', ['top', 'bottom', 'left', 'right'])
def test_corner_pool(mode):
try:
from mmcv.ops import CornerPool
except (ImportError, ModuleNotFoundError):
pytest.skip('test requires compilation')
opset = 11
# register custom op `mmcv::MMCVCornerPool`
from mmcv.onnx.symbolic import register_extra_symbolics
register_extra_symbolics(opset)
# trt config
fp16_mode = False
max_workspace_size = 1 << 30
inputs = [
# (n, c, h, w)
torch.rand((2, 3, 5, 5)),
torch.rand((1, 2, 4, 6)),
torch.rand((2, 1, 3, 2)),
]
class CornerPoolWrapper(CornerPool):
def __init__(self, mode):
super().__init__(mode)
def forward(self, x):
            # do not use `torch.cummax`; use `corner_pool` instead so
            # the test works across torch versions
return self.corner_pool.apply(x)
wrapped_model = CornerPoolWrapper(mode).cuda()
for input in inputs:
input = input.cuda()
with torch.no_grad():
torch.onnx.export(
wrapped_model, (input, ),
onnx_file,
export_params=True,
keep_initializers_as_inputs=True,
input_names=['input'],
output_names=['output'],
opset_version=opset)
onnx_model = onnx.load(onnx_file)
# create trt engine and wrapper
opt_shape_dict = {
'input': [list(input.shape),
list(input.shape),
list(input.shape)],
}
trt_engine = onnx2trt(
onnx_model,
opt_shape_dict,
fp16_mode=fp16_mode,
max_workspace_size=max_workspace_size)
save_trt_engine(trt_engine, trt_file)
trt_model = TRTWrapper(trt_file, ['input'], ['output'])
with torch.no_grad():
trt_outputs = trt_model({'input': input})
trt_pool_feat = trt_outputs['output']
# compute pytorch_output
with torch.no_grad():
pytorch_pool_feat = wrapped_model(input)
# allclose
if os.path.exists(onnx_file):
os.remove(onnx_file)
if os.path.exists(trt_file):
os.remove(trt_file)
assert torch.allclose(pytorch_pool_feat, trt_pool_feat, atol=1e-5)
# Copyright (c) OpenMMLab. All rights reserved.
import os
from functools import wraps
import onnx
import pytest
import torch
from mmcv.ops import nms
from mmcv.tensorrt.preprocess import preprocess_onnx
if torch.__version__ == 'parrots':
pytest.skip('not supported in parrots now', allow_module_level=True)
def remove_tmp_file(func):
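    """Pass a temporary ONNX file path to the wrapped function and
    delete the file once the function returns."""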
@wraps(func)
def wrapper(*args, **kwargs):
onnx_file = 'tmp.onnx'
kwargs['onnx_file'] = onnx_file
try:
result = func(*args, **kwargs)
finally:
if os.path.exists(onnx_file):
os.remove(onnx_file)
return result
return wrapper
@remove_tmp_file
def export_nms_module_to_onnx(module, onnx_file):
torch_model = module()
torch_model.eval()
input = (torch.rand([100, 4], dtype=torch.float32),
torch.rand([100], dtype=torch.float32))
torch.onnx.export(
torch_model,
input,
onnx_file,
opset_version=11,
input_names=['boxes', 'scores'],
output_names=['output'])
onnx_model = onnx.load(onnx_file)
return onnx_model
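
# `preprocess_onnx` rewrites `NonMaxSuppression` nodes so that the
# values the TensorRT plugin needs (e.g. max_output_boxes_per_class)
# are stored as node attributes; the tests below verify that layout.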
def test_can_handle_nms_with_constant_maxnum():
class ModuleNMS(torch.nn.Module):
def forward(self, boxes, scores):
return nms(boxes, scores, iou_threshold=0.4, max_num=10)
onnx_model = export_nms_module_to_onnx(ModuleNMS)
preprocess_onnx_model = preprocess_onnx(onnx_model)
for node in preprocess_onnx_model.graph.node:
if 'NonMaxSuppression' in node.name:
assert len(node.attribute) == 5, 'The NMS must have 5 attributes.'
def test_can_handle_nms_with_undefined_maxnum():
class ModuleNMS(torch.nn.Module):
def forward(self, boxes, scores):
return nms(boxes, scores, iou_threshold=0.4)
onnx_model = export_nms_module_to_onnx(ModuleNMS)
preprocess_onnx_model = preprocess_onnx(onnx_model)
for node in preprocess_onnx_model.graph.node:
if 'NonMaxSuppression' in node.name:
assert len(node.attribute) == 5, \
'The NMS must have 5 attributes.'
assert node.attribute[2].i > 0, \
'The max_output_boxes_per_class is not defined correctly.'