"docs/zh_cn/conf.py" did not exist on "da39212f94a3d433cf99e55cfecd77ebf359e0ca"
Commit 91da9643 authored by limm's avatar limm
Browse files

support v2.1.0

parent 6f674c7e
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from functools import partial
from typing import Dict, Optional, Tuple, Union

import torch
@@ -14,6 +15,56 @@ from .norm import build_norm_layer
from .padding import build_padding_layer
def efficient_conv_bn_eval_forward(bn: _BatchNorm,
                                   conv: nn.modules.conv._ConvNd,
                                   x: torch.Tensor):
    """
    Implementation based on https://arxiv.org/abs/2305.11624
    "Tune-Mode ConvBN Blocks For Efficient Transfer Learning"
    It leverages the associative law between convolution and affine transform,
    i.e., normalize (weight conv feature) = (normalize weight) conv feature.
    It works for Eval mode of ConvBN blocks during validation, and can be used
    for training as well. It reduces memory and computation cost.

    Args:
        bn (_BatchNorm): a BatchNorm module.
        conv (nn._ConvNd): a conv module
        x (torch.Tensor): Input feature map.
    """
    # These lines of code are designed to deal with various cases
    # like bn without affine transform, and conv without bias
    weight_on_the_fly = conv.weight
    if conv.bias is not None:
        bias_on_the_fly = conv.bias
    else:
        bias_on_the_fly = torch.zeros_like(bn.running_var)

    if bn.weight is not None:
        bn_weight = bn.weight
    else:
        bn_weight = torch.ones_like(bn.running_var)

    if bn.bias is not None:
        bn_bias = bn.bias
    else:
        bn_bias = torch.zeros_like(bn.running_var)

    # shape of [C_out, 1, 1, 1] in Conv2d
    weight_coeff = torch.rsqrt(bn.running_var +
                               bn.eps).reshape([-1] + [1] *
                                               (len(conv.weight.shape) - 1))
    # shape of [C_out, 1, 1, 1] in Conv2d
    coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff

    # shape of [C_out, C_in, k, k] in Conv2d
    weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly
    # shape of [C_out] in Conv2d
    bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() * \
        (bias_on_the_fly - bn.running_mean)

    return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly)
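A minimal sanity-check sketch, assuming the function above is in scope: with bn in eval mode, batch norm reduces to a per-channel affine transform, so the fused single-conv path should match a conventional conv followed by bn.

import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, kernel_size=3, padding=1, bias=True)
bn = nn.BatchNorm2d(8)
conv.eval()
bn.eval()  # running statistics are used, which is what the fused path assumes

x = torch.randn(2, 3, 16, 16)
with torch.no_grad():
    y_ref = bn(conv(x))                                    # conventional conv -> bn
    y_fused = efficient_conv_bn_eval_forward(bn, conv, x)  # single fused conv
print(torch.allclose(y_ref, y_fused, atol=1e-5))  # expected: True
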
@MODELS.register_module()
class ConvModule(nn.Module):
    """A conv block that bundles conv/norm/activation layers.
@@ -65,6 +116,9 @@ class ConvModule(nn.Module):
            sequence of "conv", "norm" and "act". Common examples are
            ("conv", "norm", "act") and ("act", "conv", "norm").
            Default: ('conv', 'norm', 'act').
        efficient_conv_bn_eval (bool): Whether to use the efficient conv
            when the consecutive bn is in eval mode (either during training
            or testing), as proposed in https://arxiv.org/abs/2305.11624 .
            Default: `False`.
""" """
_abbr_ = 'conv_block' _abbr_ = 'conv_block'
...@@ -84,7 +138,8 @@ class ConvModule(nn.Module): ...@@ -84,7 +138,8 @@ class ConvModule(nn.Module):
inplace: bool = True, inplace: bool = True,
with_spectral_norm: bool = False, with_spectral_norm: bool = False,
padding_mode: str = 'zeros', padding_mode: str = 'zeros',
order: tuple = ('conv', 'norm', 'act')): order: tuple = ('conv', 'norm', 'act'),
efficient_conv_bn_eval: bool = False):
super().__init__() super().__init__()
assert conv_cfg is None or isinstance(conv_cfg, dict) assert conv_cfg is None or isinstance(conv_cfg, dict)
assert norm_cfg is None or isinstance(norm_cfg, dict) assert norm_cfg is None or isinstance(norm_cfg, dict)
...@@ -155,6 +210,8 @@ class ConvModule(nn.Module): ...@@ -155,6 +210,8 @@ class ConvModule(nn.Module):
else: else:
self.norm_name = None # type: ignore self.norm_name = None # type: ignore
self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)
# build activation layer # build activation layer
if self.with_activation: if self.with_activation:
act_cfg_ = act_cfg.copy() # type: ignore act_cfg_ = act_cfg.copy() # type: ignore
...@@ -200,13 +257,82 @@ class ConvModule(nn.Module): ...@@ -200,13 +257,82 @@ class ConvModule(nn.Module):
x: torch.Tensor, x: torch.Tensor,
activate: bool = True, activate: bool = True,
norm: bool = True) -> torch.Tensor: norm: bool = True) -> torch.Tensor:
        layer_index = 0
        while layer_index < len(self.order):
            layer = self.order[layer_index]
            if layer == 'conv':
                if self.with_explicit_padding:
                    x = self.padding_layer(x)
                # if the next operation is norm and we have a norm layer in
                # eval mode and we have enabled `efficient_conv_bn_eval` for
                # the conv operator, then activate the optimized forward and
                # skip the next norm operator since it has been fused
                if layer_index + 1 < len(self.order) and \
                        self.order[layer_index + 1] == 'norm' and norm and \
                        self.with_norm and not self.norm.training and \
                        self.efficient_conv_bn_eval_forward is not None:
                    self.conv.forward = partial(
                        self.efficient_conv_bn_eval_forward, self.norm,
                        self.conv)
                    layer_index += 1
                    x = self.conv(x)
                    del self.conv.forward
                else:
                    x = self.conv(x)
            elif layer == 'norm' and norm and self.with_norm:
                x = self.norm(x)
            elif layer == 'act' and activate and self.with_activation:
                x = self.activate(x)
            layer_index += 1
        return x
    def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True):
        # efficient_conv_bn_eval works for conv + bn
        # with `track_running_stats` option
        if efficient_conv_bn_eval and self.norm \
                and isinstance(self.norm, _BatchNorm) \
                and self.norm.track_running_stats:
            self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward  # noqa: E501
        else:
            self.efficient_conv_bn_eval_forward = None  # type: ignore

    @staticmethod
    def create_from_conv_bn(conv: torch.nn.modules.conv._ConvNd,
                            bn: torch.nn.modules.batchnorm._BatchNorm,
                            efficient_conv_bn_eval=True) -> 'ConvModule':
        """Create a ConvModule from a conv and a bn module."""
        self = ConvModule.__new__(ConvModule)
        super(ConvModule, self).__init__()

        self.conv_cfg = None
        self.norm_cfg = None
        self.act_cfg = None
        self.inplace = False
        self.with_spectral_norm = False
        self.with_explicit_padding = False
        self.order = ('conv', 'norm', 'act')

        self.with_norm = True
        self.with_activation = False
        self.with_bias = conv.bias is not None

        # build convolution layer
        self.conv = conv
        # export the attributes of self.conv to a higher level for convenience
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        self.padding = self.conv.padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups

        # build normalization layers
        self.norm_name, norm = 'bn', bn
        self.add_module(self.norm_name, norm)

        self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)

        return self
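A hedged usage sketch for the new `create_from_conv_bn` helper: it wraps an existing conv/bn pair into a `ConvModule` so the fused eval-mode forward above can be exercised; the `from mmcv.cnn import ConvModule` path is an assumption based on the usual mmcv layout.

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule  # assumed import path

conv = nn.Conv2d(16, 32, kernel_size=3, padding=1)
bn = nn.BatchNorm2d(32)
module = ConvModule.create_from_conv_bn(conv, bn, efficient_conv_bn_eval=True)

module.eval()  # with bn in eval mode, conv and bn are fused on the fly
x = torch.randn(1, 16, 8, 8)
with torch.no_grad():
    y = module(x)
print(y.shape)  # torch.Size([1, 32, 8, 8])
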
@@ -371,7 +371,7 @@ class GeneralizedAttention(nn.Module):
                    contiguous().\
                    view(1, 1, h*w, h_kv*w_kv)

                energy = energy.masked_fill_(cur_local_constraint_map.bool(),
                                             float('-inf'))

            attention = F.softmax(energy, 3)
......
@@ -98,14 +98,17 @@ def build_norm_layer(cfg: Dict,
    layer_type = cfg_.pop('type')

    if inspect.isclass(layer_type):
        norm_layer = layer_type
    else:
        # Switch registry to the target scope. If `norm_layer` cannot be found
        # in the registry, fallback to search `norm_layer` in the
        # mmengine.MODELS.
        with MODELS.switch_scope_and_registry(None) as registry:
            norm_layer = registry.get(layer_type)
        if norm_layer is None:
            raise KeyError(f'Cannot find {norm_layer} in registry under '
                           f'scope name {registry.scope}')
    abbr = infer_abbr(norm_layer)

    assert isinstance(postfix, (int, str))
@@ -113,7 +116,7 @@ def build_norm_layer(cfg: Dict,
    requires_grad = cfg_.pop('requires_grad', True)
    cfg_.setdefault('eps', 1e-5)

    if norm_layer is not nn.GroupNorm:
        layer = norm_layer(num_features, **cfg_)
        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
            layer._specify_ddp_gpu_num(1)
......
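Illustrative only: with the `inspect.isclass` branch above, `build_norm_layer` now accepts a layer class as `type` in addition to a registry string. A minimal sketch, assuming `build_norm_layer` is imported from `mmcv.cnn`:

import torch.nn as nn
from mmcv.cnn import build_norm_layer  # assumed import path

# registry string, as before
name, layer = build_norm_layer(dict(type='BN'), num_features=64)
# class object, newly supported
name2, layer2 = build_norm_layer(dict(type=nn.BatchNorm2d), num_features=64)
print(name, type(layer).__name__)    # bn BatchNorm2d
print(name2, type(layer2).__name__)  # bn BatchNorm2d
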
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
from typing import Dict

import torch.nn as nn
@@ -27,7 +28,8 @@ def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
    cfg_ = cfg.copy()
    padding_type = cfg_.pop('type')

    if inspect.isclass(padding_type):
        return padding_type(*args, **kwargs, **cfg_)

    # Switch registry to the target scope. If `padding_layer` cannot be found
    # in the registry, fallback to search `padding_layer` in the
    # mmengine.MODELS.
......
@@ -79,14 +79,17 @@ def build_plugin_layer(cfg: Dict,
    cfg_ = cfg.copy()
    layer_type = cfg_.pop('type')

    if inspect.isclass(layer_type):
        plugin_layer = layer_type
    else:
        # Switch registry to the target scope. If `plugin_layer` cannot be
        # found in the registry, fallback to search `plugin_layer` in the
        # mmengine.MODELS.
        with MODELS.switch_scope_and_registry(None) as registry:
            plugin_layer = registry.get(layer_type)
        if plugin_layer is None:
            raise KeyError(
                f'Cannot find {plugin_layer} in registry under scope '
                f'name {registry.scope}')
    abbr = infer_abbr(plugin_layer)
......
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
from typing import Dict

import torch
@@ -76,9 +77,12 @@ def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
    layer_type = cfg_.pop('type')

    if inspect.isclass(layer_type):
        upsample = layer_type
    # Switch registry to the target scope. If `upsample` cannot be found
    # in the registry, fallback to search `upsample` in the
    # mmengine.MODELS.
    else:
        with MODELS.switch_scope_and_registry(None) as registry:
            upsample = registry.get(layer_type)
        if upsample is None:
......
@@ -41,7 +41,7 @@ class NewEmptyTensorOp(torch.autograd.Function):
class Conv2d(nn.Conv2d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size,
                                     self.padding, self.stride, self.dilation):
@@ -62,7 +62,7 @@ class Conv2d(nn.Conv2d):
class Conv3d(nn.Conv3d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size,
                                     self.padding, self.stride, self.dilation):
@@ -84,7 +84,7 @@ class Conv3d(nn.Conv3d):
class ConvTranspose2d(nn.ConvTranspose2d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size,
                                         self.padding, self.stride,
@@ -106,7 +106,7 @@ class ConvTranspose2d(nn.ConvTranspose2d):
class ConvTranspose3d(nn.ConvTranspose3d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size,
                                         self.padding, self.stride,
@@ -127,7 +127,7 @@ class MaxPool2d(nn.MaxPool2d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # PyTorch 1.9 does not support empty tensor inference yet
        if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0:
            out_shape = list(x.shape[:2])
            for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size),
                                     _pair(self.padding), _pair(self.stride),
@@ -145,7 +145,7 @@ class MaxPool3d(nn.MaxPool3d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # PyTorch 1.9 does not support empty tensor inference yet
        if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0:
            out_shape = list(x.shape[:2])
            for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size),
                                     _triple(self.padding),
@@ -164,7 +164,7 @@ class Linear(torch.nn.Linear):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # empty tensor forward of Linear layer is supported in Pytorch 1.6
        if obsolete_torch_version(TORCH_VERSION, (1, 5)) and x.numel() == 0:
            out_shape = [x.shape[0], self.out_features]
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
......
@@ -16,13 +16,13 @@ except ImportError:

def _scale_size(
    size: Tuple[int, int],
    scale: Union[float, int, Tuple[float, float], Tuple[int, int]],
) -> Tuple[int, int]:
    """Rescale a size by a ratio.

    Args:
        size (tuple[int]): (w, h).
        scale (float | int | tuple(float) | tuple(int)): Scaling factor.

    Returns:
        tuple[int]: scaled size.
@@ -128,7 +128,8 @@ def imresize_to_multiple(
    img: np.ndarray,
    divisor: Union[int, Tuple[int, int]],
    size: Union[int, Tuple[int, int], None] = None,
    scale_factor: Union[float, int, Tuple[float, float], Tuple[int, int],
                        None] = None,
    keep_ratio: bool = False,
    return_scale: bool = False,
    interpolation: str = 'bilinear',
@@ -145,9 +146,10 @@ def imresize_to_multiple(
            divisor. If divisor is a tuple, divisor should be
            (w_divisor, h_divisor).
        size (None | int | tuple[int]): Target size (w, h). Default: None.
        scale_factor (None | float | int | tuple[float] | tuple[int]):
            Multiplier for spatial size. Should match input size if it is a
            tuple and the 2D style is (w_scale_factor, h_scale_factor).
            Default: None.
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image. Default: False.
        return_scale (bool): Whether to return `w_scale` and `h_scale`.
@@ -215,16 +217,16 @@ def imresize_like(

def rescale_size(old_size: tuple,
                 scale: Union[float, int, Tuple[int, int]],
                 return_scale: bool = False) -> tuple:
    """Calculate the new size to be rescaled to.

    Args:
        old_size (tuple[int]): The old size (w, h) of image.
        scale (float | int | tuple[int]): The scaling factor or maximum size.
            If it is a float number or an integer, then the image will be
            rescaled by this factor, else if it is a tuple of 2 integers, then
            the image will be rescaled as large as possible within the scale.
        return_scale (bool): Whether to return the scaling factor besides the
            rescaled image size.
@@ -255,7 +257,7 @@ def rescale_size(old_size: tuple,

def imrescale(
    img: np.ndarray,
    scale: Union[float, int, Tuple[int, int]],
    return_scale: bool = False,
    interpolation: str = 'bilinear',
    backend: Optional[str] = None
@@ -264,10 +266,10 @@ def imrescale(

    Args:
        img (ndarray): The input image.
        scale (float | int | tuple[int]): The scaling factor or maximum size.
            If it is a float number or an integer, then the image will be
            rescaled by this factor, else if it is a tuple of 2 integers, then
            the image will be rescaled as large as possible within the scale.
        return_scale (bool): Whether to return the scaling factor besides the
            rescaled image.
        interpolation (str): Same as :func:`resize`.
......
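Illustrative only: the widened `scale` annotations in this file mean both plain numbers and (w, h) bounds are accepted. A small sketch, assuming `rescale_size` is exported from `mmcv.image`:

from mmcv.image import rescale_size  # assumed import path

print(rescale_size((1024, 768), 0.5))         # (512, 384): scale by a float factor
print(rescale_size((1024, 768), 2))           # (2048, 1536): int factors now documented
print(rescale_size((1024, 768), (640, 640)))  # (640, 480): fit inside 640x640, keeping ratio
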
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.utils import IS_MLU_AVAILABLE
from .active_rotated_filter import active_rotated_filter
from .assign_score_withk import assign_score_withk
from .ball_query import ball_query
@@ -109,3 +110,9 @@ __all__ = [
    'PrRoIPool', 'prroi_pool', 'bias_act', 'filtered_lrelu', 'conv2d',
    'conv_transpose2d', 'filter2d', 'upsample2d', 'BezierAlign', 'bezier_align'
]

if IS_MLU_AVAILABLE:
    from .deform_conv import DeformConv2dPack_MLU  # noqa:F401
    from .modulated_deform_conv import \
        ModulatedDeformConv2dPack_MLU  # noqa:F401
    __all__.extend(['ModulatedDeformConv2dPack_MLU', 'DeformConv2dPack_MLU'])
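A hedged sketch of how the optional MLU ops can be probed from user code, mirroring the conditional export above; the class names come directly from the `__all__.extend` call and `IS_MLU_AVAILABLE` is imported exactly as in the diff.

from mmcv.utils import IS_MLU_AVAILABLE

if IS_MLU_AVAILABLE:
    # only importable when a Cambricon MLU build of mmcv is present
    from mmcv.ops import DeformConv2dPack_MLU, ModulatedDeformConv2dPack_MLU
    print('MLU deformable conv ops are available')
else:
    print('running without Cambricon MLU support')
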
@@ -116,6 +116,10 @@ def bbox_overlaps(bboxes1: torch.Tensor,
    if rows * cols == 0:
        return ious

    if bboxes1.device.type == 'cpu' and torch.__version__ == 'parrots':
        return _bbox_overlaps_cpu(
            bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset)

    ext_module.bbox_overlaps(
        bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
......
@@ -132,6 +132,9 @@ def box_iou_rotated(bboxes1: torch.Tensor,
    cols = bboxes2.size(0)
    if aligned:
        ious = bboxes1.new_zeros(rows)
    else:
        if bboxes1.device.type == 'mlu':
            ious = bboxes1.new_zeros([rows, cols])
        else:
            ious = bboxes1.new_zeros(rows * cols)
    if not clockwise:
@@ -139,6 +142,11 @@ def box_iou_rotated(bboxes1: torch.Tensor,
        flip_mat[-1] = -1
        bboxes1 = bboxes1 * flip_mat
        bboxes2 = bboxes2 * flip_mat
    if bboxes1.device.type == 'npu':
        scale_mat = bboxes1.new_ones(bboxes1.shape[-1])
        scale_mat[-1] = 1.0 / 0.01745329252
        bboxes1 = bboxes1 * scale_mat
        bboxes2 = bboxes2 * scale_mat
    bboxes1 = bboxes1.contiguous()
    bboxes2 = bboxes2.contiguous()
    ext_module.box_iou_rotated(
......
@@ -16,6 +16,7 @@ from typing import Dict, Optional, Tuple, Union

import torch
from mmengine.utils import digit_version
from mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch

enabled = True
weight_gradients_disabled = False
@@ -283,19 +284,10 @@ def _conv2d_gradfix(
                    output_padding=output_padding,
                    output_mask=[0, 1, 0])[1]
            else:
                if is_rocm_pytorch():
                    name = 'aten::miopen_convolution_transpose_backward_weight'
                    if not transpose:
                        name = 'aten::miopen_convolution_backward_weight'
                flags = [
                    torch.backends.cudnn.benchmark,
                    torch.backends.cudnn.deterministic
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmengine.utils import digit_version
from torch import Tensor, nn

_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
@@ -70,7 +71,8 @@ class CornerPool(nn.Module):
        self.mode = mode

    def forward(self, x: Tensor) -> Tensor:
        if (torch.__version__ != 'parrots' and
                digit_version(torch.__version__) >= digit_version('1.5.0')):
            dim, flip = self.cummax_dim_flip[self.mode]
            if flip:
                x = x.flip(dim)
......
@@ -2,6 +2,8 @@
#ifndef CARAFE_CUDA_KERNEL_CUH
#define CARAFE_CUDA_KERNEL_CUH

#include <ATen/cuda/DeviceUtils.cuh>

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
@@ -56,7 +58,8 @@ template <>
__device__ __forceinline__ phalf warpReduceSum(phalf val) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP
    // Using PyTorch's macro for half support
    __PHALF(val) += WARP_SHFL_DOWN(val, offset);
#else
    __PHALF(val) +=
        __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset);
......
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <float.h>
#include "common_mlu_helper.hpp"
#define COORD_NUM 4
__nram__ char nmem_buf[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, void *nram_src1,
void *nram_addition, const int32_t deal_num) {
__bang_active_reciphp((T *)nram_dst, (T *)nram_src1, deal_num);
__bang_mul((T *)nram_dst, (T *)nram_src0, (T *)nram_dst, deal_num);
}
template <>
__mlu_func__ void computeDiv<half>(void *nram_dst, void *nram_src0,
void *nram_src1, void *nram_addition,
const int32_t deal_num) {
__bang_half2float((float *)nram_addition, (half *)nram_src1, deal_num);
__bang_active_reciphp((float *)nram_addition, (float *)nram_addition,
deal_num);
__bang_float2half_rd((half *)nram_src1, (float *)nram_addition, deal_num);
__bang_mul((half *)nram_dst, (half *)nram_src0, (half *)nram_src1, deal_num);
}
template <typename T>
__mlu_func__ void bboxOverlapsWorkflow(
T *vec_b1_x1, T *vec_b1_y1, T *vec_b1_x2, T *vec_b1_y2, T *vec_b2_x1,
T *vec_b2_y1, T *vec_b2_x2, T *vec_b2_y2, T *vec_left, T *vec_right,
T *vec_top, T *vec_bottom, const T *bbox1, const T *bbox2, void *ious,
const int32_t offset, const int32_t mode, const int32_t batches_stride,
const int32_t num_bbox1, const int32_t num_bbox2, const bool aligned) {
int32_t task_batch_stride = (num_bbox1 + taskDim - 1) / taskDim;
int32_t batch_start = taskId * task_batch_stride;
int32_t batch_per_task = batch_start + task_batch_stride < num_bbox1
? task_batch_stride
: num_bbox1 - batch_start;
batch_per_task = batch_per_task > 0 ? batch_per_task : (0);
if (aligned) {
int32_t num_loop_cpy = batch_per_task / batches_stride;
int32_t num_rem_cpy_batches = batch_per_task % batches_stride;
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
for (int32_t i = 0; i < num_loop_cpy; i++) {
int32_t index = batch_start + i * batches_stride;
int32_t handle_batches = index + batches_stride > num_bbox1
? num_rem_cpy_batches
: batches_stride;
int32_t b1 = index;
int32_t b2 = index;
int32_t base1 = b1 * COORD_NUM;
__memcpy(vec_b1_x1, &bbox1[base1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_y1, &bbox1[base1 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_x2, &bbox1[base1 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_y2, &bbox1[base1 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
int32_t base2 = b2 * COORD_NUM;
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
// get the width and height
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__bang_write_value(vec_bottom, batches_stride, 0.f);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
T *width = vec_left;
// height --> vec_right
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
T *height = vec_right;
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
T *b1_area = vec_top;
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
T *b2_area = vec_b2_x1;
// inter_s = width * height
__bang_mul(height, width, height, batches_stride);
T *inter_s = height;
// offset vector ---> vec_b2_y1
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
__bang_add(b1_area, b1_area, b2_area, batches_stride);
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
} else {
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
}
T *base_s = b1_area;
// ious = inter_s / base_s
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
__memcpy((T *)ious + index, width, handle_batches * sizeof(T),
NRAM2GDRAM);
}
} else {
int32_t num_loop_cpy = num_bbox2 / batches_stride;
int32_t num_rem_cpy_batches = num_bbox2 % batches_stride;
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
for (int32_t i = 0; i < batch_per_task; i++) {
int32_t index1 = batch_start + i;
int32_t b1 = index1;
int32_t base1 = b1 * COORD_NUM;
// set bbox1 and bbox2 to nram
__bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]);
__bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
for (int32_t j = 0; j < num_loop_cpy; j++) {
int32_t index2 = j * batches_stride;
int32_t handle_batches = index2 + batches_stride > num_bbox2
? num_rem_cpy_batches
: batches_stride;
int32_t b2 = index2;
int32_t base2 = b2 * COORD_NUM;
// copy bbox2 to nram
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
// get the width and height
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__bang_write_value(vec_bottom, batches_stride, (T)0);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
T *width = vec_left;
// height --> vec_right
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
T *height = vec_right;
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
T *b1_area = vec_top;
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
T *b2_area = vec_b2_x1;
// inter_s = width * height
__bang_mul(height, width, height, batches_stride);
T *inter_s = height;
// offset vector ---> vec_b2_y1
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
__bang_add(b1_area, b1_area, b2_area, batches_stride);
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
} else {
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
}
T *base_s = b1_area;
// ious = inter_s / base_s
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
int32_t gdram_offset = index1 * num_bbox2 + index2;
__memcpy((T *)ious + gdram_offset, width, handle_batches * sizeof(T),
NRAM2GDRAM);
}
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelBBoxOverlaps(
const void *bbox1, const void *bbox2, void *ious, const int32_t num_bbox1,
const int32_t num_bbox2, const int32_t mode, const bool aligned,
const int32_t offset) {
/*
* NRAM partition
* |-------------------------------------------------------------|
* | vec_b1_x1 | vec_b1_y1 | vec_b1_x2 | vec_b1_y2 |
* |-------------------------------------------------------------|
* | vec_b2_x1 | vec_b2_y1 | vec_b2_x2 | vec_b2_y2 |
* |-------------------------------------------------------------|
* | vec_left | vec_right | vec_top | vec_bottom |
* |-------------------------------------------------------------|
*
*/
const int32_t align_bytes = PAD_DOWN(MAX_NRAM_SIZE, NFU_ALIGN_SIZE);
const int32_t split_nram_num = 12;
const int32_t nram_stride =
align_bytes / NFU_ALIGN_SIZE / split_nram_num * NFU_ALIGN_SIZE;
void *vec_b1_x1 = nmem_buf;
void *vec_b1_y1 = nmem_buf + nram_stride;
void *vec_b1_x2 = nmem_buf + 2 * nram_stride;
void *vec_b1_y2 = nmem_buf + 3 * nram_stride;
void *vec_b2_x1 = nmem_buf + 4 * nram_stride;
void *vec_b2_y1 = nmem_buf + 5 * nram_stride;
void *vec_b2_x2 = nmem_buf + 6 * nram_stride;
void *vec_b2_y2 = nmem_buf + 7 * nram_stride;
void *vec_left = nmem_buf + 8 * nram_stride;
void *vec_right = nmem_buf + 9 * nram_stride;
void *vec_top = nmem_buf + 10 * nram_stride;
void *vec_bottom = nmem_buf + 11 * nram_stride;
const int32_t vec_length = nram_stride / sizeof(T);
bboxOverlapsWorkflow((T *)vec_b1_x1, (T *)vec_b1_y1, (T *)vec_b1_x2,
(T *)vec_b1_y2, (T *)vec_b2_x1, (T *)vec_b2_y1,
(T *)vec_b2_x2, (T *)vec_b2_y2, (T *)vec_left,
(T *)vec_right, (T *)vec_top, (T *)vec_bottom,
(T *)bbox1, (T *)bbox2, (T *)ious, offset, mode,
vec_length, num_bbox1, num_bbox2, aligned);
}
void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *bbox1, const void *bbox2, void *ious,
const int32_t num_bbox1, const int32_t num_bbox2,
const int32_t mode, const bool aligned,
const int32_t offset) {
if (d_type == CNRT_FLOAT16) {
MLUUnion1KernelBBoxOverlaps<half><<<k_dim, k_type, queue>>>(
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
} else {
MLUUnion1KernelBBoxOverlaps<float><<<k_dim, k_type, queue>>>(
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
}
}
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef CARAFE_UTILS_HPP_
#define CARAFE_UTILS_HPP_
#define NRAM_ALIGN_SIZE 64
struct CarafeForwardParam {
int N; // batch size
int Hi; // input height
int Wi; // input width
int Ci; // input channels
int Ho; // output height
int Wo; // output width
int Cg; // channels per group
int kernel_size; // kernel_size
int group_size; // group_size
int scale_factor; // scale_factor
int kernel_size_half; // kernel half size (K-1)/2
int kernel_size_sq; // square of kernel size
int dtype_size; // size of tensor data type
// Host arrays' geometry
int input_stride_g;
int input_stride_w;
int input_stride_h;
int input_stride_n;
int input_size;
int mask_stride_kh;
int mask_stride_g;
int mask_stride_w;
int mask_stride_h;
int mask_stride_n;
int mask_size;
int output_stride_g;
int output_stride_w;
int output_stride_h;
int output_stride_n;
int output_size;
// NRAM arrays' geometry
int input_nram_stride_g;
int input_nram_stride_w;
int input_nram_stride_h;
int input_nram_size;
int mask_nram_stride_kh;
int mask_nram_stride_g;
int mask_nram_stride_w;
int mask_nram_stride_h;
int mask_nram_size;
int output_nram_stride_g;
int output_nram_stride_w;
int output_nram_stride_h;
int output_nram_size;
// for address/compute alignment
int align_size_NRAM; // for addressing on NRAM
int align_size_NFU; // for NFU operation length
int block_Cg_NFU; // for bang_mul_const
int job_num; // total job number
};
struct CarafeForwardBlockDim {
int Ho; // block size of output height
int Wo; // block size of output width
int Kh; // block size of kernel height
int Kw; // block size of kernel width
int G; // block size of groups
int Cg; // block size of channels within a group
int Hi; // block size of input height
int Wi; // block size of input width
};
struct CarafeForwardGridDim {
int Ho; // number of blocks of output height
int Wo;
int Kh;
int Kw;
int G;
int Cg;
};
#endif // CARAFE_UTILS_HPP_
@@ -45,148 +45,6 @@ __mlu_func__ inline scalar_t max(scalar_t a, scalar_t b) {
  return a > b ? a : b;
}
/*!
* @brief loads data from global DRAM to NRAM with 2D pattern.
*
* @param[out] dst
* Pointer to NRAM that stores dst data.
* @param[in] src
* Pointer to global DRAM that stores src data.
* @param[in] size
* The byte size of segment in the lower dimension.
* @param[in] dst_str
* The data stride in bytes between segments in the lower dimension of dst.
* @param[in] src_str
* The data stride in bytes between segments in the lower dimension of src.
* @param[in] seg_num
* The total count of data segments in the lower dimension.
*/
template <typename T>
__mlu_func__ void loadStr2D(T *dst, T *src, const int size, const int dst_str,
const int src_str, const int seg_num) {
if (dst_str == src_str && size == src_str) {
__memcpy(dst, src, src_str * seg_num * sizeof(T), GDRAM2NRAM);
} else if ((size == src_str || src_str <= dst_str) &&
src_str * sizeof(T) <= 512) {
// gather data less than 512Bytes to improve IO efficiency
T *tmp = (T *)dst + (dst_str - src_str) * seg_num;
__memcpy(tmp, src, (src_str * (seg_num - 1) + size) * sizeof(T),
GDRAM2NRAM);
if (dst_str != src_str) {
__memcpy(dst, tmp, size * sizeof(T), NRAM2NRAM, dst_str * sizeof(T),
src_str * sizeof(T), seg_num - 1);
}
} else {
__memcpy(dst, src, size * sizeof(T), GDRAM2NRAM, dst_str * sizeof(T),
src_str * sizeof(T), seg_num - 1);
}
}
/*!
* @brief loads data from global DRAM to NRAM with 3D pattern.
*
* @param[out] dst
* Pointer to NRAM that stores dst data.
* @param[in] src
* Pointer to global DRAM that stores src data.
* @param[in] size
* The byte size of segment in the lowest dimension.
* @param[in] seg_num_in
* The total count of data segments in the lowest dimension.
* @param[in] seg_num_out
* The total count of data segments in the middle dimension.
* @param[in] dst_str_in
* The data stride in bytes between segments in the lowest dimension of dst.
* @param[in] dst_str_out
* The data stride in bytes between segments in the middle dimension of dst.
* @param[in] src_str_in
* The data stride in bytes between segments in the lowest dimension of src.
* @param[in] src_str_out
* The data stride in bytes between segments in the middle dimension of src.
*/
template <typename T>
__mlu_func__ void loadStr3D(T *dst, T *src, const int size,
const int seg_num_in, const int seg_num_out,
const int dst_str_in, const int dst_str_out,
const int src_str_in, const int src_str_out) {
T *tmp_dst = dst;
T *tmp_src = src;
for (int i = 0; i < seg_num_out; ++i) {
loadStr2D(tmp_dst, tmp_src, size, dst_str_in, src_str_in, seg_num_in);
tmp_src += src_str_out;
tmp_dst += dst_str_out;
}
}
/*!
* @brief stores data from NRAM to global DRAM with 2D pattern.
*
* @param[out] dst
* Pointer to global DRAM that stores dst data.
* @param[in] src
* Pointer to NRAM that stores src data.
* @param[in] size
* The byte size of segment in the lower dimension.
* @param[in] dst_str
* The data stride in bytes between segments in the lower dimension of dst.
* @param[in] src_str
* The data stride in bytes between segments in the lower dimension of src.
* @param[in] seg_num
* The total count of data segments in the lower dimension.
*/
template <typename T>
__mlu_func__ void storeStr2D(T *dst, T *src, const int size, const int seg_num,
const int dst_str, const int src_str) {
if ((size == dst_str && dst_str <= src_str) && dst_str * sizeof(T) <= 512) {
// gather data less than 512Bytes to improve IO efficiency
if (dst_str != src_str) {
__memcpy(src, src, size * sizeof(T), NRAM2NRAM, dst_str * sizeof(T),
src_str * sizeof(T), seg_num - 1);
}
__memcpy(dst, src, size * seg_num * sizeof(T), NRAM2GDRAM);
} else {
__memcpy(dst, src, size * sizeof(T), NRAM2GDRAM, dst_str * sizeof(T),
src_str * sizeof(T), seg_num - 1);
}
}
/*!
* @brief stores data from NRAM to global DRAM with 3D pattern.
*
* @param[out] dst
* Pointer to global DRAM that stores dst data.
* @param[in] src
* Pointer to NRAM that stores src data.
* @param[in] size
* The byte size of segment in the lowest dimension.
* @param[in] seg_num_in
* The total count of data segments in the lowest dimension.
* @param[in] seg_num_out
* The total count of data segments in the middle dimension.
* @param[in] dst_str_in
* The data stride in bytes between segments in the lowest dimension of dst.
* @param[in] dst_str_out
* The data stride in bytes between segments in the middle dimension of dst.
* @param[in] src_str_in
* The data stride in bytes between segments in the lowest dimension of src.
* @param[in] src_str_out
* The data stride in bytes between segments in the middle dimension of src.
*/
template <typename T>
__mlu_func__ void storeStr3D(T *dst, T *src, const int size,
const int seg_num_in, const int seg_num_out,
const int dst_str_in, const int dst_str_out,
const int src_str_in, const int src_str_out) {
T *tmp_dst = dst;
T *tmp_src = src;
for (int i = 0; i < seg_num_out; ++i) {
storeStr2D(tmp_dst, tmp_src, size, seg_num_in, dst_str_in, src_str_in);
tmp_src += src_str_out;
tmp_dst += dst_str_out;
}
}
/*!
 * @brief Converts int32 to float32 data type.
 *
......