Commit 0e2f8a5c authored by limm

add v2.2.0

parent 2754cb11
@@ -51,10 +51,11 @@ class DeformConv2dFunction(Function):
     @staticmethod
     def _npu_backward(ctx, grad_output):
+        import torch_npu
         input_tensor, weight, offset_out, offset_all, sort_index_for_npu_bp = \
             ctx.saved_tensors
         grad_input, grad_weight, grad_offset_all, grad_bias = \
-            torch.npu_deformable_conv2dbk(
+            torch_npu.npu_deformable_conv2dbk(
                 input_tensor, grad_output, offset_out, weight, offset_all,
                 kernel_size=[weight.shape[3], weight.shape[2]],
                 stride=[1, 1, ctx.stride[0], ctx.stride[1]],
......
@@ -83,8 +83,9 @@ class ModulatedDeformConv2dFunction(Function):
     def _npu_backward(ctx, grad_output):
         input_tensor, weight, offset_out, offset_all, sort_index_bp = \
             ctx.saved_tensors
+        import torch_npu
         grad_input, grad_weight, grad_offset_all, grad_bias = \
-            torch.npu_deformable_conv2dbk(
+            torch_npu.npu_deformable_conv2dbk(
                 input_tensor, grad_output, offset_out, weight, offset_all,
                 kernel_size=[weight.shape[3], weight.shape[2]],
                 stride=[1, 1, ctx.stride[0], ctx.stride[1]],
......
@@ -12,7 +12,7 @@ from mmengine.registry import MODELS
 from mmengine.utils import deprecated_api_warning
 from torch.autograd.function import Function, once_differentiable
-from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE
 from ..utils import ext_loader
 ext_module = ext_loader.load_ext(
@@ -84,7 +84,7 @@ class MultiScaleDeformableAttnFunction(Function):
         Returns:
             tuple[Tensor]: Gradient of input tensors in forward.
         """
-        value, value_spatial_shapes, value_level_start_index,\
+        value, value_spatial_shapes, value_level_start_index, \
             sampling_locations, attention_weights = ctx.saved_tensors
         grad_value = torch.zeros_like(value)
         grad_sampling_loc = torch.zeros_like(sampling_locations)
@@ -364,7 +364,8 @@ class MultiScaleDeformableAttention(BaseModule):
                 f'Last dim of reference_points must be'
                 f' 2 or 4, but get {reference_points.shape[-1]} instead.')
         if ((IS_CUDA_AVAILABLE and value.is_cuda)
-                or (IS_MLU_AVAILABLE and value.is_mlu)):
+                or (IS_MLU_AVAILABLE and value.is_mlu)
+                or (IS_NPU_AVAILABLE and value.device.type == 'npu')):
             output = MultiScaleDeformableAttnFunction.apply(
                 value, spatial_shapes, level_start_index, sampling_locations,
                 attention_weights, self.im2col_step)
......
 # Copyright (c) OpenMMLab. All rights reserved.
-__version__ = '2.1.0'
+__version__ = '2.2.0'
 def parse_version_info(version_str: str, length: int = 4) -> tuple:
......
@@ -244,10 +244,12 @@ def get_extensions():
         dipu_path = os.getenv('DIPU_PATH')
         vendor_include_dirs = os.getenv('VENDOR_INCLUDE_DIRS')
         nccl_include_dirs = os.getenv('NCCL_INCLUDE_DIRS')
+        pytorch_dir = os.getenv('PYTORCH_DIR')
         include_dirs.append(dipu_root)
         include_dirs.append(diopi_path + '/include')
         include_dirs.append(dipu_path + '/dist/include')
         include_dirs.append(vendor_include_dirs)
+        include_dirs.append(pytorch_dir + 'torch/include')
         if nccl_include_dirs:
             include_dirs.append(nccl_include_dirs)
         library_dirs += [dipu_root]
@@ -395,12 +397,22 @@ def get_extensions():
     elif (os.getenv('FORCE_NPU', '0') == '1'):
         print(f'Compiling {ext_name} only with CPU and NPU')
         try:
+            import importlib
             from torch_npu.utils.cpp_extension import NpuExtension
+            extra_compile_args['cxx'] += [
+                '-D__FILENAME__=\"$$(notdir $$(abspath $$<))\"'
+            ]
+            extra_compile_args['cxx'] += [
+                '-I' + importlib.util.find_spec(
+                    'torch_npu').submodule_search_locations[0] +
+                '/include/third_party/acl/inc'
+            ]
             define_macros += [('MMCV_WITH_NPU', None)]
             extension = NpuExtension
-            if parse_version(torch.__version__) <= parse_version('2.0.0'):
+            if parse_version(torch.__version__) < parse_version('2.1.0'):
                 define_macros += [('MMCV_WITH_XLA', None)]
-            if parse_version(torch.__version__) > parse_version('2.0.0'):
+            if parse_version(torch.__version__) >= parse_version('2.1.0'):
                 define_macros += [('MMCV_WITH_KPRIVATE', None)]
         except Exception:
             raise ImportError('can not find any torch_npu')
......
@@ -3,7 +3,7 @@ import pytest
 import torch
 from mmcv.ops import ball_query
-from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE
 @pytest.mark.parametrize('device', [
@@ -14,7 +14,11 @@ from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
     pytest.param(
         'mlu',
         marks=pytest.mark.skipif(
-            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+            not IS_MLU_AVAILABLE, reason='requires MLU support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
 ])
 def test_ball_query(device):
     new_xyz = torch.tensor(
@@ -59,20 +63,25 @@ def test_ball_query(device):
     assert torch.all(idx == expected_idx)
-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_stack_ball_query():
-    new_xyz = torch.tensor([[-0.0740, 1.3147, -1.3625],
-                            [-2.2769, 2.7817, -0.2334],
-                            [-0.4003, 2.4666, -0.5116],
-                            [-0.0740, 1.3147, -1.3625],
-                            [-0.0740, 1.3147, -1.3625],
-                            [-2.0289, 2.4952, -0.1708],
-                            [-2.0668, 6.0278, -0.4875],
-                            [0.4066, 1.4211, -0.2947],
-                            [-2.0289, 2.4952, -0.1708],
-                            [-2.0289, 2.4952, -0.1708]]).cuda()
-    new_xyz_batch_cnt = torch.tensor([5, 5], dtype=torch.int32).cuda()
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
+])
+def test_stack_ball_query(device):
+    new_xyz = torch.tensor(
+        [[-0.0740, 1.3147, -1.3625], [-2.2769, 2.7817, -0.2334],
+         [-0.4003, 2.4666, -0.5116], [-0.0740, 1.3147, -1.3625],
+         [-0.0740, 1.3147, -1.3625], [-2.0289, 2.4952, -0.1708],
+         [-2.0668, 6.0278, -0.4875], [0.4066, 1.4211, -0.2947],
+         [-2.0289, 2.4952, -0.1708], [-2.0289, 2.4952, -0.1708]],
+        device=device)
+    new_xyz_batch_cnt = torch.tensor([5, 5], dtype=torch.int32, device=device)
     xyz = torch.tensor([[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],
                         [-0.4003, 2.4666, -0.5116], [-0.5251, 2.4379, -0.8466],
                         [-0.9691, 1.1418, -1.3733], [-0.2232, 0.9561, -1.3626],
@@ -82,15 +91,15 @@ def test_stack_ball_query():
                         [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],
                         [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],
                         [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],
-                        [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856,
-                                                   -1.2000]]).cuda()
-    xyz_batch_cnt = torch.tensor([10, 10], dtype=torch.int32).cuda()
+                        [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856, -1.2000]],
+                       device=device)
+    xyz_batch_cnt = torch.tensor([10, 10], dtype=torch.int32, device=device)
     idx = ball_query(0, 0.2, 5, xyz, new_xyz, xyz_batch_cnt, new_xyz_batch_cnt)
-    expected_idx = torch.tensor([[0, 0, 0, 0, 0], [6, 6, 6, 6, 6],
-                                 [2, 2, 2, 2, 2], [0, 0, 0, 0, 0],
-                                 [0, 0, 0, 0, 0], [0, 0, 0, 0, 0],
-                                 [2, 2, 2, 2, 2], [7, 7, 7, 7, 7],
-                                 [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]).cuda()
+    expected_idx = torch.tensor(
+        [[0, 0, 0, 0, 0], [6, 6, 6, 6, 6], [2, 2, 2, 2, 2], [0, 0, 0, 0, 0],
+         [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7],
+         [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
+        device=device)
     assert torch.all(idx == expected_idx)
     xyz = xyz.double()
......
 # Copyright (c) OpenMMLab. All rights reserved.
 import numpy as np
 import pytest
 import torch
 from mmcv.ops import chamfer_distance
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE
-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_chamfer_distance():
-    pointset1 = torch.tensor(
-        [[[1.3, 9.39], [2.3, 9.39], [2.3, 10.39], [1.3, 10.39]],
-         [[1.0, 9.39], [3.0, 9.39], [3.0, 10.39], [1.0, 10.39]],
-         [[1.6, 9.99], [2.3, 9.99], [2.3, 10.39], [1.6, 10.39]]],
-        device='cuda',
-        requires_grad=True)
+def chamfer_distance_forward_groundtruth(xyz1, xyz2, dtype):
+    bs, ns, ss = xyz1.shape
+    dist1 = np.zeros((bs, ns)).astype(torch_to_np_type(dtype))
+    dist2 = np.zeros((bs, ns)).astype(torch_to_np_type(dtype))
+    idx1 = np.zeros((bs, ns)).astype('int32')
+    idx2 = np.zeros((bs, ns)).astype('int32')
+    for b1 in range(bs):
+        for n1 in range(ns):
+            x1, y1 = xyz1[b1][n1]
+            dist1[b1][n1] = 10000000
+            for n2 in range(ns):
+                x2, y2 = xyz2[b1][n2]
+                dst = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)
+                if dist1[b1][n1] > dst:
+                    dist1[b1][n1] = dst
+                    idx1[b1][n1] = n2
+    for b1 in range(bs):
+        for n1 in range(ns):
+            x1, y1 = xyz2[b1][n1]
+            dist2[b1][n1] = 10000000
+            for n2 in range(ns):
+                x2, y2 = xyz1[b1][n2]
+                dst = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)
+                if dist2[b1][n1] > dst:
+                    dist2[b1][n1] = dst
+                    idx2[b1][n1] = n2
+    return [dist1, dist2, idx1, idx2]
-    pointset2 = torch.tensor(
-        [[[1.0, 9.39], [3.0, 9.39], [3.0, 10.39], [1.0, 10.39]],
-         [[1.3, 9.39], [2.3, 9.39], [2.3, 10.39], [1.3, 10.39]],
-         [[1.0, 9.39], [3.0, 9.39], [3.0, 10.39], [1.0, 10.39]]],
-        device='cuda',
-        requires_grad=True)
-    expected_dist1 = torch.tensor(
-        [[0.0900, 0.4900, 0.4900, 0.0900], [0.0900, 0.4900, 0.4900, 0.0900],
-         [0.5200, 0.6500, 0.4900, 0.3600]],
-        device='cuda')
-    expected_dist2 = torch.tensor(
-        [[0.0900, 0.4900, 0.4900, 0.0900], [0.0900, 0.4900, 0.4900, 0.0900],
-         [0.7200, 0.8500, 0.4900, 0.3600]],
-        device='cuda')
+def torch_to_np_type(dtype):
+    if dtype == torch.half:
+        return np.float16
+    elif dtype == torch.float32:
+        return np.float32
-    expected_pointset1_grad = torch.tensor(
-        [[[0.6000, 0.0000], [-1.4000, 0.0000], [-1.4000, 0.0000],
-          [0.6000, 0.0000]],
-         [[-0.6000, 0.0000], [1.4000, 0.0000], [1.4000, 0.0000],
-          [-0.6000, 0.0000]],
-         [[1.2000, -0.8000], [-1.4000, -0.8000], [-1.4000, 0.0000],
-          [1.2000, 0.0000]]],
-        device='cuda')
-    expected_pointset2_grad = torch.tensor(
-        [[[-0.6000, 0.0000], [1.4000, 0.0000], [1.4000, 0.0000],
-          [-0.6000, 0.0000]],
-         [[0.6000, 0.0000], [-1.4000, 0.0000], [-1.4000, 0.0000],
-          [0.6000, 0.0000]],
-         [[0.0000, 0.0000], [0.0000, 0.0000], [2.8000, 0.8000],
-          [-2.4000, 0.8000]]],
-        device='cuda')
-    dist1, dist2, idx1, idx2 = chamfer_distance(pointset1, pointset2)
-    dist1.backward(torch.ones_like(dist1))
-    assert torch.allclose(dist1, expected_dist1, 1e-2)
-    assert torch.allclose(dist2, expected_dist2, 1e-2)
-    assert torch.allclose(pointset1.grad.data, expected_pointset1_grad, 1e-2)
-    assert torch.allclose(pointset2.grad.data, expected_pointset2_grad, 1e-2)
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
+])
+@pytest.mark.parametrize('dtype', [torch.half, torch.float32])
+@pytest.mark.parametrize('shape', [(2, 600, 2), (2, 600, 2)])
+def test_chamfer_distance_npu_dynamic_shape(dtype, device, shape):
+    bs = shape[0]
+    ns = shape[1]
+    xyz1 = np.random.uniform(-10.0, 10.0,
+                             (bs, ns, 2)).astype(torch_to_np_type(dtype))
+    xyz2 = np.random.uniform(-10.0, 10.0,
+                             (bs, ns, 2)).astype(torch_to_np_type(dtype))
+    xyz1_npu = torch.tensor(xyz1, dtype=dtype).to(device)
+    xyz2_npu = torch.tensor(xyz2, dtype=dtype).to(device)
+    expected_output = chamfer_distance_forward_groundtruth(xyz1, xyz2, dtype)
+    output = chamfer_distance(xyz1_npu, xyz2_npu)
+    assert np.allclose(output[0].cpu().numpy(), expected_output[0], 1e-3, 1e-4)
+    assert np.allclose(output[1].cpu().numpy(), expected_output[1], 1e-3, 1e-4)
+    assert np.allclose(output[2].cpu().numpy(), expected_output[2], 1e-3, 1e-4)
+    assert np.allclose(output[3].cpu().numpy(), expected_output[3], 1e-3, 1e-4)
@@ -72,13 +72,23 @@ def test_grouping_points(dtype, device):
     assert torch.allclose(output, expected_output)
-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
+])
 @pytest.mark.parametrize('dtype', [torch.half, torch.float, torch.double])
-def test_stack_grouping_points(dtype):
+def test_stack_grouping_points(dtype, device):
+    if device == 'npu' and dtype == torch.double:
+        return
     idx = torch.tensor([[0, 0, 0], [3, 3, 3], [8, 8, 8], [1, 1, 1], [0, 0, 0],
                         [2, 2, 2], [0, 0, 0], [6, 6, 6], [9, 9, 9], [0, 0, 0],
-                        [1, 1, 1], [0, 0, 0]]).int().cuda()
+                        [1, 1, 1], [0, 0, 0]]).int().to(device)
     features = torch.tensor([[
         0.5798, -0.7981, -0.9280, -1.3311, 1.3687, 0.9277, -0.4164, -1.8274,
         0.9268, 0.8414
@@ -103,9 +113,9 @@ def test_stack_grouping_points(dtype):
                              -0.6646, -0.6870, -0.1125, -0.2224, -0.3445,
                              -1.4049, 0.4990, -0.7037, -0.9924, 0.0386
                             ]],
-                            dtype=dtype).cuda()
-    features_batch_cnt = torch.tensor([3, 3]).int().cuda()
-    indices_batch_cnt = torch.tensor([6, 6]).int().cuda()
+                            dtype=dtype).to(device)
+    features_batch_cnt = torch.tensor([3, 3]).int().to(device)
+    indices_batch_cnt = torch.tensor([6, 6]).int().to(device)
     output = grouping_operation(features, idx, features_batch_cnt,
                                 indices_batch_cnt)
     expected_output = torch.tensor(
@@ -169,5 +179,5 @@ def test_stack_grouping_points(dtype):
          [-0.3190, -0.3190, -0.3190], [0.7798, 0.7798, 0.7798],
          [-0.3693, -0.3693, -0.3693], [-0.9457, -0.9457, -0.9457],
          [-0.2942, -0.2942, -0.2942], [-1.8527, -1.8527, -1.8527]]],
-        dtype=dtype).cuda()
+        dtype=dtype).to(device)
     assert torch.allclose(output, expected_output)
@@ -5,7 +5,7 @@ import torch
 from mmcv.ops.multi_scale_deform_attn import (
     MultiScaleDeformableAttention, MultiScaleDeformableAttnFunction,
     multi_scale_deformable_attn_pytorch)
-from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE
 _USING_PARROTS = True
 _IS_AUTOCAST_AVAILABLE = True
@@ -136,6 +136,40 @@ def test_forward_equal_with_pytorch_double():
     assert max_rel_err < 1e-15
+@pytest.mark.skipif(not IS_NPU_AVAILABLE, reason='requires NPU support')
+def test_forward_equal_with_pytorch_npu():
+    N, M, D = 6, 4, 8
+    Lq, L, P = 10000, 4, 8
+    shapes = torch.as_tensor([(60, 40), (30, 20), (16, 24), (53, 32)],
+                             dtype=torch.int32)
+    level_start_index = torch.cat((shapes.new_zeros(
+        (1, )), shapes.prod(1).cumsum(0)[:-1]))
+    S = sum((H * W).item() for H, W in shapes)
+    torch.manual_seed(3)
+    value = torch.rand(N, S, M, D) * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2)
+    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
+    attention_weights /= attention_weights.sum(
+        -1, keepdim=True).sum(
+            -2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = multi_scale_deformable_attn_pytorch(
+        value.float(), shapes, sampling_locations.float(),
+        attention_weights.float()).detach().cpu()
+    output_npu = MultiScaleDeformableAttnFunction.apply(
+        value.npu().float(), shapes.npu(), level_start_index.npu(),
+        sampling_locations.npu().float(),
+        attention_weights.npu().float(), im2col_step).detach().cpu()
+    assert torch.allclose(output_npu, output_pytorch)
+    max_abs_err = (output_npu - output_pytorch).abs().max()
+    max_rel_err = ((output_npu - output_pytorch).abs() /
+                   output_pytorch.abs()).max()
+    assert max_abs_err < 1e-18
+    assert max_rel_err < 1e-15
 @pytest.mark.parametrize('device', [
     pytest.param(
         'cuda',
@@ -303,3 +337,67 @@ def test_gradient_numerical(channels,
                                        im2col_step),
                               eps=eps,
                               atol=1e-2)
+@pytest.mark.skipif(not IS_NPU_AVAILABLE, reason='requires NPU support')
+def test_backward_equal_with_pytorch_npu():
+    N, M, D = 6, 4, 8
+    Lq, L, P = 10000, 4, 8
+    shapes = torch.as_tensor([(60, 40), (30, 20), (16, 24), (53, 32)],
+                             dtype=torch.int32)
+    level_start_index = torch.cat((shapes.new_zeros(
+        (1, )), shapes.prod(1).cumsum(0)[:-1]))
+    S = sum((H * W).item() for H, W in shapes)
+    torch.manual_seed(3)
+    value = torch.rand(N, S, M, D) * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2)
+    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
+    attention_weights /= attention_weights.sum(
+        -1, keepdim=True).sum(
+            -2, keepdim=True)
+    im2col_step = 2
+    value.requires_grad = True
+    sampling_locations.requires_grad = True
+    attention_weights.requires_grad = True
+    output_pytorch = multi_scale_deformable_attn_pytorch(
+        value.float(), shapes, sampling_locations.float(),
+        attention_weights.float())
+    grad_output_pytorch = torch.ones_like(output_pytorch)
+    output_pytorch.backward(grad_output_pytorch)
+    grad_value = value.grad.detach().cpu()
+    grad_location = sampling_locations.grad.detach().cpu()
+    grad_attn_weight = attention_weights.grad.detach().cpu()
+    value_npu = value.npu()
+    shapes_npu = shapes.npu()
+    level_start_index_npu = level_start_index.npu()
+    sampling_locations_npu = sampling_locations.npu()
+    attention_weights_npu = attention_weights.npu()
+    output_npu = MultiScaleDeformableAttnFunction.apply(
+        value_npu.float(), shapes_npu, level_start_index_npu,
+        sampling_locations_npu.float(), attention_weights_npu.float(),
+        im2col_step)
+    grad_output_npu = torch.ones_like(output_npu)
+    output_npu.backward(grad_output_npu)
+    grad_value_npu = value_npu.grad.detach().cpu()
+    grad_location_npu = sampling_locations_npu.grad.detach().cpu()
+    grad_attn_weight_npu = attention_weights_npu.grad.detach().cpu()
+    assert torch.allclose(grad_value_npu, grad_value)
+    max_abs_err_1 = (grad_value_npu - grad_value).abs().max()
+    max_rel_err_1 = ((grad_value_npu - grad_value).abs() /
+                     grad_value.abs()).max()
+    assert max_abs_err_1 < 1e-5
+    assert max_rel_err_1 < 1e-4
+    assert torch.allclose(grad_location_npu, grad_location)
+    max_abs_err_2 = (grad_location_npu - grad_location).abs().max()
+    max_rel_err_2 = ((grad_location_npu - grad_location).abs() /
+                     grad_location.abs()).max()
+    assert max_abs_err_2 < 1e-5
+    assert max_rel_err_2 < 1e-4
+    assert torch.allclose(grad_attn_weight_npu, grad_attn_weight)
+    max_abs_err_3 = (grad_attn_weight_npu - grad_attn_weight).abs().max()
+    max_rel_err_3 = ((grad_attn_weight_npu - grad_attn_weight).abs() /
+                     grad_attn_weight.abs()).max()
+    assert max_abs_err_3 < 1e-5
+    assert max_rel_err_3 < 1e-4
@@ -3,7 +3,7 @@ import pytest
 import torch
 from mmcv.ops import rotated_feature_align
-from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE
 @pytest.mark.skipif(
@@ -17,6 +17,10 @@ from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
         'mlu',
         marks=pytest.mark.skipif(
             not IS_MLU_AVAILABLE, reason='requires MLU support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support')),
     pytest.param(
         'cpu',
         marks=pytest.mark.skipif(
......
@@ -3,12 +3,28 @@ import pytest
 import torch
 from mmcv.ops import three_interpolate
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE
-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-@pytest.mark.parametrize('dtype', [torch.half, torch.float, torch.double])
-def test_three_interpolate(dtype):
+@pytest.mark.parametrize('dtype', [
+    torch.half, torch.float,
+    pytest.param(
+        torch.double,
+        marks=pytest.mark.skipif(
+            IS_NPU_AVAILABLE,
+            reason='NPU does not support for 64-bit floating point'))
+])
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
+])
+def test_three_interpolate(dtype, device):
     features = torch.tensor(
         [[[2.4350, 4.7516, 4.4995, 2.4350, 2.4350, 2.4350],
           [3.1236, 2.6278, 3.0447, 3.1236, 3.1236, 3.1236],
@@ -20,12 +36,13 @@ def test_three_interpolate(dtype):
           [0.0000, 0.2744, 2.0842, 0.0000, 0.0000, 0.0000],
           [0.3414, 1.5063, 1.6209, 0.3414, 0.3414, 0.3414],
           [0.5814, 0.0103, 0.0000, 0.5814, 0.5814, 0.5814]]],
-        dtype=dtype).cuda()
+        dtype=dtype,
+        device=device)
-    idx = torch.tensor([[[0, 1, 2], [2, 3, 4], [2, 3, 4], [0, 1, 2], [0, 1, 2],
-                         [0, 1, 3]],
-                        [[0, 2, 3], [1, 3, 4], [2, 1, 4], [0, 2, 4], [0, 2, 4],
-                         [0, 1, 2]]]).int().cuda()
+    idx = torch.tensor(
+        [[[0, 1, 2], [2, 3, 4], [2, 3, 4], [0, 1, 2], [0, 1, 2], [0, 1, 3]],
+         [[0, 2, 3], [1, 3, 4], [2, 1, 4], [0, 2, 4], [0, 2, 4], [0, 1, 2]]],
+        device=device).int()
     weight = torch.tensor([[[3.3333e-01, 3.3333e-01, 3.3333e-01],
                             [1.0000e+00, 5.8155e-08, 2.2373e-08],
@@ -39,7 +56,8 @@ def test_three_interpolate(dtype):
                             [3.3333e-01, 3.3333e-01, 3.3333e-01],
                             [3.3333e-01, 3.3333e-01, 3.3333e-01],
                             [3.3333e-01, 3.3333e-01, 3.3333e-01]]],
-                           dtype=dtype).cuda()
+                           dtype=dtype,
+                           device=device)
     output = three_interpolate(features, idx, weight)
     expected_output = torch.tensor([[[
@@ -73,6 +91,7 @@ def test_three_interpolate(dtype):
         3.8760e-01, 1.0300e-02, 8.3569e-09,
         3.8760e-01, 3.8760e-01, 1.9723e-01
     ]]],
-                                   dtype=dtype).cuda()
+                                   dtype=dtype,
+                                   device=device)
     assert torch.allclose(output, expected_output, 1e-3, 1e-4)