Commit 91da9643 authored by limm

support v2.1.0

parent 6f674c7e
@@ -4,6 +4,7 @@ import pytest
 import torch

 from mmcv.ops import active_rotated_filter
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE

 np_feature = np.array([[[[[-1.4934e-01, 1.1341e+00, -1.6241e-01],
                           [-1.0986e+00, -1.1463e+00, -1.3176e+00],
@@ -245,7 +246,11 @@ expected_grad = np.array([[[[[8., 8., 8.], [8., 8., 8.], [8., 8., 8.]]]],
     pytest.param(
         'cuda',
         marks=pytest.mark.skipif(
-            not torch.cuda.is_available(), reason='requires CUDA support')),
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
 ])
 def test_active_rotated_filter(device):
     feature = torch.tensor(
...
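This first hunk is the pattern the whole commit applies: a formerly CUDA-only test becomes a `device`-parametrized test whose per-device cases are skipped through the availability flags in `mmcv.utils`. A minimal, self-contained sketch of the pattern (the test body is illustrative, not from this commit):

import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'npu',
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_runs_on_device(device):
    # Each unskipped parameter is collected as its own case on that device.
    x = torch.ones(2, 3, device=device)
    assert x.sum().item() == 6.0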
@@ -3,55 +3,59 @@ import pytest
 import torch

 from mmcv.ops import ball_query
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE


-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_ball_query():
-    new_xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625],
-                             [-2.2769, 2.7817, -0.2334],
-                             [-0.4003, 2.4666, -0.5116],
-                             [-0.0740, 1.3147, -1.3625],
-                             [-0.0740, 1.3147, -1.3625]],
-                            [[-2.0289, 2.4952, -0.1708],
-                             [-2.0668, 6.0278, -0.4875],
-                             [0.4066, 1.4211, -0.2947],
-                             [-2.0289, 2.4952, -0.1708],
-                             [-2.0289, 2.4952, -0.1708]]]).cuda()
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'mlu',
+        marks=pytest.mark.skipif(
+            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+])
+def test_ball_query(device):
+    new_xyz = torch.tensor(
+        [[[-0.0740, 1.3147, -1.3625], [-2.2769, 2.7817, -0.2334],
+          [-0.4003, 2.4666, -0.5116], [-0.0740, 1.3147, -1.3625],
+          [-0.0740, 1.3147, -1.3625]],
+         [[-2.0289, 2.4952, -0.1708], [-2.0668, 6.0278, -0.4875],
+          [0.4066, 1.4211, -0.2947], [-2.0289, 2.4952, -0.1708],
+          [-2.0289, 2.4952, -0.1708]]],
+        device=device)

-    xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],
-                         [-0.4003, 2.4666,
-                          -0.5116], [-0.5251, 2.4379, -0.8466],
-                         [-0.9691, 1.1418,
-                          -1.3733], [-0.2232, 0.9561, -1.3626],
-                         [-2.2769, 2.7817, -0.2334],
-                         [-0.2822, 1.3192, -1.3645], [0.1533, 1.5024, -1.0432],
-                         [0.4917, 1.1529, -1.3496]],
-                        [[-2.0289, 2.4952,
-                          -0.1708], [-0.7188, 0.9956, -0.5096],
-                         [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],
-                         [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],
-                         [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],
-                         [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856,
-                                                    -1.2000]]]).cuda()
+    xyz = torch.tensor(
+        [[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],
+          [-0.4003, 2.4666, -0.5116], [-0.5251, 2.4379, -0.8466],
+          [-0.9691, 1.1418, -1.3733], [-0.2232, 0.9561, -1.3626],
+          [-2.2769, 2.7817, -0.2334], [-0.2822, 1.3192, -1.3645],
+          [0.1533, 1.5024, -1.0432], [0.4917, 1.1529, -1.3496]],
+         [[-2.0289, 2.4952, -0.1708], [-0.7188, 0.9956, -0.5096],
+          [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],
+          [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],
+          [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],
+          [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856, -1.2000]]],
+        device=device)

     idx = ball_query(0, 0.2, 5, xyz, new_xyz)
-    expected_idx = torch.tensor([[[0, 0, 0, 0, 0], [6, 6, 6, 6, 6],
-                                  [2, 2, 2, 2, 2], [0, 0, 0, 0, 0],
-                                  [0, 0, 0, 0, 0]],
-                                 [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2],
-                                  [7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
-                                  [0, 0, 0, 0, 0]]]).cuda()
+    expected_idx = torch.tensor(
+        [[[0, 0, 0, 0, 0], [6, 6, 6, 6, 6], [2, 2, 2, 2, 2], [0, 0, 0, 0, 0],
+          [0, 0, 0, 0, 0]],
+         [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
+          [0, 0, 0, 0, 0]]],
+        device=device)
     assert torch.all(idx == expected_idx)

     # test dilated ball query
     idx = ball_query(0.2, 0.4, 5, xyz, new_xyz)
-    expected_idx = torch.tensor([[[0, 5, 7, 0, 0], [6, 6, 6, 6, 6],
-                                  [2, 3, 2, 2, 2], [0, 5, 7, 0, 0],
-                                  [0, 5, 7, 0, 0]],
-                                 [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2],
-                                  [7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
-                                  [0, 0, 0, 0, 0]]]).cuda()
+    expected_idx = torch.tensor(
+        [[[0, 5, 7, 0, 0], [6, 6, 6, 6, 6], [2, 3, 2, 2, 2], [0, 5, 7, 0, 0],
+          [0, 5, 7, 0, 0]],
+         [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
+          [0, 0, 0, 0, 0]]],
+        device=device)
     assert torch.all(idx == expected_idx)
...
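The other mechanical change visible in this hunk is tensor placement: `.cuda()` calls become `device=device` at construction or `.to(device)` on an existing tensor, which is what lets one test body serve every backend. The two forms are equivalent:

import torch

device = 'cpu'  # any available backend: 'cpu', 'cuda', 'mlu', 'npu', ...
a = torch.tensor([[1., 2.], [3., 4.]], device=device)  # placed at creation
b = torch.tensor([[1., 2.], [3., 4.]]).to(device)      # moved afterwards
assert torch.equal(a, b)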
@@ -2,6 +2,7 @@
 import numpy as np
 import pytest
 import torch
+from mmengine.utils import digit_version

 from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE,
                         IS_NPU_AVAILABLE)
@@ -56,7 +57,9 @@ class TestBBox:
         pytest.param(
             'mps',
             marks=pytest.mark.skipif(
-                not IS_MPS_AVAILABLE, reason='requires MPS support')),
+                not IS_MPS_AVAILABLE
+                or digit_version(torch.__version__) >= digit_version('2.1.0'),
+                reason='requires MPS support')),
         pytest.param(
             'npu',
             marks=pytest.mark.skipif(
...
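The MPS case is now additionally version-gated: `digit_version` parses a version string into a comparable tuple, so the mark skips both when MPS is unavailable and on torch >= 2.1.0. A small sketch of the comparison by itself:

import torch
from mmengine.utils import digit_version

# Tuple-wise comparison avoids lexical pitfalls such as '1.10' < '1.9'.
new_torch = digit_version(torch.__version__) >= digit_version('2.1.0')
print('MPS case skipped' if new_torch else 'MPS case may run')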
@@ -3,11 +3,13 @@ import numpy as np
 import pytest
 import torch

+from mmcv.ops import box_iou_rotated
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE


 class TestBoxIoURotated:

     def test_box_iou_rotated_cpu(self):
-        from mmcv.ops import box_iou_rotated
         np_boxes1 = np.asarray(
             [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],
              [7.0, 7.0, 8.0, 8.0, 0.4]],
@@ -44,10 +46,21 @@ class TestBoxIoURotated:
         assert np.allclose(
             ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)

-    @pytest.mark.skipif(
-        not torch.cuda.is_available(), reason='requires CUDA support')
-    def test_box_iou_rotated_cuda(self):
-        from mmcv.ops import box_iou_rotated
+    @pytest.mark.parametrize('device', [
+        pytest.param(
+            'cuda',
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support')),
+        pytest.param(
+            'npu',
+            marks=pytest.mark.skipif(
+                not IS_NPU_AVAILABLE, reason='requires NPU support'))
+    ])
+    def test_box_iou_rotated(self, device):
         np_boxes1 = np.asarray(
             [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],
              [7.0, 7.0, 8.0, 8.0, 0.4]],
@@ -63,8 +76,8 @@ class TestBoxIoURotated:
         np_expect_ious_aligned = np.asarray([0.3708, 0.4487, 0.3622],
                                             dtype=np.float32)

-        boxes1 = torch.from_numpy(np_boxes1).cuda()
-        boxes2 = torch.from_numpy(np_boxes2).cuda()
+        boxes1 = torch.from_numpy(np_boxes1).to(device)
+        boxes2 = torch.from_numpy(np_boxes2).to(device)

         # test cw angle definition
         ious = box_iou_rotated(boxes1, boxes2)
@@ -85,7 +98,6 @@ class TestBoxIoURotated:
             ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)

     def test_box_iou_rotated_iof_cpu(self):
-        from mmcv.ops import box_iou_rotated
         np_boxes1 = np.asarray(
             [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],
              [7.0, 7.0, 8.0, 8.0, 0.4]],
@@ -121,10 +133,21 @@ class TestBoxIoURotated:
         assert np.allclose(
             ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)

-    @pytest.mark.skipif(
-        not torch.cuda.is_available(), reason='requires CUDA support')
-    def test_box_iou_rotated_iof_cuda(self):
-        from mmcv.ops import box_iou_rotated
+    @pytest.mark.parametrize('device', [
+        pytest.param(
+            'cuda',
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support')),
+        pytest.param(
+            'npu',
+            marks=pytest.mark.skipif(
+                not IS_NPU_AVAILABLE, reason='requires NPU support'))
+    ])
+    def test_box_iou_rotated_iof(self, device):
         np_boxes1 = np.asarray(
             [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],
              [7.0, 7.0, 8.0, 8.0, 0.4]],
@@ -140,8 +163,8 @@ class TestBoxIoURotated:
         np_expect_ious_aligned = np.asarray([0.4959, 0.5420, 0.4404],
                                             dtype=np.float32)

-        boxes1 = torch.from_numpy(np_boxes1).cuda()
-        boxes2 = torch.from_numpy(np_boxes2).cuda()
+        boxes1 = torch.from_numpy(np_boxes1).to(device)
+        boxes2 = torch.from_numpy(np_boxes2).to(device)

         # test cw angle definition
         ious = box_iou_rotated(boxes1, boxes2, mode='iof')
...
@@ -5,6 +5,11 @@ import torch
 from mmengine.utils import digit_version
 from mmengine.utils.dl_utils import TORCH_VERSION

+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
+
+if IS_MLU_AVAILABLE:
+    torch.backends.cnnl.allow_tf32 = False
+
 try:
     # If PyTorch version >= 1.6.0 and fp16 is enabled, torch.cuda.amp.autocast
     # would be imported and used; we should test if our modules support it.
@@ -45,7 +50,10 @@ class TestDeformconv:
                          im2col_step=2):
         if not torch.cuda.is_available() and device == 'cuda':
             pytest.skip('test requires GPU')
-        from mmcv.ops import DeformConv2dPack
+        if device == 'mlu':
+            from mmcv.ops import DeformConv2dPack_MLU as DeformConv2dPack
+        else:
+            from mmcv.ops import DeformConv2dPack
         c_in = 1
         c_out = 1
         batch_size = 10
@@ -69,6 +77,8 @@ class TestDeformconv:
             torch.Tensor(deform_weight).reshape(1, 1, 2, 2))
         if device == 'cuda':
             model.cuda()
+        elif device == 'mlu':
+            model.mlu()
         model.type(dtype)

         out = model(x)
@@ -108,6 +118,7 @@ class TestDeformconv:
     def _test_amp_deformconv(self,
                              input_dtype,
                              threshold=1e-3,
+                             device='cuda',
                              batch_size=10,
                              im2col_step=2):
         """The function to test amp released on pytorch 1.6.0.
@@ -120,15 +131,18 @@ class TestDeformconv:
             input_dtype: torch.float or torch.half.
             threshold: the same as above function.
         """
-        if not torch.cuda.is_available():
+        if not torch.cuda.is_available() and device == 'cuda':
             return
-        from mmcv.ops import DeformConv2dPack
+        if device == 'mlu':
+            from mmcv.ops import DeformConv2dPack_MLU as DeformConv2dPack
+        else:
+            from mmcv.ops import DeformConv2dPack
         c_in = 1
         c_out = 1
         repeated_input = np.repeat(input, batch_size, axis=0)
         repeated_gt_out = np.repeat(gt_out, batch_size, axis=0)
         repeated_gt_x_grad = np.repeat(gt_x_grad, batch_size, axis=0)
-        x = torch.Tensor(repeated_input).cuda().type(input_dtype)
+        x = torch.Tensor(repeated_input).to(device).type(input_dtype)
         x.requires_grad = True
         model = DeformConv2dPack(
             in_channels=c_in,
@@ -143,7 +157,10 @@ class TestDeformconv:
             torch.Tensor(offset_bias).reshape(8))
         model.weight.data = torch.nn.Parameter(
             torch.Tensor(deform_weight).reshape(1, 1, 2, 2))
-        model.cuda()
+        if device == 'cuda':
+            model.cuda()
+        elif device == 'mlu':
+            model.mlu()
         out = model(x)
         out.backward(torch.ones_like(out))
@@ -177,24 +194,65 @@ class TestDeformconv:
         with pytest.raises(AssertionError):
             model = DeformConv2d(3, 4, 3, groups=3)

-    def test_deformconv(self):
-        self._test_deformconv(torch.double, device='cpu')
-        self._test_deformconv(torch.float, device='cpu', threshold=1e-1)
-        self._test_deformconv(torch.double)
-        self._test_deformconv(torch.float)
-        self._test_deformconv(torch.half, threshold=1e-1)
+    @pytest.mark.parametrize('device, threshold', [
+        ('cpu', 1e-1),
+        pytest.param(
+            'cuda',
+            1e-3,
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            1e-3,
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support')),
+    ])
+    def test_deformconv_float(self, device, threshold):
+        self._test_deformconv(torch.float, device=device, threshold=threshold)
         # test batch_size < im2col_step
-        self._test_deformconv(torch.float, batch_size=1, im2col_step=2)
+        self._test_deformconv(
+            torch.float, batch_size=1, im2col_step=2, device=device)
         # test bach_size % im2col_step != 0
         with pytest.raises(
                 AssertionError,
                 match='batch size must be divisible by im2col_step'):
-            self._test_deformconv(torch.float, batch_size=10, im2col_step=3)
+            self._test_deformconv(
+                torch.float, batch_size=10, im2col_step=3, device=device)
+
+    @pytest.mark.parametrize('device', [
+        'cpu',
+        pytest.param(
+            'cuda',
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support')),
+    ])
+    def test_deformconv_double(self, device):
+        self._test_deformconv(torch.double, device=device)
+
+    @pytest.mark.parametrize('device, threshold', [
+        pytest.param(
+            'cuda',
+            1e-1,
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            1e-1,
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support')),
+    ])
+    def test_deformconv_half(self, device, threshold):
+        self._test_deformconv(torch.half, device=device, threshold=threshold)
         # test amp when torch version >= '1.6.0', the type of
         # input data for deformconv might be torch.float or torch.half
         if (TORCH_VERSION != 'parrots'
                 and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):
             with autocast(enabled=True):
-                self._test_amp_deformconv(torch.float, 1e-1)
-                self._test_amp_deformconv(torch.half, 1e-1)
+                self._test_amp_deformconv(
+                    torch.float, device=device, threshold=threshold)
+                self._test_amp_deformconv(
+                    torch.half, device=device, threshold=threshold)
...
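Importing `DeformConv2dPack_MLU` under the upstream name, as the hunks above do, keeps everything below the import device-agnostic. The same selection could be factored into a helper; a sketch (the helper name is hypothetical):

def get_deform_conv_cls(device):
    # Alias the device-specific class to one name so callers never branch.
    if device == 'mlu':
        from mmcv.ops import DeformConv2dPack_MLU as DeformConv2dPack
    else:
        from mmcv.ops import DeformConv2dPack
    return DeformConv2dPack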
@@ -4,11 +4,23 @@ import pytest
 import torch

 from mmcv.ops import diff_iou_rotated_2d, diff_iou_rotated_3d
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE

+if IS_MLU_AVAILABLE:
+    torch.backends.mlu.matmul.allow_tf32 = False

-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_diff_iou_rotated_2d():
+
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'mlu',
+        marks=pytest.mark.skipif(
+            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+])
+def test_diff_iou_rotated_2d(device):
     np_boxes1 = np.asarray([[[0.5, 0.5, 1., 1., .0], [0.5, 0.5, 1., 1., .0],
                              [0.5, 0.5, 1., 1., .0], [0.5, 0.5, 1., 1., .0],
                              [0.5, 0.5, 1., 1., .0]]],
@@ -19,17 +31,25 @@ def test_diff_iou_rotated_2d():
                              [1.5, 1.5, 1., 1., .0]]],
                             dtype=np.float32)

-    boxes1 = torch.from_numpy(np_boxes1).cuda()
-    boxes2 = torch.from_numpy(np_boxes2).cuda()
+    boxes1 = torch.from_numpy(np_boxes1).to(device)
+    boxes2 = torch.from_numpy(np_boxes2).to(device)

     np_expect_ious = np.asarray([[1., 1., .7071, 1 / 7, .0]])
     ious = diff_iou_rotated_2d(boxes1, boxes2)
     assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)

-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_diff_iou_rotated_3d():
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'mlu',
+        marks=pytest.mark.skipif(
+            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+])
+def test_diff_iou_rotated_3d(device):
     np_boxes1 = np.asarray(
         [[[.5, .5, .5, 1., 1., 1., .0], [.5, .5, .5, 1., 1., 1., .0],
           [.5, .5, .5, 1., 1., 1., .0], [.5, .5, .5, 1., 1., 1., .0],
@@ -41,8 +61,8 @@ def test_diff_iou_rotated_3d():
          [-1.5, -1.5, -1.5, 2.5, 2.5, 2.5, .0]]],
         dtype=np.float32)

-    boxes1 = torch.from_numpy(np_boxes1).cuda()
-    boxes2 = torch.from_numpy(np_boxes2).cuda()
+    boxes1 = torch.from_numpy(np_boxes1).to(device)
+    boxes2 = torch.from_numpy(np_boxes2).to(device)

     np_expect_ious = np.asarray([[1., .5, .7071, 1 / 15, .0]])
     ious = diff_iou_rotated_3d(boxes1, boxes2)
...
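The module-level switch above exists because TF32 matmul trades precision for speed, which would break comparisons against fp32 reference values at atol=1e-4. The commit applies the same guard in several files; the combined form, as used in the masked-conv hunk below, is:

import torch

from mmcv.utils import IS_MLU_AVAILABLE

if IS_MLU_AVAILABLE:
    # Keep MLU kernels in full float32 so fp32 expected values match.
    torch.backends.mlu.matmul.allow_tf32 = False
    torch.backends.cnnl.allow_tf32 = False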
@@ -2,6 +2,7 @@
 import pytest
 import torch
 from mmengine.utils import digit_version
+from mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch

 from mmcv.ops import filtered_lrelu

@@ -115,9 +116,8 @@ class TestFilteredLrelu:
         assert out.shape == (1, 3, 16, 16)

     @pytest.mark.skipif(
-        not torch.cuda.is_available()
-        # or digit_version(torch.version.cuda) < digit_version('10.2'),
-        or False,
+        not torch.cuda.is_available() or is_rocm_pytorch()
+        or digit_version(torch.version.cuda) < digit_version('10.2'),
         reason='requires cuda>=10.2')
     def test_filtered_lrelu_cuda(self):
         out = filtered_lrelu(self.input_tensor.cuda(), bias=self.bias.cuda())
...
@@ -3,16 +3,25 @@ import pytest
 import torch

 from mmcv.ops import grouping_operation
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE


-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
+])
 @pytest.mark.parametrize('dtype', [torch.half, torch.float, torch.double])
-def test_grouping_points(dtype):
+def test_grouping_points(dtype, device):
     idx = torch.tensor([[[0, 0, 0], [3, 3, 3], [8, 8, 8], [0, 0, 0], [0, 0, 0],
                          [0, 0, 0]],
                         [[0, 0, 0], [6, 6, 6], [9, 9, 9], [0, 0, 0], [0, 0, 0],
-                         [0, 0, 0]]]).int().cuda()
+                         [0, 0, 0]]]).int().to(device)
     features = torch.tensor([[[
         0.5798, -0.7981, -0.9280, -1.3311, 1.3687, 0.9277, -0.4164, -1.8274,
         0.9268, 0.8414
@@ -37,7 +46,7 @@ def test_grouping_points(dtype):
                               -0.6646, -0.6870, -0.1125, -0.2224, -0.3445,
                               -1.4049, 0.4990, -0.7037, -0.9924, 0.0386
                           ]]],
-                          dtype=dtype).cuda()
+                          dtype=dtype).to(device)
     output = grouping_operation(features, idx)
     expected_output = torch.tensor(
@@ -59,7 +68,7 @@ def test_grouping_points(dtype):
          [[-0.6646, -0.6646, -0.6646], [0.4990, 0.4990, 0.4990],
           [0.0386, 0.0386, 0.0386], [-0.6646, -0.6646, -0.6646],
           [-0.6646, -0.6646, -0.6646], [-0.6646, -0.6646, -0.6646]]]],
-        dtype=dtype).cuda()
+        dtype=dtype).to(device)
     assert torch.allclose(output, expected_output)
...
@@ -5,6 +5,10 @@ import torch
 from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE

+if IS_MLU_AVAILABLE:
+    torch.backends.cnnl.allow_tf32 = False
+    torch.backends.mlu.matmul.allow_tf32 = False
+

 class TestMaskedConv2d:
...
@@ -7,6 +7,8 @@ import torch
 from mmengine.utils import digit_version
 from mmengine.utils.dl_utils import TORCH_VERSION

+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
+
 try:
     # If PyTorch version >= 1.6.0 and fp16 is enabled, torch.cuda.amp.autocast
     # would be imported and used; we should test if our modules support it.
@@ -42,7 +44,12 @@ class TestMdconv:
     def _test_mdconv(self, dtype=torch.float, device='cuda'):
         if not torch.cuda.is_available() and device == 'cuda':
             pytest.skip('test requires GPU')
-        from mmcv.ops import ModulatedDeformConv2dPack
+        if device == 'mlu':
+            from mmcv.ops import \
+                ModulatedDeformConv2dPack_MLU as ModulatedDeformConv2dPack
+        else:
+            from mmcv.ops import ModulatedDeformConv2dPack
         input = torch.tensor(input_t, dtype=dtype, device=device)
         input.requires_grad = True
@@ -53,10 +60,7 @@ class TestMdconv:
             stride=1,
             padding=1,
             deform_groups=1,
-            bias=False)
-
-        if device == 'cuda':
-            dcn.cuda()
+            bias=False).to(device)
         dcn.weight.data.fill_(1.)
         dcn.type(dtype)
@@ -73,7 +77,7 @@ class TestMdconv:
         assert numpy.allclose(dcn.conv_offset.bias.grad.cpu().detach().numpy(),
                               dcn_offset_b_grad, 1e-2)

-    def _test_amp_mdconv(self, input_dtype=torch.float):
+    def _test_amp_mdconv(self, input_dtype=torch.float, device='cuda'):
         """The function to test amp released on pytorch 1.6.0.

         The type of input data might be torch.float or torch.half,
@@ -83,10 +87,15 @@ class TestMdconv:
         Args:
             input_dtype: torch.float or torch.half.
         """
-        if not torch.cuda.is_available():
+        if not torch.cuda.is_available() and device == 'cuda':
             return
-        from mmcv.ops import ModulatedDeformConv2dPack
-        input = torch.tensor(input_t).cuda().type(input_dtype)
+        if device == 'mlu':
+            from mmcv.ops import \
+                ModulatedDeformConv2dPack_MLU as ModulatedDeformConv2dPack
+        else:
+            from mmcv.ops import ModulatedDeformConv2dPack
+        input = torch.tensor(input_t).to(device).type(input_dtype)
         input.requires_grad = True

         dcn = ModulatedDeformConv2dPack(
@@ -96,7 +105,7 @@ class TestMdconv:
             stride=1,
             padding=1,
             deform_groups=1,
-            bias=False).cuda()
+            bias=False).to(device)
         dcn.weight.data.fill_(1.)
         output = dcn(input)
         output.sum().backward()
@@ -111,17 +120,50 @@ class TestMdconv:
         assert numpy.allclose(dcn.conv_offset.bias.grad.cpu().detach().numpy(),
                               dcn_offset_b_grad, 1e-2)

-    def test_mdconv(self):
-        self._test_mdconv(torch.double, device='cpu')
-        self._test_mdconv(torch.float, device='cpu')
-        self._test_mdconv(torch.double)
-        self._test_mdconv(torch.float)
-        self._test_mdconv(torch.half)
+    @pytest.mark.parametrize('device', [
+        'cpu',
+        pytest.param(
+            'cuda',
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support')),
+    ])
+    def test_mdconv_float(self, device):
+        self._test_mdconv(dtype=torch.float, device=device)
+
+    @pytest.mark.parametrize('device', [
+        'cpu',
+        pytest.param(
+            'cuda',
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support')),
+    ])
+    def test_mdconv_double(self, device):
+        self._test_mdconv(dtype=torch.double, device=device)
+
+    @pytest.mark.parametrize('device', [
+        pytest.param(
+            'cuda',
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support')),
+    ])
+    def test_mdconv_half(self, device):
+        self._test_mdconv(torch.half, device=device)
         # test amp when torch version >= '1.6.0', the type of
         # input data for mdconv might be torch.float or torch.half
         if (TORCH_VERSION != 'parrots'
                 and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):
             with autocast(enabled=True):
-                self._test_amp_mdconv(torch.float)
-                self._test_amp_mdconv(torch.half)
+                self._test_amp_mdconv(torch.float, device=device)
+                self._test_amp_mdconv(torch.half, device=device)
...
@@ -3,7 +3,7 @@ import numpy as np
 import pytest
 import torch

-from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE


 class TestNmsRotated:
@@ -16,7 +16,11 @@ class TestNmsRotated:
         pytest.param(
             'cuda',
             marks=pytest.mark.skipif(
-                not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support'))
     ])
     def test_ml_nms_rotated(self, device):
         from mmcv.ops import nms_rotated
@@ -58,7 +62,11 @@ class TestNmsRotated:
         pytest.param(
             'cuda',
             marks=pytest.mark.skipif(
-                not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support'))
     ])
     def test_nms_rotated(self, device):
         from mmcv.ops import nms_rotated
...
@@ -4,20 +4,29 @@ import pytest
 import torch

 from mmcv.ops import points_in_polygons
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE


-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_points_in_polygons():
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
+])
+def test_points_in_polygons(device):
     points = np.array([[300., 300.], [400., 400.], [100., 100], [300, 250],
                        [100, 0]])
     polygons = np.array([[200., 200., 400., 400., 500., 200., 400., 100.],
                          [400., 400., 500., 500., 600., 300., 500., 200.],
                          [300., 300., 600., 700., 700., 700., 700., 100.]])
     expected_output = np.array([[0., 0., 0.], [0., 0., 1.], [0., 0., 0.],
-                                [1., 0., 0.], [0., 0., 0.]])
-    points = torch.from_numpy(points).cuda().float()
-    polygons = torch.from_numpy(polygons).cuda().float()
-    expected_output = torch.from_numpy(expected_output).cuda().float()
-    assert torch.allclose(
-        points_in_polygons(points, polygons), expected_output, 1e-3)
+                                [1., 0., 0.], [0., 0., 0.]]).astype(np.float32)
+    points = torch.tensor(points, dtype=torch.float32, device=device)
+    polygons = torch.tensor(polygons, dtype=torch.float32, device=device)
+    assert np.allclose(
+        points_in_polygons(points, polygons).cpu().numpy(), expected_output,
+        1e-3)
...
@@ -3,7 +3,7 @@ import numpy as np
 import pytest
 import torch

-from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE

 _USING_PARROTS = True
 try:
@@ -93,6 +93,7 @@ def _test_roialign_allclose(device, dtype):
         x.grad.data.type(torch.float).cpu().numpy(), np_grad, atol=1e-3)

+@pytest.mark.parametrize('dtype', [torch.float, torch.half])
 @pytest.mark.parametrize('device', [
     'cpu',
     pytest.param(
@@ -102,19 +103,23 @@ def _test_roialign_allclose(device, dtype):
     pytest.param(
         'mlu',
         marks=pytest.mark.skipif(
-            not IS_MLU_AVAILABLE, reason='requires MLU support'))
-])
-@pytest.mark.parametrize('dtype', [
-    torch.float,
+            not IS_MLU_AVAILABLE, reason='requires MLU support')),
     pytest.param(
-        torch.double,
+        'npu',
         marks=pytest.mark.skipif(
-            IS_MLU_AVAILABLE,
-            reason='MLU does not support for 64-bit floating point')),
-    torch.half
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
 ])
-def test_roialign(device, dtype):
-    # check double only
-    if dtype is torch.double:
-        _test_roialign_gradcheck(device=device, dtype=dtype)
+def test_roialign_float(device, dtype):
     _test_roialign_allclose(device=device, dtype=dtype)
+
+
+@pytest.mark.parametrize('device', [
+    'cpu',
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+])
+def test_roialign_float64(device):
+    _test_roialign_allclose(device=device, dtype=torch.double)
+    _test_roialign_gradcheck(device=device, dtype=torch.double)
...
@@ -11,7 +11,6 @@ try:
 except ImportError:
     from torch.autograd import gradcheck
     _USING_PARROTS = False
-
 # yapf:disable
 inputs = [([[[[1., 2.], [3., 4.]]]],
            [[0., 0.5, 0.5, 1., 1., 0]]),
...
@@ -8,6 +8,13 @@ from mmcv.ops import (RoIAwarePool3d, points_in_boxes_all, points_in_boxes_cpu,
 from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE

+@pytest.mark.parametrize('dtype', [
+    torch.float, torch.half,
+    pytest.param(
+        torch.double,
+        marks=pytest.mark.skipif(
+            IS_MLU_AVAILABLE, reason='MLU does not support for double'))
+])
 @pytest.mark.parametrize('device', [
     pytest.param(
         'cuda',
@@ -18,13 +25,6 @@ from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
         marks=pytest.mark.skipif(
             not IS_MLU_AVAILABLE, reason='requires MLU support'))
 ])
-@pytest.mark.parametrize('dtype', [
-    torch.float, torch.half,
-    pytest.param(
-        torch.double,
-        marks=pytest.mark.skipif(
-            IS_MLU_AVAILABLE, reason='MLU does not support for double'))
-])
 def test_RoIAwarePool3d(device, dtype):
     roiaware_pool3d_max = RoIAwarePool3d(
         out_size=4, max_pts_per_voxel=128, mode='max')
...
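Stacking two parametrize decorators, as this hunk does, collects the cross-product of devices and dtypes, and a parameter-level skipif then removes unsupported combinations such as double on MLU. A minimal sketch of the cross-product behaviour:

import pytest


@pytest.mark.parametrize('dtype', ['float32', 'float16'])
@pytest.mark.parametrize('device', ['cpu', 'cuda'])
def test_cross_product(device, dtype):
    # Collected as four cases: every (device, dtype) combination.
    assert (device, dtype) in [(d, t) for d in ('cpu', 'cuda')
                               for t in ('float32', 'float16')]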
@@ -3,7 +3,7 @@ import pytest
 import torch

 from mmcv.ops import rotated_feature_align
-from mmcv.utils import IS_CUDA_AVAILABLE
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE


 @pytest.mark.skipif(
@@ -13,6 +13,10 @@ from mmcv.utils import IS_CUDA_AVAILABLE
         'cuda',
         marks=pytest.mark.skipif(
             not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'mlu',
+        marks=pytest.mark.skipif(
+            not IS_MLU_AVAILABLE, reason='requires MLU support')),
     pytest.param(
         'cpu',
         marks=pytest.mark.skipif(
...
@@ -4,22 +4,31 @@ import torch
 from torch.autograd import gradcheck

 from mmcv.ops import DynamicScatter
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE

 if torch.__version__ == 'parrots':
     pytest.skip('not supported in parrots now', allow_module_level=True)


-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_dynamic_scatter():
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'mlu',
+        marks=pytest.mark.skipif(
+            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+])
+def test_dynamic_scatter(device):
     dsmean = DynamicScatter([0.32, 0.32, 6],
                             [-74.88, -74.88, -2, 74.88, 74.88, 4], True)
     dsmax = DynamicScatter([0.32, 0.32, 6],
                            [-74.88, -74.88, -2, 74.88, 74.88, 4], False)

     # test empty input
-    empty_feats = torch.empty(size=(0, 3), dtype=torch.float32, device='cuda')
-    empty_coors = torch.empty(size=(0, 3), dtype=torch.int32, device='cuda')
+    empty_feats = torch.empty(size=(0, 3), dtype=torch.float32, device=device)
+    empty_coors = torch.empty(size=(0, 3), dtype=torch.int32, device=device)

     empty_feats.requires_grad_()
     empty_feats_out_mean, empty_coors_out_mean = dsmean(
@@ -35,9 +44,9 @@ def test_dynamic_scatter():

     # test empty reduced output
     empty_o_feats = torch.rand(
-        size=(200000, 3), dtype=torch.float32, device='cuda') * 100 - 50
+        size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50
     empty_o_coors = torch.randint(
-        low=-1, high=0, size=(200000, 3), dtype=torch.int32, device='cuda')
+        low=-1, high=0, size=(200000, 3), dtype=torch.int32, device=device)

     empty_o_feats.requires_grad_()
     empty_o_feats_out_mean, empty_o_coors_out_mean = dsmean(
@@ -52,9 +61,9 @@ def test_dynamic_scatter():

     # test non-empty input
     feats = torch.rand(
-        size=(200000, 3), dtype=torch.float32, device='cuda') * 100 - 50
+        size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50
     coors = torch.randint(
-        low=-1, high=20, size=(200000, 3), dtype=torch.int32, device='cuda')
+        low=-1, high=20, size=(200000, 3), dtype=torch.int32, device=device)

     ref_voxel_coors = coors.unique(dim=0, sorted=True)
     ref_voxel_coors = ref_voxel_coors[ref_voxel_coors.min(dim=-1).values >= 0]
@@ -88,9 +97,9 @@ def test_dynamic_scatter():

     # test non-empty input without any point out of bound
     feats = torch.rand(
-        size=(200000, 3), dtype=torch.float32, device='cuda') * 100 - 50
+        size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50
     coors = torch.randint(
-        low=0, high=20, size=(200000, 3), dtype=torch.int32, device='cuda')
+        low=0, high=20, size=(200000, 3), dtype=torch.int32, device=device)

     ref_voxel_coors = coors.unique(dim=0, sorted=True)
     ref_voxel_coors = ref_voxel_coors[ref_voxel_coors.min(dim=-1).values >= 0]
@@ -124,9 +133,11 @@ def test_dynamic_scatter():

     # test grad #
     feats = torch.rand(
-        size=(100, 4), dtype=torch.float32, device='cuda') * 100 - 50
+        size=(100, 4), dtype=torch.float32, device=device) * 100 - 50
     coors = torch.randint(
-        low=-1, high=3, size=(100, 3), dtype=torch.int32, device='cuda')
+        low=-1, high=3, size=(100, 3), dtype=torch.int32, device=device)

     feats.requires_grad_()
-    gradcheck(dsmean, (feats, coors), eps=1e-2, atol=1e-2, rtol=1e-5)
+    # TODO(Cambricon): mlu only support max reduce in current version.
+    if not IS_MLU_AVAILABLE:
+        gradcheck(dsmean, (feats, coors), eps=1e-2, atol=1e-2, rtol=1e-5)
     gradcheck(dsmax, (feats, coors), eps=1e-2, atol=1e-2, rtol=1e-5)
...
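gradcheck compares analytic gradients against finite differences; the loose eps/atol above reflect the float32 inputs these kernels require, while double precision is the usual choice when an op supports it. A standalone sketch on a toy function:

import torch
from torch.autograd import gradcheck

# Double-precision input; gradcheck perturbs it and compares the numerical
# gradients with the ones autograd computes.
x = torch.randn(4, 3, dtype=torch.double, requires_grad=True)
assert gradcheck(torch.nn.Linear(3, 2).double(), (x,), eps=1e-6, atol=1e-4)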
@@ -10,6 +10,8 @@ from mmcv.ops import (SparseConvTensor, SparseInverseConv3d, SparseSequential,
 if torch.__version__ == 'parrots':
     pytest.skip('not supported in parrots now', allow_module_level=True)

+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
+

 def make_sparse_convmodule(in_channels,
                            out_channels,
@@ -76,21 +78,29 @@ def make_sparse_convmodule(in_channels,
     return layers


-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_make_sparse_convmodule():
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'mlu',
+        marks=pytest.mark.skipif(
+            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+])
+def test_make_sparse_convmodule(device):
     torch.cuda.empty_cache()
     voxel_features = torch.tensor([[6.56126, 0.9648336, -1.7339306, 0.315],
                                    [6.8162713, -2.480431, -1.3616394, 0.36],
                                    [11.643568, -4.744306, -1.3580885, 0.16],
                                    [23.482342, 6.5036807, 0.5806964, 0.35]],
                                   dtype=torch.float32,
-                                  device='cuda')  # n, point_features
+                                  device=device)  # n, point_features
     coordinates = torch.tensor(
         [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
          [1, 35, 930, 469]],
         dtype=torch.int32,
-        device='cuda')  # n, 4(batch, ind_x, ind_y, ind_z)
+        device=device)  # n, 4(batch, ind_x, ind_y, ind_z)

     # test
     input_sp_tensor = SparseConvTensor(voxel_features, coordinates,
@@ -105,7 +115,7 @@ def test_make_sparse_convmodule():
             padding=0,
             conv_type='SubMConv3d',
             norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
-            order=('conv', 'norm', 'act')).cuda()
+            order=('conv', 'norm', 'act')).to(device)
     assert isinstance(sparse_block0[0], SubMConv3d)
     assert sparse_block0[0].in_channels == 4
     assert sparse_block0[0].out_channels == 16
@@ -118,16 +128,18 @@ def test_make_sparse_convmodule():
     out_features = sparse_block0(input_sp_tensor)
     assert out_features.features.shape == torch.Size([4, 16])

-    sparse_block1 = make_sparse_convmodule(
-        4,
-        16,
-        3,
-        'test1',
-        stride=1,
-        padding=0,
-        conv_type='SparseInverseConv3d',
-        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
-        order=('norm', 'act', 'conv')).cuda()
-    assert isinstance(sparse_block1[0], torch.nn.BatchNorm1d)
-    assert isinstance(sparse_block1[1], torch.nn.ReLU)
-    assert isinstance(sparse_block1[2], SparseInverseConv3d)
+    # device == mlu: not support inverse==1 yet
+    if device != 'mlu':
+        sparse_block1 = make_sparse_convmodule(
+            4,
+            16,
+            3,
+            'test1',
+            stride=1,
+            padding=0,
+            conv_type='SparseInverseConv3d',
+            norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+            order=('norm', 'act', 'conv')).to(device)
+        assert isinstance(sparse_block1[2], SparseInverseConv3d)
+        assert isinstance(sparse_block1[0], torch.nn.BatchNorm1d)
+        assert isinstance(sparse_block1[1], torch.nn.ReLU)
...
@@ -4,7 +4,7 @@ import pytest
 import torch

 from mmcv.ops import Voxelization
-from mmcv.utils import IS_NPU_AVAILABLE
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE


 def _get_voxel_points_indices(points, coors, voxel):
@@ -17,7 +17,7 @@ def _get_voxel_points_indices(points, coors, voxel):
     pytest.param(
         'cuda:0',
         marks=pytest.mark.skipif(
-            not torch.cuda.is_available(), reason='requires CUDA support'))
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
 ])
 def test_voxelization(device_type):
     voxel_size = [0.5, 0.5, 0.5]
@@ -63,8 +63,7 @@ def test_voxelization(device_type):
         assert num_points_current_voxel == expected_num_points_per_voxel[i]


-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
+@pytest.mark.skipif(not IS_CUDA_AVAILABLE, reason='requires CUDA support')
 def test_voxelization_nondeterministic():
     voxel_size = [0.5, 0.5, 0.5]
     point_cloud_range = [0, -40, -3, 70.4, 40, 1]
@@ -140,6 +139,49 @@ def test_voxelization_nondeterministic():
     assert len(coors_set) == len(coors) == len(coors_all_set)


+@pytest.mark.parametrize(
+    'device_type',
+    [
+        pytest.param(
+            # this is only used for dipu device testing case.
+            # dipu will mock to cuda automatically on mlu physical device.
+            'cuda:0',
+            marks=pytest.mark.skipif(
+                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not IS_MLU_AVAILABLE, reason='requires MLU support'))
+    ])
+def test_voxelization_mlu(device_type):
+    voxel_size = [0.5, 0.5, 0.5]
+    point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+    voxel_dict = np.load(
+        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()
+    expected_coors = voxel_dict['coors']
+    expected_voxels = voxel_dict['voxels']
+    expected_num_points_per_voxel = voxel_dict['num_points_per_voxel']
+    points = voxel_dict['points']
+
+    points = torch.tensor(points)
+    max_num_points = 1000
+    hard_voxelization = Voxelization(voxel_size, point_cloud_range,
+                                     max_num_points)
+    device = torch.device(device_type)
+
+    # test hard_voxelization on mlu
+    points = points.contiguous().to(device)
+    coors, voxels, num_points_per_voxel = hard_voxelization.forward(points)
+    coors = coors.cpu().detach().numpy()
+    voxels = voxels.cpu().detach().numpy()
+    num_points_per_voxel = num_points_per_voxel.cpu().detach().numpy()
+    assert np.all(coors == expected_coors)
+    assert np.all(voxels == expected_voxels)
+    assert np.all(num_points_per_voxel == expected_num_points_per_voxel)
+
+
 @pytest.mark.parametrize('device_type', [
     pytest.param(
         'npu',
...
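test_voxelization_mlu follows the saved-fixture pattern of the surrounding tests: one pickled dict in a .npy file carries both the inputs and the expected outputs, and device results are moved back to the CPU for exact comparison. A sketch of that round-trip with a synthetic fixture (the temporary path is hypothetical):

import numpy as np

# np.save can pickle a dict; load it back with allow_pickle=True and .item().
fixture = {'points': np.zeros((4, 3), np.float32),
           'coors': np.zeros((4, 3), np.int32)}
np.save('/tmp/test_voxel_fixture.npy', fixture)
loaded = np.load('/tmp/test_voxel_fixture.npy', allow_pickle=True).item()
assert np.all(loaded['coors'] == fixture['coors'])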