Unverified commit 834f94db, authored by ckirchhoff, committed by GitHub
Browse files

[Feature] Add support of some ops for Ascend device (#2594)



* [Feature]: add supports of gather_points, nms_rotated, bbox_overlaps for Ascend device

* Apply suggestions from code review

---------
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>
parent 8e2b2bf3
......@@ -7,7 +7,7 @@ We implement common ops used in detection, segmentation, etc.
| ActiveRotatedFilter | √ | √ | | | |
| AssignScoreWithK | | √ | | | |
| BallQuery | | √ | | | |
| BBoxOverlaps | | √ | √ | √ | |
| BBoxOverlaps | | √ | √ | √ | |
| BorderAlign | | √ | | | |
| BoxIouRotated | √ | √ | | | |
| BoxIouQuadri | √ | √ | | | |
......@@ -25,7 +25,7 @@ We implement common ops used in detection, segmentation, etc.
| FurthestPointSample | | √ | | | |
| FurthestPointSampleWithDist | | √ | | | |
| FusedBiasLeakyrelu | | √ | | | √ |
| GatherPoints | | √ | | | |
| GatherPoints | | √ | | | |
| GroupPoints | | √ | | | |
| Iou3d | | √ | √ | | |
| KNN | | √ | | | |
......@@ -35,7 +35,7 @@ We implement common ops used in detection, segmentation, etc.
| ModulatedDeformConv2d | √ | √ | | | √ |
| MultiScaleDeformableAttn | | √ | √ | | |
| NMS | √ | √ | √ | | √ |
| NMSRotated | √ | √ | | | |
| NMSRotated | √ | √ | | | |
| NMSQuadri | √ | √ | | | |
| PixelGroup | √ | | | | |
| PointsInBoxes | √ | √ | | | |
......
......@@ -7,7 +7,7 @@ MMCV 提供了检测、分割等任务中常用的算子
| ActiveRotatedFilter | √ | √ | | | |
| AssignScoreWithK | | √ | | | |
| BallQuery | | √ | | | |
| BBoxOverlaps | | √ | √ | √ | |
| BBoxOverlaps | | √ | √ | √ | |
| BorderAlign | | √ | | | |
| BoxIouRotated | √ | √ | | | |
| BoxIouQuadri | √ | √ | | | |
......@@ -25,7 +25,7 @@ MMCV 提供了检测、分割等任务中常用的算子
| FurthestPointSample | | √ | | | |
| FurthestPointSampleWithDist | | √ | | | |
| FusedBiasLeakyrelu | | √ | | | √ |
| GatherPoints | | √ | | | |
| GatherPoints | | √ | | | |
| GroupPoints | | √ | | | |
| Iou3d | | √ | √ | | |
| KNN | | √ | | | |
......@@ -35,7 +35,7 @@ MMCV 提供了检测、分割等任务中常用的算子
| ModulatedDeformConv2d | √ | √ | | | √ |
| MultiScaleDeformableAttn | | √ | √ | | |
| NMS | √ | √ | √ | | √ |
| NMSRotated | √ | √ | | | |
| NMSRotated | √ | √ | | | |
| NMSQuadri | √ | √ | | | |
| PixelGroup | √ | | | | |
| PointsInBoxes | √ | √ | | | |
......
......@@ -12,21 +12,32 @@ Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
const float iou_threshold, const int multi_label);
#endif
#ifdef MMCV_WITH_NPU
Tensor nms_rotated_npu(const Tensor dets, const Tensor scores,
const Tensor labels, const float iou_threshold);
#endif
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
const Tensor dets_sorted, const float iou_threshold,
const int multi_label) {
const Tensor dets_sorted, const Tensor labels,
const float iou_threshold, const int multi_label) {
assert(dets.device().is_cuda() == scores.device().is_cuda());
if (dets.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
return nms_rotated_cuda(dets, scores, order, dets_sorted, iou_threshold,
multi_label);
return nms_rotated_cuda(dets, scores, order, dets_sorted.contiguous(),
iou_threshold, multi_label);
#else
AT_ERROR("Not compiled with GPU support");
#endif
} else if (dets.device().type() == at::kXLA) {
#ifdef MMCV_WITH_NPU
return nms_rotated_npu(dets, scores, labels, iou_threshold);
#else
AT_ERROR("Not compiled with NPU support");
#endif
}
return nms_rotated_cpu(dets, scores, iou_threshold);
return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold);
}
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset);

// NPU implementation of bbox_overlaps: computes the overlap between two sets
// of boxes through the Ascend "Iou" operator.
//   mode 0 -> "iou" (intersection over union)
//   mode 1 -> "iof" (intersection over foreground)
// Results are written into `ious`.
void bbox_overlaps_npu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                       const int mode, const bool aligned, const int offset) {
  string modeStr = "iou";
  if (mode == 1) {
    modeStr = "iof";
  }
  // In aligned mode the operator expects the box tensors transposed.
  // Initialize directly from the ternary instead of allocating throw-away
  // ones_like() buffers that were immediately overwritten.
  at::Tensor bboxes = aligned ? bboxes2.transpose(0, 1) : bboxes2;
  at::Tensor gtboxes = aligned ? bboxes1.transpose(0, 1) : bboxes1;
  OpCommand cmd;
  cmd.Name("Iou")
      .Input(bboxes)
      .Input(gtboxes)
      .Output(ious)
      .Attr("mode", modeStr)
      // NOTE(review): `offset` is forwarded as the operator's "eps"
      // attribute — confirm this matches the Ascend "Iou" contract.
      .Attr("eps", (float)offset)
      .Attr("aligned", aligned)
      .Run();
}

REGISTER_NPU_IMPL(bbox_overlaps_impl, bbox_overlaps_npu);
......@@ -45,11 +45,11 @@ void deform_roi_pool_backward_npu(Tensor grad_output, Tensor input, Tensor rois,
int64_t sampling_ratio_ = (int64_t)sampling_ratio;
OpCommand cmd;
cmd.Name("DeformableRoiPoolGrad")
.Input(grad_input)
.Input(grad_output)
.Input(input)
.Input(rois)
.Input(offset)
.Output(grad_output)
.Output(grad_input)
.Output(grad_offset)
.Attr("output_size", output_size)
.Attr("spatial_scale", spatial_scale)
......
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

void gather_points_forward_impl(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out);

// Forward pass of gather_points on Ascend: gathers the entries of `points`
// selected by `idx` into `out` via the GatherV2 operator.
void gather_points_forward_npu(int b, int c, int n, int npoints,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  // The shape arguments (b, c, n, npoints) are not forwarded: GatherV2
  // derives all sizes from the tensors themselves.
  // On Ascend, GatherV2 gathers along axis 2 with one leading batch dim.
  c10::SmallVector<int64_t, N> gather_axis = {2};
  int64_t batch_dims = 1;
  OpCommand npu_cmd;
  npu_cmd.Name("GatherV2")
      .Input(points)
      .Input(idx)
      .Input(gather_axis)
      .Output(out)
      .Attr("batch_dims", batch_dims)
      .Run();
}

REGISTER_NPU_IMPL(gather_points_forward_impl, gather_points_forward_npu);
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;

// NMS for rotated boxes on Ascend via the "RotatedNMS" operator.
// dets: rotated boxes (first dim N), scores: per-box scores, labels: per-box
// class ids used for multi-class NMS. Returns the kept box indices as int64.
Tensor nms_rotated_npu(const Tensor dets, const Tensor scores,
                       const Tensor labels, const float iou_threshold) {
  // The NPU operator runs in fp32; cast inputs up front when needed.
  auto originDtype = dets.scalar_type();
  at::Tensor detsCast = dets;
  at::Tensor scoresCast = scores;
  if (originDtype != at::ScalarType::Float) {
    // NOTE(review): scores are cast based on dets' dtype — assumes dets and
    // scores always share a dtype; confirm with callers.
    detsCast = NPUNativeFunctions::npu_dtype_cast(dets, at::kFloat);
    scoresCast = NPUNativeFunctions::npu_dtype_cast(scores, at::kFloat);
  }
  // Index output is sized for the worst case: all dets.size(0) boxes kept.
  c10::SmallVector<int64_t, SIZE> selectedIndexSize = {dets.size(0)};
  at::Tensor selectedBox = OpPreparation::ApplyTensor(dets);
  at::Tensor selectedIndex = OpPreparation::ApplyTensor(
      selectedIndexSize, dets.options().dtype(at::kInt), dets);
  // Outputs 0 and 1 (boxes, indices) are marked for sync — presumably so the
  // dynamically-sized results are host-visible after Run(); verify against
  // OpCommand::Sync semantics.
  c10::SmallVector<int64_t, N> output_sync_idx = {0, 1};
  OpCommand cmd;
  cmd.Sync(output_sync_idx)
      .Name("RotatedNMS")
      .Input(detsCast)
      .Input(scoresCast)
      .Input(labels)
      .Output(selectedBox)
      .Output(selectedIndex)
      .Attr("iou_threshold", (float)iou_threshold)
      .Run();
  // Callers expect int64 indices, matching the CUDA/CPU implementations.
  selectedIndex = NPUNativeFunctions::npu_dtype_cast(selectedIndex, at::kLong);
  return selectedIndex;
}
......@@ -11,7 +11,6 @@ void roi_pool_forward_npu(Tensor input, Tensor rois, Tensor output,
int64_t pooled_channel = 1;
at::Tensor roi_actual_num = at_npu::native::OpPreparation::ApplyTensor(
{}, rois.options().dtype(at::kInt), rois);
OpCommand cmd;
cmd.Name("RoiPoolingWithArgMax")
.Input(input)
......@@ -27,8 +26,38 @@ void roi_pool_forward_npu(Tensor input, Tensor rois, Tensor output,
.Run();
}
// Backward pass of RoI pooling on Ascend via "RoiPoolingGradWithArgMax":
// routes grad_output back into grad_input using the argmax recorded by the
// forward pass.
void roi_pool_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax,
                           Tensor grad_input, int pooled_height,
                           int pooled_width, float spatial_scale) {
  // The NPU operator takes 64-bit integer attributes.
  int64_t pooled_height_64 = pooled_height;
  int64_t pooled_width_64 = pooled_width;
  int64_t pooled_channel = 1;
  // Scalar int tensor consumed as the "actual RoI count" input.
  at::Tensor roi_actual_num = at_npu::native::OpPreparation::ApplyTensor(
      {}, rois.options().dtype(at::kInt), rois);
  // Stand-in for the forward input tensor — looks like only its shape
  // matters since gradient routing is driven by argmax; NOTE(review):
  // confirm the operator ignores x's values.
  at::Tensor x = at::ones_like(grad_input);
  OpCommand cmd;
  cmd.Name("RoiPoolingGradWithArgMax")
      .Input(grad_output)
      .Input(x)
      .Input(rois)
      .Input(roi_actual_num)
      .Input(argmax)
      .Output(grad_input)
      .Attr("pooled_h", pooled_height_64)
      .Attr("pooled_w", pooled_width_64)
      .Attr("spatial_scale_h", spatial_scale)
      .Attr("spatial_scale_w", spatial_scale)
      .Attr("pool_channel", pooled_channel)
      .Run();
}

// Dispatch declarations resolved by the device registry.
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale);
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale);
// Register the NPU kernels as the implementations for the NPU device.
REGISTER_NPU_IMPL(roi_pool_forward_impl, roi_pool_forward_npu);
REGISTER_NPU_IMPL(roi_pool_backward_impl, roi_pool_backward_npu);
......@@ -309,8 +309,8 @@ void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
const Tensor dets_sorted, const float iou_threshold,
const int multi_label);
const Tensor dets_sorted, const Tensor labels,
const float iou_threshold, const int multi_label);
Tensor upfirdn2d(const Tensor &input, const Tensor &kernel, int up_x, int up_y,
int down_x, int down_y, int pad_x0, int pad_x1, int pad_y0,
......@@ -758,7 +758,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::arg("mode_flag"), py::arg("aligned"));
m.def("nms_rotated", &nms_rotated, "NMS for rotated boxes", py::arg("dets"),
py::arg("scores"), py::arg("order"), py::arg("dets_sorted"),
py::arg("iou_threshold"), py::arg("multi_label"));
py::arg("labels"), py::arg("iou_threshold"), py::arg("multi_label"));
m.def("ball_query_forward", &ball_query_forward, "ball_query_forward",
py::arg("new_xyz_tensor"), py::arg("xyz_tensor"), py::arg("idx_tensor"),
py::arg("b"), py::arg("n"), py::arg("m"), py::arg("min_radius"),
......
......@@ -406,6 +406,19 @@ def nms_rotated(dets: Tensor,
else:
dets_cw = dets
multi_label = labels is not None
if labels is None:
input_labels = scores.new_empty(0, dtype=torch.int)
else:
input_labels = labels
if dets.device.type == 'npu':
order = scores.new_empty(0, dtype=torch.long)
keep_inds = ext_module.nms_rotated(dets_cw, scores, order, dets_cw,
input_labels, iou_threshold,
multi_label)
dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)),
dim=1)
return dets, keep_inds
if multi_label:
dets_wl = torch.cat((dets_cw, labels.unsqueeze(1)), 1) # type: ignore
else:
......@@ -419,11 +432,13 @@ def nms_rotated(dets: Tensor,
scores,
order,
dets_sorted,
input_labels,
iou_threshold=iou_threshold,
multi_label=multi_label)
else:
keep_inds = ext_module.nms_rotated(dets_wl, scores, order, dets_sorted,
iou_threshold, multi_label)
input_labels, iou_threshold,
multi_label)
dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)),
dim=1)
return dets, keep_inds
......
......@@ -3,7 +3,8 @@ import numpy as np
import pytest
import torch
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE
from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE,
IS_NPU_AVAILABLE)
class TestBBox:
......@@ -47,7 +48,11 @@ class TestBBox:
pytest.param(
'mps',
marks=pytest.mark.skipif(
not IS_MPS_AVAILABLE, reason='requires MPS support'))
not IS_MPS_AVAILABLE, reason='requires MPS support')),
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_bbox_overlaps_float(self, device):
self._test_bbox_overlaps(device, dtype=torch.float)
......@@ -60,7 +65,11 @@ class TestBBox:
pytest.param(
'mlu',
marks=pytest.mark.skipif(
not IS_MLU_AVAILABLE, reason='requires MLU support'))
not IS_MLU_AVAILABLE, reason='requires MLU support')),
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_bbox_overlaps_half(self, device):
self._test_bbox_overlaps(device, dtype=torch.half)
......@@ -3,49 +3,65 @@ import pytest
import torch
from mmcv.ops import gather_points
from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE
@pytest.mark.skipif(
not torch.cuda.is_available(), reason='requires CUDA support')
def test_gather_points():
features = torch.tensor([[[
-1.6095, -0.1029, -0.8876, -1.2447, -2.4031, 0.3708, -1.1586, -1.4967,
-0.4800, 0.2252
],
[
1.9138, 3.4979, 1.6854, 1.5631, 3.6776,
3.1154, 2.1705, 2.5221, 2.0411, 3.1446
],
[
-1.4173, 0.3073, -1.4339, -1.4340, -1.2770,
-0.2867, -1.4162, -1.4044, -1.4245, -1.4074
]],
[[
0.2160, 0.0842, 0.3661, -0.2749, -0.4909,
-0.6066, -0.8773, -0.0745, -0.9496, 0.1434
],
[
1.3644, 1.8087, 1.6855, 1.9563, 1.2746,
1.9662, 0.9566, 1.8778, 1.1437, 1.3639
],
[
-0.7172, 0.1692, 0.2241, 0.0721, -0.7540,
0.0462, -0.6227, 0.3223, -0.6944, -0.5294
]]]).cuda()
class TestGatherPoints:
idx = torch.tensor([[0, 1, 4, 0, 0, 0], [0, 5, 6, 0, 0, 0]]).int().cuda()
@pytest.mark.parametrize('device', [
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_gather_points_all_close(self, device):
features = torch.tensor(
[[[
-1.6095, -0.1029, -0.8876, -1.2447, -2.4031, 0.3708, -1.1586,
-1.4967, -0.4800, 0.2252
],
[
1.9138, 3.4979, 1.6854, 1.5631, 3.6776, 3.1154, 2.1705,
2.5221, 2.0411, 3.1446
],
[
-1.4173, 0.3073, -1.4339, -1.4340, -1.2770, -0.2867, -1.4162,
-1.4044, -1.4245, -1.4074
]],
[[
0.2160, 0.0842, 0.3661, -0.2749, -0.4909, -0.6066, -0.8773,
-0.0745, -0.9496, 0.1434
],
[
1.3644, 1.8087, 1.6855, 1.9563, 1.2746, 1.9662, 0.9566,
1.8778, 1.1437, 1.3639
],
[
-0.7172, 0.1692, 0.2241, 0.0721, -0.7540, 0.0462, -0.6227,
0.3223, -0.6944, -0.5294
]]],
dtype=torch.float,
device=device)
idx = torch.tensor([[0, 1, 4, 0, 0, 0], [0, 5, 6, 0, 0, 0]],
dtype=torch.int32,
device=device)
output = gather_points(features, idx)
expected_output = torch.tensor(
[[[-1.6095, -0.1029, -2.4031, -1.6095, -1.6095, -1.6095],
[1.9138, 3.4979, 3.6776, 1.9138, 1.9138, 1.9138],
[-1.4173, 0.3073, -1.2770, -1.4173, -1.4173, -1.4173]],
[[0.2160, -0.6066, -0.8773, 0.2160, 0.2160, 0.2160],
[1.3644, 1.9662, 0.9566, 1.3644, 1.3644, 1.3644],
[-0.7172, 0.0462, -0.6227, -0.7172, -0.7172, -0.7172]]],
dtype=torch.float,
device=device)
output = gather_points(features, idx)
expected_output = torch.tensor(
[[[-1.6095, -0.1029, -2.4031, -1.6095, -1.6095, -1.6095],
[1.9138, 3.4979, 3.6776, 1.9138, 1.9138, 1.9138],
[-1.4173, 0.3073, -1.2770, -1.4173, -1.4173, -1.4173]],
[[0.2160, -0.6066, -0.8773, 0.2160, 0.2160, 0.2160],
[1.3644, 1.9662, 0.9566, 1.3644, 1.3644, 1.3644],
[-0.7172, 0.0462, -0.6227, -0.7172, -0.7172, -0.7172]]]).cuda()
assert torch.allclose(output, expected_output)
assert torch.allclose(output, expected_output)
# test fp16
output_half = gather_points(features.half(), idx)
assert torch.allclose(output_half, expected_output.half())
# test fp16
output_half = gather_points(features.half(), idx)
assert torch.allclose(output_half, expected_output.half())
......@@ -3,13 +3,22 @@ import numpy as np
import pytest
import torch
from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE
@pytest.mark.skipif(
not torch.cuda.is_available(),
reason='GPU is required to test NMSRotated op')
class TestNmsRotated:
def test_ml_nms_rotated(self):
@pytest.mark.parametrize('device', [
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support')),
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
])
def test_ml_nms_rotated(self, device):
from mmcv.ops import nms_rotated
np_boxes = np.array(
[[6.0, 3.0, 8.0, 7.0, 0.5, 0.7], [3.0, 6.0, 9.0, 11.0, 0.6, 0.8],
......@@ -24,8 +33,8 @@ class TestNmsRotated:
dtype=np.float32)
np_expect_keep_inds = np.array([3, 1, 0], dtype=np.int64)
boxes = torch.from_numpy(np_boxes).cuda()
labels = torch.from_numpy(np_labels).cuda()
boxes = torch.from_numpy(np_boxes).to(device)
labels = torch.from_numpy(np_labels).to(device)
# test cw angle definition
dets, keep_inds = nms_rotated(boxes[:, :5], boxes[:, -1], 0.5, labels)
......@@ -41,7 +50,17 @@ class TestNmsRotated:
assert np.allclose(dets.cpu().numpy()[:, :5], np_expect_dets)
assert np.allclose(keep_inds.cpu().numpy(), np_expect_keep_inds)
def test_nms_rotated(self):
@pytest.mark.parametrize('device', [
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support')),
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
])
def test_nms_rotated(self, device):
from mmcv.ops import nms_rotated
np_boxes = np.array(
[[6.0, 3.0, 8.0, 7.0, 0.5, 0.7], [3.0, 6.0, 9.0, 11.0, 0.6, 0.8],
......@@ -55,7 +74,7 @@ class TestNmsRotated:
dtype=np.float32)
np_expect_keep_inds = np.array([3, 1, 0], dtype=np.int64)
boxes = torch.from_numpy(np_boxes).cuda()
boxes = torch.from_numpy(np_boxes).to(device)
# test cw angle definition
dets, keep_inds = nms_rotated(boxes[:, :5], boxes[:, -1], 0.5)
......
......@@ -72,7 +72,6 @@ class TestRoiPool:
x = torch.tensor(
np_input, dtype=dtype, device=device, requires_grad=True)
rois = torch.tensor(np_rois, dtype=dtype, device=device)
output = roi_pool(x, rois, (pool_h, pool_w), spatial_scale)
output.backward(torch.ones_like(output))
assert np.allclose(output.data.cpu().numpy(), np_output, 1e-3)
......@@ -97,8 +96,8 @@ class TestRoiPool:
pytest.param(
torch.double,
marks=pytest.mark.skipif(
IS_MLU_AVAILABLE,
reason='MLU does not support for 64-bit floating point')),
IS_MLU_AVAILABLE or IS_NPU_AVAILABLE,
reason='MLU, NPU does not support for 64-bit floating point')),
torch.half
])
def test_roipool_allclose(self, device, dtype):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment