Unverified commit 834f94db, authored by ckirchhoff, committed by GitHub
Browse files

[Feature] Add support of some ops for Ascend device (#2594)



* [Feature]: add supports of gather_points, nms_rotated, bbox_overlaps for Ascend device

* Apply suggestions from code review

---------
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>
parent 8e2b2bf3
......@@ -7,7 +7,7 @@ We implement common ops used in detection, segmentation, etc.
| ActiveRotatedFilter | √ | √ | | | |
| AssignScoreWithK | | √ | | | |
| BallQuery | | √ | | | |
| BBoxOverlaps | | √ | √ | √ | |
| BBoxOverlaps | | √ | √ | √ | |
| BorderAlign | | √ | | | |
| BoxIouRotated | √ | √ | | | |
| BoxIouQuadri | √ | √ | | | |
......@@ -25,7 +25,7 @@ We implement common ops used in detection, segmentation, etc.
| FurthestPointSample | | √ | | | |
| FurthestPointSampleWithDist | | √ | | | |
| FusedBiasLeakyrelu | | √ | | | √ |
| GatherPoints | | √ | | | |
| GatherPoints | | √ | | | |
| GroupPoints | | √ | | | |
| Iou3d | | √ | √ | | |
| KNN | | √ | | | |
......@@ -35,7 +35,7 @@ We implement common ops used in detection, segmentation, etc.
| ModulatedDeformConv2d | √ | √ | | | √ |
| MultiScaleDeformableAttn | | √ | √ | | |
| NMS | √ | √ | √ | | √ |
| NMSRotated | √ | √ | | | |
| NMSRotated | √ | √ | | | |
| NMSQuadri | √ | √ | | | |
| PixelGroup | √ | | | | |
| PointsInBoxes | √ | √ | | | |
......
......@@ -7,7 +7,7 @@ MMCV 提供了检测、分割等任务中常用的算子
| ActiveRotatedFilter | √ | √ | | | |
| AssignScoreWithK | | √ | | | |
| BallQuery | | √ | | | |
| BBoxOverlaps | | √ | √ | √ | |
| BBoxOverlaps | | √ | √ | √ | |
| BorderAlign | | √ | | | |
| BoxIouRotated | √ | √ | | | |
| BoxIouQuadri | √ | √ | | | |
......@@ -25,7 +25,7 @@ MMCV 提供了检测、分割等任务中常用的算子
| FurthestPointSample | | √ | | | |
| FurthestPointSampleWithDist | | √ | | | |
| FusedBiasLeakyrelu | | √ | | | √ |
| GatherPoints | | √ | | | |
| GatherPoints | | √ | | | |
| GroupPoints | | √ | | | |
| Iou3d | | √ | √ | | |
| KNN | | √ | | | |
......@@ -35,7 +35,7 @@ MMCV 提供了检测、分割等任务中常用的算子
| ModulatedDeformConv2d | √ | √ | | | √ |
| MultiScaleDeformableAttn | | √ | √ | | |
| NMS | √ | √ | √ | | √ |
| NMSRotated | √ | √ | | | |
| NMSRotated | √ | √ | | | |
| NMSQuadri | √ | √ | | | |
| PixelGroup | √ | | | | |
| PointsInBoxes | √ | √ | | | |
......
......@@ -12,21 +12,32 @@ Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
const float iou_threshold, const int multi_label);
#endif
#ifdef MMCV_WITH_NPU
Tensor nms_rotated_npu(const Tensor dets, const Tensor scores,
const Tensor labels, const float iou_threshold);
#endif
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
const Tensor dets_sorted, const float iou_threshold,
const int multi_label) {
const Tensor dets_sorted, const Tensor labels,
const float iou_threshold, const int multi_label) {
assert(dets.device().is_cuda() == scores.device().is_cuda());
if (dets.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
return nms_rotated_cuda(dets, scores, order, dets_sorted, iou_threshold,
multi_label);
return nms_rotated_cuda(dets, scores, order, dets_sorted.contiguous(),
iou_threshold, multi_label);
#else
AT_ERROR("Not compiled with GPU support");
#endif
} else if (dets.device().type() == at::kXLA) {
#ifdef MMCV_WITH_NPU
return nms_rotated_npu(dets, scores, labels, iou_threshold);
#else
AT_ERROR("Not compiled with NPU support");
#endif
}
return nms_rotated_cpu(dets, scores, iou_threshold);
return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold);
}
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset);

// NPU implementation of bbox_overlaps: computes the overlap between two sets
// of boxes through the Ascend "Iou" operator.
//   mode 0 -> "iou" (intersection over union)
//   mode 1 -> "iof" (intersection over foreground)
// Results are written into `ious`.
void bbox_overlaps_npu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                       const int mode, const bool aligned, const int offset) {
  string modeStr = "iou";
  if (mode == 1) {
    modeStr = "iof";
  }
  // In aligned mode the operator expects the box tensors transposed.
  // Initialize directly from the ternary instead of allocating throw-away
  // ones_like() buffers that were immediately overwritten.
  at::Tensor bboxes = aligned ? bboxes2.transpose(0, 1) : bboxes2;
  at::Tensor gtboxes = aligned ? bboxes1.transpose(0, 1) : bboxes1;
  OpCommand cmd;
  cmd.Name("Iou")
      .Input(bboxes)
      .Input(gtboxes)
      .Output(ious)
      .Attr("mode", modeStr)
      // NOTE(review): `offset` is forwarded as the operator's "eps"
      // attribute — confirm this matches the Ascend "Iou" contract.
      .Attr("eps", (float)offset)
      .Attr("aligned", aligned)
      .Run();
}

REGISTER_NPU_IMPL(bbox_overlaps_impl, bbox_overlaps_npu);
......@@ -45,11 +45,11 @@ void deform_roi_pool_backward_npu(Tensor grad_output, Tensor input, Tensor rois,
int64_t sampling_ratio_ = (int64_t)sampling_ratio;
OpCommand cmd;
cmd.Name("DeformableRoiPoolGrad")
.Input(grad_input)
.Input(grad_output)
.Input(input)
.Input(rois)
.Input(offset)
.Output(grad_output)
.Output(grad_input)
.Output(grad_offset)
.Attr("output_size", output_size)
.Attr("spatial_scale", spatial_scale)
......
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;
using namespace std;

void gather_points_forward_impl(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out);

// Forward pass of gather_points on Ascend: gathers the entries of `points`
// selected by `idx` into `out` via the GatherV2 operator.
void gather_points_forward_npu(int b, int c, int n, int npoints,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  // The shape arguments (b, c, n, npoints) are not forwarded: GatherV2
  // derives all sizes from the tensors themselves.
  // On Ascend, GatherV2 gathers along axis 2 with one leading batch dim.
  c10::SmallVector<int64_t, N> gather_axis = {2};
  int64_t batch_dims = 1;
  OpCommand npu_cmd;
  npu_cmd.Name("GatherV2")
      .Input(points)
      .Input(idx)
      .Input(gather_axis)
      .Output(out)
      .Attr("batch_dims", batch_dims)
      .Run();
}

REGISTER_NPU_IMPL(gather_points_forward_impl, gather_points_forward_npu);
#include "pytorch_npu_helper.hpp"

using namespace NPU_NAME_SPACE;

// NMS for rotated boxes on Ascend via the "RotatedNMS" operator.
// dets: rotated boxes (first dim N), scores: per-box scores, labels: per-box
// class ids used for multi-class NMS. Returns the kept box indices as int64.
Tensor nms_rotated_npu(const Tensor dets, const Tensor scores,
                       const Tensor labels, const float iou_threshold) {
  // The NPU operator runs in fp32; cast inputs up front when needed.
  auto originDtype = dets.scalar_type();
  at::Tensor detsCast = dets;
  at::Tensor scoresCast = scores;
  if (originDtype != at::ScalarType::Float) {
    // NOTE(review): scores are cast based on dets' dtype — assumes dets and
    // scores always share a dtype; confirm with callers.
    detsCast = NPUNativeFunctions::npu_dtype_cast(dets, at::kFloat);
    scoresCast = NPUNativeFunctions::npu_dtype_cast(scores, at::kFloat);
  }
  // Index output is sized for the worst case: all dets.size(0) boxes kept.
  c10::SmallVector<int64_t, SIZE> selectedIndexSize = {dets.size(0)};
  at::Tensor selectedBox = OpPreparation::ApplyTensor(dets);
  at::Tensor selectedIndex = OpPreparation::ApplyTensor(
      selectedIndexSize, dets.options().dtype(at::kInt), dets);
  // Outputs 0 and 1 (boxes, indices) are marked for sync — presumably so the
  // dynamically-sized results are host-visible after Run(); verify against
  // OpCommand::Sync semantics.
  c10::SmallVector<int64_t, N> output_sync_idx = {0, 1};
  OpCommand cmd;
  cmd.Sync(output_sync_idx)
      .Name("RotatedNMS")
      .Input(detsCast)
      .Input(scoresCast)
      .Input(labels)
      .Output(selectedBox)
      .Output(selectedIndex)
      .Attr("iou_threshold", (float)iou_threshold)
      .Run();
  // Callers expect int64 indices, matching the CUDA/CPU implementations.
  selectedIndex = NPUNativeFunctions::npu_dtype_cast(selectedIndex, at::kLong);
  return selectedIndex;
}
......@@ -11,7 +11,6 @@ void roi_pool_forward_npu(Tensor input, Tensor rois, Tensor output,
int64_t pooled_channel = 1;
at::Tensor roi_actual_num = at_npu::native::OpPreparation::ApplyTensor(
{}, rois.options().dtype(at::kInt), rois);
OpCommand cmd;
cmd.Name("RoiPoolingWithArgMax")
.Input(input)
......@@ -27,8 +26,38 @@ void roi_pool_forward_npu(Tensor input, Tensor rois, Tensor output,
.Run();
}
// Backward pass of RoI pooling on Ascend via "RoiPoolingGradWithArgMax":
// routes grad_output back into grad_input using the argmax recorded by the
// forward pass.
void roi_pool_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax,
                           Tensor grad_input, int pooled_height,
                           int pooled_width, float spatial_scale) {
  // The NPU operator takes 64-bit integer attributes.
  int64_t pooled_height_64 = pooled_height;
  int64_t pooled_width_64 = pooled_width;
  int64_t pooled_channel = 1;
  // Scalar int tensor consumed as the "actual RoI count" input.
  at::Tensor roi_actual_num = at_npu::native::OpPreparation::ApplyTensor(
      {}, rois.options().dtype(at::kInt), rois);
  // Stand-in for the forward input tensor — looks like only its shape
  // matters since gradient routing is driven by argmax; NOTE(review):
  // confirm the operator ignores x's values.
  at::Tensor x = at::ones_like(grad_input);
  OpCommand cmd;
  cmd.Name("RoiPoolingGradWithArgMax")
      .Input(grad_output)
      .Input(x)
      .Input(rois)
      .Input(roi_actual_num)
      .Input(argmax)
      .Output(grad_input)
      .Attr("pooled_h", pooled_height_64)
      .Attr("pooled_w", pooled_width_64)
      .Attr("spatial_scale_h", spatial_scale)
      .Attr("spatial_scale_w", spatial_scale)
      .Attr("pool_channel", pooled_channel)
      .Run();
}

// Dispatch declarations resolved by the device registry.
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale);
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale);
// Register the NPU kernels as the implementations for the NPU device.
REGISTER_NPU_IMPL(roi_pool_forward_impl, roi_pool_forward_npu);
REGISTER_NPU_IMPL(roi_pool_backward_impl, roi_pool_backward_npu);
......@@ -309,8 +309,8 @@ void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
const Tensor dets_sorted, const float iou_threshold,
const int multi_label);
const Tensor dets_sorted, const Tensor labels,
const float iou_threshold, const int multi_label);
Tensor upfirdn2d(const Tensor &input, const Tensor &kernel, int up_x, int up_y,
int down_x, int down_y, int pad_x0, int pad_x1, int pad_y0,
......@@ -758,7 +758,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::arg("mode_flag"), py::arg("aligned"));
m.def("nms_rotated", &nms_rotated, "NMS for rotated boxes", py::arg("dets"),
py::arg("scores"), py::arg("order"), py::arg("dets_sorted"),
py::arg("iou_threshold"), py::arg("multi_label"));
py::arg("labels"), py::arg("iou_threshold"), py::arg("multi_label"));
m.def("ball_query_forward", &ball_query_forward, "ball_query_forward",
py::arg("new_xyz_tensor"), py::arg("xyz_tensor"), py::arg("idx_tensor"),
py::arg("b"), py::arg("n"), py::arg("m"), py::arg("min_radius"),
......
......@@ -406,6 +406,19 @@ def nms_rotated(dets: Tensor,
else:
dets_cw = dets
multi_label = labels is not None
if labels is None:
input_labels = scores.new_empty(0, dtype=torch.int)
else:
input_labels = labels
if dets.device.type == 'npu':
order = scores.new_empty(0, dtype=torch.long)
keep_inds = ext_module.nms_rotated(dets_cw, scores, order, dets_cw,
input_labels, iou_threshold,
multi_label)
dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)),
dim=1)
return dets, keep_inds
if multi_label:
dets_wl = torch.cat((dets_cw, labels.unsqueeze(1)), 1) # type: ignore
else:
......@@ -419,11 +432,13 @@ def nms_rotated(dets: Tensor,
scores,
order,
dets_sorted,
input_labels,
iou_threshold=iou_threshold,
multi_label=multi_label)
else:
keep_inds = ext_module.nms_rotated(dets_wl, scores, order, dets_sorted,
iou_threshold, multi_label)
input_labels, iou_threshold,
multi_label)
dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)),
dim=1)
return dets, keep_inds
......
......@@ -3,7 +3,8 @@ import numpy as np
import pytest
import torch
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE
from mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE,
IS_NPU_AVAILABLE)
class TestBBox:
......@@ -47,7 +48,11 @@ class TestBBox:
pytest.param(
'mps',
marks=pytest.mark.skipif(
not IS_MPS_AVAILABLE, reason='requires MPS support'))
not IS_MPS_AVAILABLE, reason='requires MPS support')),
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_bbox_overlaps_float(self, device):
self._test_bbox_overlaps(device, dtype=torch.float)
......@@ -60,7 +65,11 @@ class TestBBox:
pytest.param(
'mlu',
marks=pytest.mark.skipif(
not IS_MLU_AVAILABLE, reason='requires MLU support'))
not IS_MLU_AVAILABLE, reason='requires MLU support')),
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_bbox_overlaps_half(self, device):
self._test_bbox_overlaps(device, dtype=torch.half)
......@@ -3,49 +3,65 @@ import pytest
import torch
from mmcv.ops import gather_points
from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE
@pytest.mark.skipif(
not torch.cuda.is_available(), reason='requires CUDA support')
def test_gather_points():
features = torch.tensor([[[
-1.6095, -0.1029, -0.8876, -1.2447, -2.4031, 0.3708, -1.1586, -1.4967,
-0.4800, 0.2252
],
[
1.9138, 3.4979, 1.6854, 1.5631, 3.6776,
3.1154, 2.1705, 2.5221, 2.0411, 3.1446
],
[
-1.4173, 0.3073, -1.4339, -1.4340, -1.2770,
-0.2867, -1.4162, -1.4044, -1.4245, -1.4074
]],
[[
0.2160, 0.0842, 0.3661, -0.2749, -0.4909,
-0.6066, -0.8773, -0.0745, -0.9496, 0.1434
],
[
1.3644, 1.8087, 1.6855, 1.9563, 1.2746,
1.9662, 0.9566, 1.8778, 1.1437, 1.3639
],
[
-0.7172, 0.1692, 0.2241, 0.0721, -0.7540,
0.0462, -0.6227, 0.3223, -0.6944, -0.5294
]]]).cuda()
class TestGatherPoints:
idx = torch.tensor([[0, 1, 4, 0, 0, 0], [0, 5, 6, 0, 0, 0]]).int().cuda()
@pytest.mark.parametrize('device', [
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_gather_points_all_close(self, device):
features = torch.tensor(
[[[
-1.6095, -0.1029, -0.8876, -1.2447, -2.4031, 0.3708, -1.1586,
-1.4967, -0.4800, 0.2252
],
[
1.9138, 3.4979, 1.6854, 1.5631, 3.6776, 3.1154, 2.1705,
2.5221, 2.0411, 3.1446
],
[
-1.4173, 0.3073, -1.4339, -1.4340, -1.2770, -0.2867, -1.4162,
-1.4044, -1.4245, -1.4074
]],
[[
0.2160, 0.0842, 0.3661, -0.2749, -0.4909, -0.6066, -0.8773,
-0.0745, -0.9496, 0.1434
],
[
1.3644, 1.8087, 1.6855, 1.9563, 1.2746, 1.9662, 0.9566,
1.8778, 1.1437, 1.3639
],
[
-0.7172, 0.1692, 0.2241, 0.0721, -0.7540, 0.0462, -0.6227,
0.3223, -0.6944, -0.5294
]]],
dtype=torch.float,
device=device)
idx = torch.tensor([[0, 1, 4, 0, 0, 0], [0, 5, 6, 0, 0, 0]],
dtype=torch.int32,
device=device)
output = gather_points(features, idx)
expected_output = torch.tensor(
[[[-1.6095, -0.1029, -2.4031, -1.6095, -1.6095, -1.6095],
[1.9138, 3.4979, 3.6776, 1.9138, 1.9138, 1.9138],
[-1.4173, 0.3073, -1.2770, -1.4173, -1.4173, -1.4173]],
[[0.2160, -0.6066, -0.8773, 0.2160, 0.2160, 0.2160],
[1.3644, 1.9662, 0.9566, 1.3644, 1.3644, 1.3644],
[-0.7172, 0.0462, -0.6227, -0.7172, -0.7172, -0.7172]]],
dtype=torch.float,
device=device)
output = gather_points(features, idx)
expected_output = torch.tensor(
[[[-1.6095, -0.1029, -2.4031, -1.6095, -1.6095, -1.6095],
[1.9138, 3.4979, 3.6776, 1.9138, 1.9138, 1.9138],
[-1.4173, 0.3073, -1.2770, -1.4173, -1.4173, -1.4173]],
[[0.2160, -0.6066, -0.8773, 0.2160, 0.2160, 0.2160],
[1.3644, 1.9662, 0.9566, 1.3644, 1.3644, 1.3644],
[-0.7172, 0.0462, -0.6227, -0.7172, -0.7172, -0.7172]]]).cuda()
assert torch.allclose(output, expected_output)
assert torch.allclose(output, expected_output)
# test fp16
output_half = gather_points(features.half(), idx)
assert torch.allclose(output_half, expected_output.half())
# test fp16
output_half = gather_points(features.half(), idx)
assert torch.allclose(output_half, expected_output.half())
......@@ -3,13 +3,22 @@ import numpy as np
import pytest
import torch
from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE
@pytest.mark.skipif(
not torch.cuda.is_available(),
reason='GPU is required to test NMSRotated op')
class TestNmsRotated:
def test_ml_nms_rotated(self):
@pytest.mark.parametrize('device', [
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support')),
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
])
def test_ml_nms_rotated(self, device):
from mmcv.ops import nms_rotated
np_boxes = np.array(
[[6.0, 3.0, 8.0, 7.0, 0.5, 0.7], [3.0, 6.0, 9.0, 11.0, 0.6, 0.8],
......@@ -24,8 +33,8 @@ class TestNmsRotated:
dtype=np.float32)
np_expect_keep_inds = np.array([3, 1, 0], dtype=np.int64)
boxes = torch.from_numpy(np_boxes).cuda()
labels = torch.from_numpy(np_labels).cuda()
boxes = torch.from_numpy(np_boxes).to(device)
labels = torch.from_numpy(np_labels).to(device)
# test cw angle definition
dets, keep_inds = nms_rotated(boxes[:, :5], boxes[:, -1], 0.5, labels)
......@@ -41,7 +50,17 @@ class TestNmsRotated:
assert np.allclose(dets.cpu().numpy()[:, :5], np_expect_dets)
assert np.allclose(keep_inds.cpu().numpy(), np_expect_keep_inds)
def test_nms_rotated(self):
@pytest.mark.parametrize('device', [
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support')),
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
])
def test_nms_rotated(self, device):
from mmcv.ops import nms_rotated
np_boxes = np.array(
[[6.0, 3.0, 8.0, 7.0, 0.5, 0.7], [3.0, 6.0, 9.0, 11.0, 0.6, 0.8],
......@@ -55,7 +74,7 @@ class TestNmsRotated:
dtype=np.float32)
np_expect_keep_inds = np.array([3, 1, 0], dtype=np.int64)
boxes = torch.from_numpy(np_boxes).cuda()
boxes = torch.from_numpy(np_boxes).to(device)
# test cw angle definition
dets, keep_inds = nms_rotated(boxes[:, :5], boxes[:, -1], 0.5)
......
......@@ -72,7 +72,6 @@ class TestRoiPool:
x = torch.tensor(
np_input, dtype=dtype, device=device, requires_grad=True)
rois = torch.tensor(np_rois, dtype=dtype, device=device)
output = roi_pool(x, rois, (pool_h, pool_w), spatial_scale)
output.backward(torch.ones_like(output))
assert np.allclose(output.data.cpu().numpy(), np_output, 1e-3)
......@@ -97,8 +96,8 @@ class TestRoiPool:
pytest.param(
torch.double,
marks=pytest.mark.skipif(
IS_MLU_AVAILABLE,
reason='MLU does not support for 64-bit floating point')),
IS_MLU_AVAILABLE or IS_NPU_AVAILABLE,
reason='MLU, NPU does not support for 64-bit floating point')),
torch.half
])
def test_roipool_allclose(self, device, dtype):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment