Unverified commit 733e6ff8, authored by bdf, committed by GitHub

Pick MLU modifications from master (1.x) to main (2.x) (#2704)



* [Feature] Support Voxelization with cambricon MLU device (#2500)

* [Feature] Support hard_voxelize with cambricon MLU backend

* [Feature](bangc-ops): add voxelization op

* [Feature](bangc-ops): add voxelization op

* [Feature](bangc-ops): add voxelization op

* [Feature](bangc-ops): add voxelization op

* [Feature](bangc-ops): add voxelization op

* [Feature](bangc-ops): add voxelization op

* [Feature](bangc-ops): add voxelization op

* [Feature](bangc-ops): add voxelization op
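
For context, the op added above is exposed as `mmcv.ops.Voxelization`. A minimal sketch of the MLU path, assuming an mmcv-full build with Cambricon support and `torch_mlu` installed (the random input is purely illustrative):

```python
import torch
from mmcv.ops import Voxelization

# Hard voxelization keeps at most `max_num_points` points per voxel.
# Moving the input to an MLU tensor dispatches to the MLU kernel.
voxel_layer = Voxelization(
    voxel_size=[0.5, 0.5, 0.5],
    point_cloud_range=[0, -40, -3, 70.4, 40, 1],
    max_num_points=32)
points = torch.rand(1000, 4).to('mlu')  # (N, >=3): x, y, z + extra features
# Hard voxelization returns (voxels, coors, num_points_per_voxel)
voxels, coors, num_points_per_voxel = voxel_layer(points)
```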

* [Enhance] Optimize the performance of ms_deform_attn for MLU device (#2510)

* ms_opt

* ms_opt

* ms_opt

* ms_opt

* ms_opt

* [Feature] ms_deform_attn performance optimization

* [Feature] ms_deform_attn performance optimization

* [Feature] ms_deform_attn performance optimization
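
The kernel tuned above backs `MultiScaleDeformableAttnFunction` in `mmcv.ops`. A rough sketch of a forward call on MLU, assuming an MLU-enabled build; the sizes are made up and the weight normalization is simplified:

```python
import torch
from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttnFunction

bs, num_queries, num_heads, embed_dims = 2, 100, 8, 256
num_levels, num_points = 4, 4
# Spatial shape (H, W) of each feature level and the flattened start offsets.
shapes = torch.tensor([[60, 40], [30, 20], [15, 10], [8, 5]], dtype=torch.long)
num_keys = int((shapes[:, 0] * shapes[:, 1]).sum())
level_start_index = torch.cat(
    (shapes.new_zeros(1), (shapes[:, 0] * shapes[:, 1]).cumsum(0)[:-1]))

device = 'mlu'  # requires mmcv built with MLU ops and torch_mlu installed
value = torch.rand(bs, num_keys, num_heads, embed_dims // num_heads).to(device)
sampling_locations = torch.rand(bs, num_queries, num_heads, num_levels,
                                num_points, 2).to(device)
attention_weights = torch.rand(bs, num_queries, num_heads, num_levels,
                               num_points).to(device).softmax(-1)

output = MultiScaleDeformableAttnFunction.apply(
    value, shapes.to(device), level_start_index.to(device),
    sampling_locations, attention_weights, 64)  # im2col_step=64
# output: (bs, num_queries, embed_dims)
```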

* [Feature] Support ball_query with cambricon MLU backend and mlu-ops library. (#2520)

* [Feature] Support ball_query with cambricon MLU backend and mlu-ops library.

* [Fix] update operator data layout setting.

* [Fix] add cxx compile option to avoid symbol conflict.

* [Fix] fix lint errors.

* [Fix] update ops.md with info of ball_query support by MLU backend.

* [Feature] Fix typo.

* [Fix] Remove print.

* [Fix] get mlu-ops from MMCV_MLU_OPS_PATH env.

* [Fix] update MMCV_MLU_OPS_PATH check logic.

* [Fix] update error info when failed to download mlu-ops.

* [Fix] check mlu-ops version matching info in mmcv.

* [Fix] revise wrong filename.

* [Fix] remove f.close and re.
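
As the updated test further down shows, the op's signature is `ball_query(min_radius, max_radius, sample_num, xyz, center_xyz)`; a `min_radius` of 0 gives the classic ball query, a positive one the dilated variant. A minimal sketch on MLU with illustrative random data:

```python
import torch
from mmcv.ops import ball_query

xyz = torch.rand(2, 10, 3).to('mlu')        # (B, N, 3) candidate points
center_xyz = torch.rand(2, 5, 3).to('mlu')  # (B, npoint, 3) query centers

# Indices of up to 5 neighbors per center with 0.2 < dist <= 0.4,
# i.e. the dilated variant; result shape: (B, npoint, 5)
idx = ball_query(0.2, 0.4, 5, xyz, center_xyz)
```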

* [Docs] Steps to compile mmcv-full on MLU machine (#2571)

* [Docs] Steps to compile mmcv-full on MLU machine

* [Docs] Adjust paragraph order

* Update docs/zh_cn/get_started/build.md
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>

* Update docs/zh_cn/get_started/build.md
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>

* Update docs/en/get_started/build.md
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>

* Update docs/en/get_started/build.md
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>

* [Docs] Modify the format

---------
Co-authored-by: budefei <budefei@cambricon.com>
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>
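
A quick smoke test after following the compile steps (assuming the Cambricon PyTorch plugin is importable as `torch_mlu`):

```python
# Verify that the freshly built mmcv-full sees the MLU device.
import torch  # noqa: F401
import torch_mlu  # noqa: F401  (Cambricon plugin; registers the 'mlu' device)
from mmcv.utils import IS_MLU_AVAILABLE

print('MLU available:', IS_MLU_AVAILABLE)
```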

* [Fix] Fix tensor descriptor setting in MLU ball_query. (#2579)

* [Feature] Add MLU support for Sparse Convolution op (#2589)

* [Feature] Add sparse convolution MLU API

* [Feature] update cpp code style

* end-of-file

* delete libext.a

* code style

* update ops.md

---------
Co-authored-by: budefei <budefei@cambricon.com>
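
The sparse ops keep the same Python surface across devices. A hedged sketch of a submanifold convolution on MLU, modeled on the test updated below; the spatial shape and channel sizes are illustrative:

```python
import torch
from mmcv.ops import SparseConvTensor, SubMConv3d

device = 'mlu'  # or 'cuda'; requires an mmcv build with the matching backend
features = torch.rand(4, 4, device=device)  # (num_active, in_channels)
indices = torch.tensor(
    [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
     [1, 35, 930, 469]],
    dtype=torch.int32,
    device=device)  # (num_active, 4): batch index + 3 spatial indices
x = SparseConvTensor(features, indices, spatial_shape=[41, 1600, 1408],
                     batch_size=2)
# Submanifold conv: output stays on the same active sites as the input.
conv = SubMConv3d(4, 16, kernel_size=3, padding=1, indice_key='subm1').to(device)
out = conv(x)
assert out.features.shape == (4, 16)
```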

* [Enhancement] Replace the implementation of deform_roi_pool with mlu-ops (#2598)

* [Feature] Replace the implementation of deform_roi_pool with mlu-ops

* [Feature] Modify code

---------
Co-authored-by: budefei <budefei@cambricon.com>
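
For reference, the functional entry point is `mmcv.ops.deform_roi_pool`. A hedged sketch of a call that now routes through mlu-ops; the tensor sizes are made up, and an all-zero offset reduces the op to plain aligned RoI pooling:

```python
import torch
from mmcv.ops import deform_roi_pool

device = 'mlu'
x = torch.rand(1, 16, 32, 32, device=device)  # (N, C, H, W) feature map
# (n, 5) RoIs: batch index, x1, y1, x2, y2
rois = torch.tensor([[0., 0., 0., 15., 15.]], device=device)
offset = torch.zeros(1, 2, 7, 7, device=device)  # (n, 2, out_h, out_w)

# Positional arguments only: deform_roi_pool is an autograd Function's .apply;
# trailing defaults are spatial_scale=1.0, sampling_ratio=0, gamma=0.1.
out = deform_roi_pool(x, rois, offset, (7, 7))
assert out.shape == (1, 16, 7, 7)
```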

* [Enhancement] ms_deform_attn performance optimization (#2616)

* ms_opt_v2

* ms_opt_v2_1

* optimize MultiScaleDeformableAttention ops for MLU

* ms_opt_v2_1

* [Feature] ms_deform_attn performance optimization V2

* [Feature] ms_deform_attn performance optimization V2

* [Feature] ms_deform_attn performance optimization V2

* [Feature] ms_deform_attn performance optimization V2

* [Feature] ms_deform_attn performance optimization V2

* [Feature] ms_deform_attn performance optimization V2

* [Feature] ms_deform_attn performance optimization V2

---------
Co-authored-by: dongchengwei <dongchengwei@cambricon.com>

* [Feature] Support NmsRotated with cambricon MLU backend (#2643)

* [Feature] Support NmsRotated with cambricon MLU backend

* [Feature] remove foolproofs in nms_rotated_mlu.cpp

* [Feature] fix lint in test_nms_rotated.py

* [Feature] fix kMLU not found in nms_rotated.cpp

* [Feature] modify mlu support in nms.py

* [Feature] modify nms_rotated support in ops.md

* [Feature] modify ops/nms.py
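
The op added above consumes `(cx, cy, w, h, angle)` boxes. A minimal sketch on MLU; the boxes are illustrative:

```python
import torch
from mmcv.ops import nms_rotated

device = 'mlu'
# (n, 5) rotated boxes: center x, center y, width, height, angle
dets = torch.tensor([[50., 50., 30., 20., 0.0],
                     [50., 50., 30., 20., 0.1],
                     [150., 120., 40., 30., 0.5]], device=device)
scores = torch.tensor([0.9, 0.8, 0.7], device=device)

# Returns the kept boxes with their score appended, plus the kept indices.
kept_dets, keep_inds = nms_rotated(dets, scores, 0.5)
```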

* [Enhance] Add a default value for MMCV_MLU_ARGS (#2688)

* add mlu_args

* add mlu_args

* Modify the code

---------
Co-authored-by: budefei <budefei@cambricon.com>
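
`MMCV_MLU_ARGS` passes extra flags to the BANG compiler when the MLU extensions are built. A hedged sketch of the read-with-default pattern; the flag below is an example value, not necessarily the default this PR chose:

```python
import os

# Honor a user-provided MMCV_MLU_ARGS, falling back to a default arch flag.
# '--bang-mlu-arch=mtp_372' is an illustrative value, not the PR's actual default.
mlu_args = os.getenv('MMCV_MLU_ARGS', '--bang-mlu-arch=mtp_372')
cncc_flags = mlu_args.split()
```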

* [Enhance] Ignore mlu-ops files (#2691)
Co-authored-by: budefei <budefei@cambricon.com>

---------
Co-authored-by: ZShaopeng <108382403+ZShaopeng@users.noreply.github.com>
Co-authored-by: BinZheng <38182684+Wickyzheng@users.noreply.github.com>
Co-authored-by: liuduanhui <103939338+DanieeelLiu@users.noreply.github.com>
Co-authored-by: budefei <budefei@cambricon.com>
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>
Co-authored-by: duzekun <108381389+duzekunKTH@users.noreply.github.com>
Co-authored-by: dongchengwei <dongchengwei@cambricon.com>
Co-authored-by: liuyuan1-v <125547457+liuyuan1-v@users.noreply.github.com>
parent 1f161f68
The ball_query test, previously CUDA-only and moving tensors with `.cuda()`, is rewritten to parametrize over both devices. Updated version:

@@ -3,55 +3,59 @@
import pytest
import torch

from mmcv.ops import ball_query
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support'))
])
def test_ball_query(device):
    new_xyz = torch.tensor(
        [[[-0.0740, 1.3147, -1.3625], [-2.2769, 2.7817, -0.2334],
          [-0.4003, 2.4666, -0.5116], [-0.0740, 1.3147, -1.3625],
          [-0.0740, 1.3147, -1.3625]],
         [[-2.0289, 2.4952, -0.1708], [-2.0668, 6.0278, -0.4875],
          [0.4066, 1.4211, -0.2947], [-2.0289, 2.4952, -0.1708],
          [-2.0289, 2.4952, -0.1708]]],
        device=device)

    xyz = torch.tensor(
        [[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],
          [-0.4003, 2.4666, -0.5116], [-0.5251, 2.4379, -0.8466],
          [-0.9691, 1.1418, -1.3733], [-0.2232, 0.9561, -1.3626],
          [-2.2769, 2.7817, -0.2334], [-0.2822, 1.3192, -1.3645],
          [0.1533, 1.5024, -1.0432], [0.4917, 1.1529, -1.3496]],
         [[-2.0289, 2.4952, -0.1708], [-0.7188, 0.9956, -0.5096],
          [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],
          [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],
          [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],
          [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856, -1.2000]]],
        device=device)

    idx = ball_query(0, 0.2, 5, xyz, new_xyz)
    expected_idx = torch.tensor(
        [[[0, 0, 0, 0, 0], [6, 6, 6, 6, 6], [2, 2, 2, 2, 2], [0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0]],
         [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0]]],
        device=device)
    assert torch.all(idx == expected_idx)

    # test dilated ball query
    idx = ball_query(0.2, 0.4, 5, xyz, new_xyz)
    expected_idx = torch.tensor(
        [[[0, 5, 7, 0, 0], [6, 6, 6, 6, 6], [2, 3, 2, 2, 2], [0, 5, 7, 0, 0],
          [0, 5, 7, 0, 0]],
         [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0]]],
        device=device)
    assert torch.all(idx == expected_idx)
The nms_rotated tests add `IS_MLU_AVAILABLE` to the imports and an 'mlu' case to both device parametrizations. Updated hunks:

@@ -3,7 +3,7 @@ import numpy as np
import pytest
import torch

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE


class TestNmsRotated:

@@ -16,7 +16,11 @@ class TestNmsRotated:
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support'))
    ])
    def test_ml_nms_rotated(self, device):
        from mmcv.ops import nms_rotated

@@ -58,7 +62,11 @@ class TestNmsRotated:
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support'))
    ])
    def test_nms_rotated(self, device):
        from mmcv.ops import nms_rotated
...
The sparse convolution test gets the same treatment: the availability flags are imported, `test_make_sparse_convmodule` is parametrized over 'cuda' and 'mlu', tensors are created with `device=` and modules moved with `.to(device)`, and the `SparseInverseConv3d` block is skipped on MLU, which does not support it yet. Updated hunks:

@@ -10,6 +10,8 @@
if torch.__version__ == 'parrots':
    pytest.skip('not supported in parrots now', allow_module_level=True)

from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE


def make_sparse_convmodule(in_channels,
                           out_channels,

@@ -76,21 +78,29 @@ def make_sparse_convmodule(in_channels,
    return layers


@pytest.mark.parametrize('device', [
    pytest.param(
        'cuda',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support'))
])
def test_make_sparse_convmodule(device):
    torch.cuda.empty_cache()
    voxel_features = torch.tensor([[6.56126, 0.9648336, -1.7339306, 0.315],
                                   [6.8162713, -2.480431, -1.3616394, 0.36],
                                   [11.643568, -4.744306, -1.3580885, 0.16],
                                   [23.482342, 6.5036807, 0.5806964, 0.35]],
                                  dtype=torch.float32,
                                  device=device)  # n, point_features
    coordinates = torch.tensor(
        [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
         [1, 35, 930, 469]],
        dtype=torch.int32,
        device=device)  # n, 4(batch, ind_x, ind_y, ind_z)

    # test
    input_sp_tensor = SparseConvTensor(voxel_features, coordinates,

@@ -105,7 +115,7 @@ def test_make_sparse_convmodule(device):
        padding=0,
        conv_type='SubMConv3d',
        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
        order=('conv', 'norm', 'act')).to(device)
    assert isinstance(sparse_block0[0], SubMConv3d)
    assert sparse_block0[0].in_channels == 4
    assert sparse_block0[0].out_channels == 16

@@ -118,16 +128,18 @@ def test_make_sparse_convmodule(device):
    out_features = sparse_block0(input_sp_tensor)
    assert out_features.features.shape == torch.Size([4, 16])

    # device == mlu: not support inverse==1 yet
    if device != 'mlu':
        sparse_block1 = make_sparse_convmodule(
            4,
            16,
            3,
            'test1',
            stride=1,
            padding=0,
            conv_type='SparseInverseConv3d',
            norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
            order=('norm', 'act', 'conv')).to(device)
        assert isinstance(sparse_block1[2], SparseInverseConv3d)
        assert isinstance(sparse_block1[0], torch.nn.BatchNorm1d)
        assert isinstance(sparse_block1[1], torch.nn.ReLU)
The voxelization tests switch to the availability flags and add a dedicated MLU test. Updated hunks:

@@ -4,7 +4,7 @@
import pytest
import torch

from mmcv.ops import Voxelization
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE


def _get_voxel_points_indices(points, coors, voxel):

@@ -17,7 +17,7 @@ def _get_voxel_points_indices(points, coors, voxel):
    pytest.param(
        'cuda:0',
        marks=pytest.mark.skipif(
            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
])
def test_voxelization(device_type):
    voxel_size = [0.5, 0.5, 0.5]

@@ -63,8 +63,7 @@ def test_voxelization(device_type):
        assert num_points_current_voxel == expected_num_points_per_voxel[i]


@pytest.mark.skipif(not IS_CUDA_AVAILABLE, reason='requires CUDA support')
def test_voxelization_nondeterministic():
    voxel_size = [0.5, 0.5, 0.5]
    point_cloud_range = [0, -40, -3, 70.4, 40, 1]

@@ -140,6 +139,41 @@ def test_voxelization_nondeterministic():
    assert len(coors_set) == len(coors) == len(coors_all_set)


@pytest.mark.parametrize('device_type', [
    pytest.param(
        'mlu',
        marks=pytest.mark.skipif(
            not IS_MLU_AVAILABLE, reason='requires MLU support'))
])
def test_voxelization_mlu(device_type):
    voxel_size = [0.5, 0.5, 0.5]
    point_cloud_range = [0, -40, -3, 70.4, 40, 1]

    voxel_dict = np.load(
        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()
    expected_coors = voxel_dict['coors']
    expected_voxels = voxel_dict['voxels']
    expected_num_points_per_voxel = voxel_dict['num_points_per_voxel']
    points = voxel_dict['points']

    points = torch.tensor(points)
    max_num_points = 1000
    hard_voxelization = Voxelization(voxel_size, point_cloud_range,
                                     max_num_points)
    device = torch.device(device_type)

    # test hard_voxelization on mlu
    points = points.contiguous().to(device)
    coors, voxels, num_points_per_voxel = hard_voxelization.forward(points)
    coors = coors.cpu().detach().numpy()
    voxels = voxels.cpu().detach().numpy()
    num_points_per_voxel = num_points_per_voxel.cpu().detach().numpy()
    assert np.all(coors == expected_coors)
    assert np.all(voxels == expected_voxels)
    assert np.all(num_points_per_voxel == expected_num_points_per_voxel)


@pytest.mark.parametrize('device_type', [
    pytest.param(
        'npu',
...