Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
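With this change the compiled extension is built as mmcv._ext and the ops are imported from the renamed mmcv.ops package, as the tests below do. A minimal usage sketch of the new import path (the boxes and scores here are illustrative values only, mirroring the nms test):

import torch
from mmcv.ops import nms  # formerly mmcv.op

boxes = torch.tensor([[6., 3., 8., 7.], [3., 6., 9., 11.]])
scores = torch.tensor([0.6, 0.9])
# dets rows are [x1, y1, x2, y2, score]; inds are the kept box indices
dets, inds = nms(boxes, scores, iou_threshold=0.3, offset=0)

The concatenated test modules below cover ModulatedDeformConv2dPack, nms/soft_nms/nms_match, PSAMask, RoIAlign, RoIPool and SyncBatchNorm.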
import os
import numpy
import torch
cur_dir = os.path.dirname(os.path.abspath(__file__))
input_t = [[[[1., 2., 3.], [1., 2., 3.], [1., 2., 3.]]]]
output_t = [[[[0.5, 1.5, 2.5, 1.5], [1.0, 3.0, 5.0, 3.0], [1.0, 3.0, 5.0, 3.0],
[0.5, 1.5, 2.5, 1.5]]]]
input_grad = [[[[2., 2., 2.], [2., 2., 2.], [2., 2., 2.]]]]
dcn_w_grad = [[[[9., 9.], [9., 9.]]]]
dcn_offset_w_grad = [[[[-7.0, -4.0], [0.0, 0.0]]], [[[-9.0, 7.5], [-6.0,
5.0]]],
[[[-4.0, -7.0], [0.0, 0.0]]],
[[[-7.5, -9.0], [-5.0, -6.0]]],
[[[-7.0, -4.0], [-7.0, -4.0]]],
[[[-6.0, 5.0], [-9.0, 7.5]]],
[[[-4.0, -7.0], [-4.0, -7.0]]],
[[[-5.0, -6.0], [-7.5, -9.0]]], [[[10.5, 6.0], [7.0,
4.0]]],
[[[6.0, 10.5], [4.0, 7.0]]], [[[7.0, 4.0], [10.5, 6.0]]],
[[[4.0, 7.0], [6.0, 10.5]]]]
dcn_offset_b_grad = [
-3.0, -1.5, -3.0, -1.5, -3.0, -1.5, -3.0, -1.5, 4.5, 4.5, 4.5, 4.5
]
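# check ModulatedDeformConv2dPack output and gradients on CUDA against the reference values above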
class TestMdconv(object):
def _test_mdconv(self, dtype=torch.float):
if not torch.cuda.is_available():
return
from mmcv.ops import ModulatedDeformConv2dPack
input = torch.tensor(input_t).cuda().type(dtype)
input.requires_grad = True
dcn = ModulatedDeformConv2dPack(
1,
1,
kernel_size=(2, 2),
stride=1,
padding=1,
deform_groups=1,
bias=False).cuda()
dcn.weight.data.fill_(1.)
dcn.type(dtype)
output = dcn(input)
output.sum().backward()
assert numpy.allclose(output.cpu().detach().numpy(), output_t, 1e-2)
assert numpy.allclose(input.grad.cpu().detach().numpy(), input_grad,
1e-2)
assert numpy.allclose(dcn.weight.grad.cpu().detach().numpy(),
dcn_w_grad, 1e-2)
assert numpy.allclose(
dcn.conv_offset.weight.grad.cpu().detach().numpy(),
dcn_offset_w_grad, 1e-2)
assert numpy.allclose(dcn.conv_offset.bias.grad.cpu().detach().numpy(),
dcn_offset_b_grad, 1e-2)
def test_mdconv(self):
self._test_mdconv(torch.double)
self._test_mdconv(torch.float)
self._test_mdconv(torch.half)
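# ---- tests for the nms, soft_nms and nms_match ops (next test module) ----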
import numpy as np
import pytest
import torch
class Testnms(object):
def test_nms_allclose(self):
if not torch.cuda.is_available():
return
from mmcv.ops import nms
np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
[3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
dtype=np.float32)
np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)
np_inds = np.array([1, 0, 3])
np_dets = np.array([[3.0, 6.0, 9.0, 11.0, 0.9],
[6.0, 3.0, 8.0, 7.0, 0.6],
[1.0, 4.0, 13.0, 7.0, 0.2]])
boxes = torch.from_numpy(np_boxes)
scores = torch.from_numpy(np_scores)
dets, inds = nms(boxes, scores, iou_threshold=0.3, offset=0)
assert np.allclose(dets, np_dets) # test cpu
assert np.allclose(inds, np_inds) # test cpu
dets, inds = nms(
boxes.cuda(), scores.cuda(), iou_threshold=0.3, offset=0)
assert np.allclose(dets.cpu().numpy(), np_dets) # test gpu
assert np.allclose(inds.cpu().numpy(), np_inds) # test gpu
def test_softnms_allclose(self):
if not torch.cuda.is_available():
return
from mmcv.ops import soft_nms
np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
[3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
dtype=np.float32)
np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)
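        # expected detections and kept indices for each soft-NMS method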
np_output = {
'linear': {
'dets':
np.array(
[[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.6],
[3., 7., 10., 12., 0.29024392], [1., 4., 13., 7., 0.2]],
dtype=np.float32),
'inds':
np.array([1, 0, 2, 3], dtype=np.int64)
},
'gaussian': {
'dets':
np.array([[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.59630775],
[3., 7., 10., 12., 0.35275510],
[1., 4., 13., 7., 0.18650459]],
dtype=np.float32),
'inds':
np.array([1, 0, 2, 3], dtype=np.int64)
},
'naive': {
'dets':
np.array([[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.6],
[1., 4., 13., 7., 0.2]],
dtype=np.float32),
'inds':
np.array([1, 0, 3], dtype=np.int64)
}
}
boxes = torch.from_numpy(np_boxes)
scores = torch.from_numpy(np_scores)
configs = [[0.3, 0.5, 0.01, 'linear'], [0.3, 0.5, 0.01, 'gaussian'],
[0.3, 0.5, 0.01, 'naive']]
for iou, sig, mscore, m in configs:
dets, inds = soft_nms(
boxes,
scores,
iou_threshold=iou,
sigma=sig,
min_score=mscore,
method=m)
assert np.allclose(dets.cpu().numpy(), np_output[m]['dets'])
assert np.allclose(inds.cpu().numpy(), np_output[m]['inds'])
if torch.__version__ != 'parrots':
boxes = boxes.cuda()
scores = scores.cuda()
for iou, sig, mscore, m in configs:
dets, inds = soft_nms(
boxes,
scores,
iou_threshold=iou,
sigma=sig,
min_score=mscore,
method=m)
assert np.allclose(dets.cpu().numpy(), np_output[m]['dets'])
assert np.allclose(inds.cpu().numpy(), np_output[m]['inds'])
def test_nms_match(self):
if not torch.cuda.is_available():
return
from mmcv.ops import nms, nms_match
iou_thr = 0.6
# empty input
empty_dets = np.array([])
assert len(nms_match(empty_dets, iou_thr)) == 0
# non empty ndarray input
np_dets = np.array(
[[49.1, 32.4, 51.0, 35.9, 0.9], [49.3, 32.9, 51.0, 35.3, 0.9],
[35.3, 11.5, 39.9, 14.5, 0.4], [35.2, 11.7, 39.7, 15.7, 0.3]],
dtype=np.float32)
np_groups = nms_match(np_dets, iou_thr)
assert isinstance(np_groups[0], np.ndarray)
assert len(np_groups) == 2
tensor_dets = torch.from_numpy(np_dets)
boxes = tensor_dets[:, :4]
scores = tensor_dets[:, 4]
nms_keep_inds = nms(boxes.contiguous(), scores.contiguous(),
iou_thr)[1]
assert set([g[0].item()
for g in np_groups]) == set(nms_keep_inds.tolist())
# non empty tensor input
tensor_dets = torch.from_numpy(np_dets)
tensor_groups = nms_match(tensor_dets, iou_thr)
assert isinstance(tensor_groups[0], torch.Tensor)
for i in range(len(tensor_groups)):
assert np.equal(tensor_groups[i].numpy(), np_groups[i]).all()
# input of wrong shape
wrong_dets = np.zeros((2, 3))
with pytest.raises(AssertionError):
nms_match(wrong_dets, iou_thr)
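# ---- tests for the PSAMask op, collect and distribute modes (next test module) ----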
import numpy as np
import torch
import torch.nn as nn
class Loss(nn.Module):
def __init__(self):
super().__init__()
def forward(self, input, target):
input = input.view(-1)
target = target.view(-1)
return torch.mean(input - target)
class TestPSAMask(object):
def test_psa_mask_collect(self):
if not torch.cuda.is_available():
return
from mmcv.ops import PSAMask
test_loss = Loss()
input = np.fromfile(
'tests/data/for_psa_mask/psa_input.bin', dtype=np.float32)
output_collect = np.fromfile(
'tests/data/for_psa_mask/psa_output_collect.bin', dtype=np.float32)
input = input.reshape((4, 16, 8, 8))
output_collect = output_collect.reshape((4, 64, 8, 8))
label = torch.ones((4, 64, 8, 8))
input = torch.FloatTensor(input)
input.requires_grad = True
psamask_collect = PSAMask('collect', (4, 4))
# test collect cpu
test_output = psamask_collect(input)
loss = test_loss(test_output, label)
loss.backward()
test_output = test_output.detach().numpy()
assert np.allclose(test_output, output_collect)
assert test_output.shape == output_collect.shape
psamask_collect.cuda()
input = input.cuda()
label = label.cuda()
# test collect cuda
test_output = psamask_collect(input)
loss = test_loss(test_output, label)
loss.backward()
test_output = test_output.detach().cpu().numpy()
assert np.allclose(test_output, output_collect)
assert test_output.shape == output_collect.shape
def test_psa_mask_distribute(self):
if not torch.cuda.is_available():
return
from mmcv.ops import PSAMask
test_loss = Loss()
input = np.fromfile(
'tests/data/for_psa_mask/psa_input.bin', dtype=np.float32)
output_distribute = np.fromfile(
'tests/data/for_psa_mask/psa_output_distribute.bin',
dtype=np.float32)
input = input.reshape((4, 16, 8, 8))
output_distribute = output_distribute.reshape((4, 64, 8, 8))
label = torch.ones((4, 64, 8, 8))
input = torch.FloatTensor(input)
input.requires_grad = True
psamask_distribute = PSAMask('distribute', (4, 4))
# test distribute cpu
test_output = psamask_distribute(input)
loss = test_loss(test_output, label)
loss.backward()
test_output = test_output.detach().numpy()
assert np.allclose(test_output, output_distribute)
assert test_output.shape == output_distribute.shape
psamask_distribute.cuda()
input = input.cuda()
label = label.cuda()
# test distribute cuda
test_output = psamask_distribute(input)
loss = test_loss(test_output, label)
loss.backward()
test_output = test_output.detach().cpu().numpy()
assert np.allclose(test_output, output_distribute)
assert test_output.shape == output_distribute.shape
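# ---- tests for the RoIAlign op (next test module) ----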
import os
import numpy as np
import torch
_USING_PARROTS = True
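# use the parrots gradcheck when running under Parrots, otherwise fall back to torch.autograd.gradcheck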
try:
from parrots.autograd import gradcheck
except ImportError:
from torch.autograd import gradcheck
_USING_PARROTS = False
cur_dir = os.path.dirname(os.path.abspath(__file__))
inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),
([[[[1., 2.], [3., 4.]], [[4., 3.], [2.,
1.]]]], [[0., 0., 0., 1., 1.]]),
([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
[11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]
outputs = [([[[[1.0, 1.25], [1.5, 1.75]]]], [[[[3.0625, 0.4375],
[0.4375, 0.0625]]]]),
([[[[1.0, 1.25], [1.5, 1.75]],
[[4.0, 3.75], [3.5, 3.25]]]], [[[[3.0625, 0.4375],
[0.4375, 0.0625]],
[[3.0625, 0.4375],
[0.4375, 0.0625]]]]),
([[[[1.9375, 4.75], [7.5625, 10.375]]]],
[[[[0.47265625, 0.42968750, 0.42968750, 0.04296875],
[0.42968750, 0.39062500, 0.39062500, 0.03906250],
[0.42968750, 0.39062500, 0.39062500, 0.03906250],
[0.04296875, 0.03906250, 0.03906250, 0.00390625]]]])]
class TestRoiAlign(object):
def test_roialign_gradcheck(self):
if not torch.cuda.is_available():
return
from mmcv.ops import RoIAlign
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2
for case in inputs:
np_input = np.array(case[0])
np_rois = np.array(case[1])
x = torch.tensor(np_input, device='cuda', requires_grad=True)
rois = torch.tensor(np_rois, device='cuda')
froipool = RoIAlign((pool_h, pool_w), spatial_scale,
sampling_ratio)
if _USING_PARROTS:
pass
# gradcheck(froipool, (x, rois), no_grads=[rois])
else:
gradcheck(froipool, (x, rois), eps=1e-2, atol=1e-2)
    def _test_roialign_allclose(self, dtype=torch.float):
if not torch.cuda.is_available():
return
from mmcv.ops import roi_align
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2
for case, output in zip(inputs, outputs):
np_input = np.array(case[0])
np_rois = np.array(case[1])
np_output = np.array(output[0])
np_grad = np.array(output[1])
x = torch.tensor(
np_input, dtype=dtype, device='cuda', requires_grad=True)
rois = torch.tensor(np_rois, dtype=dtype, device='cuda')
output = roi_align(x, rois, (pool_h, pool_w), spatial_scale,
sampling_ratio, 'avg', True)
output.backward(torch.ones_like(output))
assert np.allclose(
output.data.type(torch.float).cpu().numpy(),
np_output,
atol=1e-3)
assert np.allclose(
x.grad.data.type(torch.float).cpu().numpy(),
np_grad,
atol=1e-3)
    def test_roialign_allclose(self):
        self._test_roialign_allclose(torch.float)
        self._test_roialign_allclose(torch.double)
        self._test_roialign_allclose(torch.half)
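# ---- tests for the RoIPool op (next test module) ----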
import os
import numpy as np
import torch
_USING_PARROTS = True
try:
from parrots.autograd import gradcheck
except ImportError:
from torch.autograd import gradcheck
_USING_PARROTS = False
cur_dir = os.path.dirname(os.path.abspath(__file__))
inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),
([[[[1., 2.], [3., 4.]], [[4., 3.], [2.,
1.]]]], [[0., 0., 0., 1., 1.]]),
([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
[11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]
outputs = [([[[[1., 2.], [3., 4.]]]], [[[[1., 1.], [1., 1.]]]]),
([[[[1., 2.], [3., 4.]], [[4., 3.], [2., 1.]]]], [[[[1., 1.],
[1., 1.]],
[[1., 1.],
[1., 1.]]]]),
([[[[4., 8.], [12., 16.]]]], [[[[0., 0., 0., 0.], [0., 1., 0., 1.],
[0., 0., 0., 0.], [0., 1., 0.,
1.]]]])]
class TestRoiPool(object):
def test_roipool_gradcheck(self):
if not torch.cuda.is_available():
return
from mmcv.ops import RoIPool
pool_h = 2
pool_w = 2
spatial_scale = 1.0
for case in inputs:
np_input = np.array(case[0])
np_rois = np.array(case[1])
x = torch.tensor(np_input, device='cuda', requires_grad=True)
rois = torch.tensor(np_rois, device='cuda')
froipool = RoIPool((pool_h, pool_w), spatial_scale)
if _USING_PARROTS:
pass
# gradcheck(froipool, (x, rois), no_grads=[rois])
else:
gradcheck(froipool, (x, rois), eps=1e-2, atol=1e-2)
def _test_roipool_allclose(self, dtype=torch.float):
if not torch.cuda.is_available():
return
from mmcv.ops import roi_pool
pool_h = 2
pool_w = 2
spatial_scale = 1.0
for case, output in zip(inputs, outputs):
np_input = np.array(case[0])
np_rois = np.array(case[1])
np_output = np.array(output[0])
np_grad = np.array(output[1])
x = torch.tensor(
np_input, dtype=dtype, device='cuda', requires_grad=True)
rois = torch.tensor(np_rois, dtype=dtype, device='cuda')
output = roi_pool(x, rois, (pool_h, pool_w), spatial_scale)
output.backward(torch.ones_like(output))
assert np.allclose(output.data.cpu().numpy(), np_output, 1e-3)
assert np.allclose(x.grad.data.cpu().numpy(), np_grad, 1e-3)
def test_roipool_allclose(self):
self._test_roipool_allclose(torch.double)
self._test_roipool_allclose(torch.float)
self._test_roipool_allclose(torch.half)
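# ---- tests for SyncBatchNorm (next test module; requires a 4-process Slurm job) ----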
import os
import re
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
class TestSyncBN(object):
def dist_init(self):
rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NTASKS'])
local_rank = int(os.environ['SLURM_LOCALID'])
node_list = str(os.environ['SLURM_NODELIST'])
node_parts = re.findall('[0-9]+', node_list)
host_ip = '{}.{}.{}.{}'.format(node_parts[1], node_parts[2],
node_parts[3], node_parts[4])
port = '12341'
init_method = 'tcp://{}:{}'.format(host_ip, port)
dist.init_process_group(
'nccl', init_method=init_method, world_size=world_size, rank=rank)
torch.cuda.set_device(local_rank)
def _test_syncbn_train(self, size=1, half=False):
if 'SLURM_NTASKS' not in os.environ or int(
os.environ['SLURM_NTASKS']) != 4:
            print('must be run under Slurm with 4 processes, e.g.\n'
                  'srun -p test --gres=gpu:4 -n4')
return
else:
print('Running syncbn test')
from mmcv.ops import SyncBatchNorm
assert size in (1, 2, 4)
if not dist.is_initialized():
self.dist_init()
rank = dist.get_rank()
torch.manual_seed(9)
torch.cuda.manual_seed(9)
self.x = torch.rand(16, 3, 2, 3).cuda()
self.y_bp = torch.rand(16, 3, 2, 3).cuda()
if half:
self.x = self.x.half()
self.y_bp = self.y_bp.half()
dist.broadcast(self.x, src=0)
dist.broadcast(self.y_bp, src=0)
torch.cuda.synchronize()
if size == 1:
groups = [None, None, None, None]
groups[0] = dist.new_group([0])
groups[1] = dist.new_group([1])
groups[2] = dist.new_group([2])
groups[3] = dist.new_group([3])
group = groups[rank]
elif size == 2:
groups = [None, None, None, None]
groups[0] = groups[1] = dist.new_group([0, 1])
groups[2] = groups[3] = dist.new_group([2, 3])
group = groups[rank]
elif size == 4:
group = dist.group.WORLD
syncbn = SyncBatchNorm(3, group=group).cuda()
syncbn.weight.data[0] = 0.2
syncbn.weight.data[1] = 0.5
syncbn.weight.data[2] = 0.7
syncbn.train()
bn = nn.BatchNorm2d(3).cuda()
bn.weight.data[0] = 0.2
bn.weight.data[1] = 0.5
bn.weight.data[2] = 0.7
bn.train()
sx = self.x[rank * 4:rank * 4 + 4]
sx.requires_grad_()
sy = syncbn(sx)
sy.backward(self.y_bp[rank * 4:rank * 4 + 4])
smean = syncbn.running_mean
svar = syncbn.running_var
sx_grad = sx.grad
sw_grad = syncbn.weight.grad
sb_grad = syncbn.bias.grad
if size == 1:
x = self.x[rank * 4:rank * 4 + 4]
y_bp = self.y_bp[rank * 4:rank * 4 + 4]
elif size == 2:
x = self.x[rank // 2 * 8:rank // 2 * 8 + 8]
y_bp = self.y_bp[rank // 2 * 8:rank // 2 * 8 + 8]
elif size == 4:
x = self.x
y_bp = self.y_bp
x.requires_grad_()
y = bn(x)
y.backward(y_bp)
if size == 2:
y = y[rank % 2 * 4:rank % 2 * 4 + 4]
elif size == 4:
y = y[rank * 4:rank * 4 + 4]
mean = bn.running_mean
var = bn.running_var
if size == 1:
x_grad = x.grad
w_grad = bn.weight.grad
b_grad = bn.bias.grad
elif size == 2:
x_grad = x.grad[rank % 2 * 4:rank % 2 * 4 + 4]
w_grad = bn.weight.grad / 2
b_grad = bn.bias.grad / 2
elif size == 4:
x_grad = x.grad[rank * 4:rank * 4 + 4]
w_grad = bn.weight.grad / 4
b_grad = bn.bias.grad / 4
assert np.allclose(mean.data.cpu().numpy(),
smean.data.cpu().numpy(), 1e-3)
assert np.allclose(var.data.cpu().numpy(),
svar.data.cpu().numpy(), 1e-3)
assert np.allclose(y.data.cpu().numpy(), sy.data.cpu().numpy(), 1e-3)
assert np.allclose(w_grad.data.cpu().numpy(),
sw_grad.data.cpu().numpy(), 1e-3)
assert np.allclose(b_grad.data.cpu().numpy(),
sb_grad.data.cpu().numpy(), 1e-3)
assert np.allclose(x_grad.data.cpu().numpy(),
sx_grad.data.cpu().numpy(), 1e-2)
def test_syncbn_1(self):
self._test_syncbn_train(size=1)
def test_syncbn_2(self):
self._test_syncbn_train(size=2)
def test_syncbn_4(self):
self._test_syncbn_train(size=4)
def test_syncbn_1_half(self):
self._test_syncbn_train(size=1, half=True)
def test_syncbn_2_half(self):
self._test_syncbn_train(size=2, half=True)
def test_syncbn_4_half(self):
self._test_syncbn_train(size=4, half=True)