Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
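With this change the compiled extension is built as mmcv._ext and the ops are imported from the renamed mmcv.ops package, as the tests below do. A minimal usage sketch of the new import path (the boxes and scores here are illustrative values only, mirroring the nms test):

import torch
from mmcv.ops import nms  # formerly mmcv.op

boxes = torch.tensor([[6., 3., 8., 7.], [3., 6., 9., 11.]])
scores = torch.tensor([0.6, 0.9])
# dets rows are [x1, y1, x2, y2, score]; inds are the kept box indices
dets, inds = nms(boxes, scores, iou_threshold=0.3, offset=0)

The concatenated test modules below cover ModulatedDeformConv2dPack, nms/soft_nms/nms_match, PSAMask, RoIAlign, RoIPool and SyncBatchNorm.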
import os
import numpy
import torch
cur_dir = os.path.dirname(os.path.abspath(__file__))
input_t = [[[[1., 2., 3.], [1., 2., 3.], [1., 2., 3.]]]]
output_t = [[[[0.5, 1.5, 2.5, 1.5], [1.0, 3.0, 5.0, 3.0], [1.0, 3.0, 5.0, 3.0],
[0.5, 1.5, 2.5, 1.5]]]]
input_grad = [[[[2., 2., 2.], [2., 2., 2.], [2., 2., 2.]]]]
dcn_w_grad = [[[[9., 9.], [9., 9.]]]]
dcn_offset_w_grad = [[[[-7.0, -4.0], [0.0, 0.0]]], [[[-9.0, 7.5], [-6.0,
5.0]]],
[[[-4.0, -7.0], [0.0, 0.0]]],
[[[-7.5, -9.0], [-5.0, -6.0]]],
[[[-7.0, -4.0], [-7.0, -4.0]]],
[[[-6.0, 5.0], [-9.0, 7.5]]],
[[[-4.0, -7.0], [-4.0, -7.0]]],
[[[-5.0, -6.0], [-7.5, -9.0]]], [[[10.5, 6.0], [7.0,
4.0]]],
[[[6.0, 10.5], [4.0, 7.0]]], [[[7.0, 4.0], [10.5, 6.0]]],
[[[4.0, 7.0], [6.0, 10.5]]]]
dcn_offset_b_grad = [
-3.0, -1.5, -3.0, -1.5, -3.0, -1.5, -3.0, -1.5, 4.5, 4.5, 4.5, 4.5
]
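# check ModulatedDeformConv2dPack output and gradients on CUDA against the reference values above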
class TestMdconv(object):
def _test_mdconv(self, dtype=torch.float):
if not torch.cuda.is_available():
return
from mmcv.ops import ModulatedDeformConv2dPack
input = torch.tensor(input_t).cuda().type(dtype)
input.requires_grad = True
dcn = ModulatedDeformConv2dPack(
1,
1,
kernel_size=(2, 2),
stride=1,
padding=1,
deform_groups=1,
bias=False).cuda()
dcn.weight.data.fill_(1.)
dcn.type(dtype)
output = dcn(input)
output.sum().backward()
assert numpy.allclose(output.cpu().detach().numpy(), output_t, 1e-2)
assert numpy.allclose(input.grad.cpu().detach().numpy(), input_grad,
1e-2)
assert numpy.allclose(dcn.weight.grad.cpu().detach().numpy(),
dcn_w_grad, 1e-2)
assert numpy.allclose(
dcn.conv_offset.weight.grad.cpu().detach().numpy(),
dcn_offset_w_grad, 1e-2)
assert numpy.allclose(dcn.conv_offset.bias.grad.cpu().detach().numpy(),
dcn_offset_b_grad, 1e-2)
def test_mdconv(self):
self._test_mdconv(torch.double)
self._test_mdconv(torch.float)
self._test_mdconv(torch.half)
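# ---- tests for the nms, soft_nms and nms_match ops (next test module) ----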
import numpy as np
import pytest
import torch
class Testnms(object):
def test_nms_allclose(self):
if not torch.cuda.is_available():
return
from mmcv.ops import nms
np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
[3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
dtype=np.float32)
np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)
np_inds = np.array([1, 0, 3])
np_dets = np.array([[3.0, 6.0, 9.0, 11.0, 0.9],
[6.0, 3.0, 8.0, 7.0, 0.6],
[1.0, 4.0, 13.0, 7.0, 0.2]])
boxes = torch.from_numpy(np_boxes)
scores = torch.from_numpy(np_scores)
dets, inds = nms(boxes, scores, iou_threshold=0.3, offset=0)
assert np.allclose(dets, np_dets) # test cpu
assert np.allclose(inds, np_inds) # test cpu
dets, inds = nms(
boxes.cuda(), scores.cuda(), iou_threshold=0.3, offset=0)
assert np.allclose(dets.cpu().numpy(), np_dets) # test gpu
assert np.allclose(inds.cpu().numpy(), np_inds) # test gpu
def test_softnms_allclose(self):
if not torch.cuda.is_available():
return
from mmcv.ops import soft_nms
np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
[3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
dtype=np.float32)
np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)
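        # expected detections and kept indices for each soft-NMS method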
np_output = {
'linear': {
'dets':
np.array(
[[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.6],
[3., 7., 10., 12., 0.29024392], [1., 4., 13., 7., 0.2]],
dtype=np.float32),
'inds':
np.array([1, 0, 2, 3], dtype=np.int64)
},
'gaussian': {
'dets':
np.array([[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.59630775],
[3., 7., 10., 12., 0.35275510],
[1., 4., 13., 7., 0.18650459]],
dtype=np.float32),
'inds':
np.array([1, 0, 2, 3], dtype=np.int64)
},
'naive': {
'dets':
np.array([[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.6],
[1., 4., 13., 7., 0.2]],
dtype=np.float32),
'inds':
np.array([1, 0, 3], dtype=np.int64)
}
}
boxes = torch.from_numpy(np_boxes)
scores = torch.from_numpy(np_scores)
configs = [[0.3, 0.5, 0.01, 'linear'], [0.3, 0.5, 0.01, 'gaussian'],
[0.3, 0.5, 0.01, 'naive']]
for iou, sig, mscore, m in configs:
dets, inds = soft_nms(
boxes,
scores,
iou_threshold=iou,
sigma=sig,
min_score=mscore,
method=m)
assert np.allclose(dets.cpu().numpy(), np_output[m]['dets'])
assert np.allclose(inds.cpu().numpy(), np_output[m]['inds'])
if torch.__version__ != 'parrots':
boxes = boxes.cuda()
scores = scores.cuda()
for iou, sig, mscore, m in configs:
dets, inds = soft_nms(
boxes,
scores,
iou_threshold=iou,
sigma=sig,
min_score=mscore,
method=m)
assert np.allclose(dets.cpu().numpy(), np_output[m]['dets'])
assert np.allclose(inds.cpu().numpy(), np_output[m]['inds'])
def test_nms_match(self):
if not torch.cuda.is_available():
return
from mmcv.ops import nms, nms_match
iou_thr = 0.6
# empty input
empty_dets = np.array([])
assert len(nms_match(empty_dets, iou_thr)) == 0
# non empty ndarray input
np_dets = np.array(
[[49.1, 32.4, 51.0, 35.9, 0.9], [49.3, 32.9, 51.0, 35.3, 0.9],
[35.3, 11.5, 39.9, 14.5, 0.4], [35.2, 11.7, 39.7, 15.7, 0.3]],
dtype=np.float32)
np_groups = nms_match(np_dets, iou_thr)
assert isinstance(np_groups[0], np.ndarray)
assert len(np_groups) == 2
tensor_dets = torch.from_numpy(np_dets)
boxes = tensor_dets[:, :4]
scores = tensor_dets[:, 4]
nms_keep_inds = nms(boxes.contiguous(), scores.contiguous(),
iou_thr)[1]
assert set([g[0].item()
for g in np_groups]) == set(nms_keep_inds.tolist())
# non empty tensor input
tensor_dets = torch.from_numpy(np_dets)
tensor_groups = nms_match(tensor_dets, iou_thr)
assert isinstance(tensor_groups[0], torch.Tensor)
for i in range(len(tensor_groups)):
assert np.equal(tensor_groups[i].numpy(), np_groups[i]).all()
# input of wrong shape
wrong_dets = np.zeros((2, 3))
with pytest.raises(AssertionError):
nms_match(wrong_dets, iou_thr)
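# ---- tests for the PSAMask op, collect and distribute modes (next test module) ----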
import numpy as np
import torch
import torch.nn as nn
class Loss(nn.Module):
def __init__(self):
super().__init__()
def forward(self, input, target):
input = input.view(-1)
target = target.view(-1)
return torch.mean(input - target)
class TestPSAMask(object):
def test_psa_mask_collect(self):
if not torch.cuda.is_available():
return
from mmcv.ops import PSAMask
test_loss = Loss()
input = np.fromfile(
'tests/data/for_psa_mask/psa_input.bin', dtype=np.float32)
output_collect = np.fromfile(
'tests/data/for_psa_mask/psa_output_collect.bin', dtype=np.float32)
input = input.reshape((4, 16, 8, 8))
output_collect = output_collect.reshape((4, 64, 8, 8))
label = torch.ones((4, 64, 8, 8))
input = torch.FloatTensor(input)
input.requires_grad = True
psamask_collect = PSAMask('collect', (4, 4))
# test collect cpu
test_output = psamask_collect(input)
loss = test_loss(test_output, label)
loss.backward()
test_output = test_output.detach().numpy()
assert np.allclose(test_output, output_collect)
assert test_output.shape == output_collect.shape
psamask_collect.cuda()
input = input.cuda()
label = label.cuda()
# test collect cuda
test_output = psamask_collect(input)
loss = test_loss(test_output, label)
loss.backward()
test_output = test_output.detach().cpu().numpy()
assert np.allclose(test_output, output_collect)
assert test_output.shape == output_collect.shape
def test_psa_mask_distribute(self):
if not torch.cuda.is_available():
return
from mmcv.ops import PSAMask
test_loss = Loss()
input = np.fromfile(
'tests/data/for_psa_mask/psa_input.bin', dtype=np.float32)
output_distribute = np.fromfile(
'tests/data/for_psa_mask/psa_output_distribute.bin',
dtype=np.float32)
input = input.reshape((4, 16, 8, 8))
output_distribute = output_distribute.reshape((4, 64, 8, 8))
label = torch.ones((4, 64, 8, 8))
input = torch.FloatTensor(input)
input.requires_grad = True
psamask_distribute = PSAMask('distribute', (4, 4))
# test distribute cpu
test_output = psamask_distribute(input)
loss = test_loss(test_output, label)
loss.backward()
test_output = test_output.detach().numpy()
assert np.allclose(test_output, output_distribute)
assert test_output.shape == output_distribute.shape
psamask_distribute.cuda()
input = input.cuda()
label = label.cuda()
# test distribute cuda
test_output = psamask_distribute(input)
loss = test_loss(test_output, label)
loss.backward()
test_output = test_output.detach().cpu().numpy()
assert np.allclose(test_output, output_distribute)
assert test_output.shape == output_distribute.shape
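# ---- tests for the RoIAlign op (next test module) ----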
import os
import numpy as np
import torch
_USING_PARROTS = True
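# use the parrots gradcheck when running under Parrots, otherwise fall back to torch.autograd.gradcheck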
try:
from parrots.autograd import gradcheck
except ImportError:
from torch.autograd import gradcheck
_USING_PARROTS = False
cur_dir = os.path.dirname(os.path.abspath(__file__))
inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),
([[[[1., 2.], [3., 4.]], [[4., 3.], [2.,
1.]]]], [[0., 0., 0., 1., 1.]]),
([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
[11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]
outputs = [([[[[1.0, 1.25], [1.5, 1.75]]]], [[[[3.0625, 0.4375],
[0.4375, 0.0625]]]]),
([[[[1.0, 1.25], [1.5, 1.75]],
[[4.0, 3.75], [3.5, 3.25]]]], [[[[3.0625, 0.4375],
[0.4375, 0.0625]],
[[3.0625, 0.4375],
[0.4375, 0.0625]]]]),
([[[[1.9375, 4.75], [7.5625, 10.375]]]],
[[[[0.47265625, 0.42968750, 0.42968750, 0.04296875],
[0.42968750, 0.39062500, 0.39062500, 0.03906250],
[0.42968750, 0.39062500, 0.39062500, 0.03906250],
[0.04296875, 0.03906250, 0.03906250, 0.00390625]]]])]
class TestRoiAlign(object):
def test_roialign_gradcheck(self):
if not torch.cuda.is_available():
return
from mmcv.ops import RoIAlign
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2
for case in inputs:
np_input = np.array(case[0])
np_rois = np.array(case[1])
x = torch.tensor(np_input, device='cuda', requires_grad=True)
rois = torch.tensor(np_rois, device='cuda')
froipool = RoIAlign((pool_h, pool_w), spatial_scale,
sampling_ratio)
if _USING_PARROTS:
pass
# gradcheck(froipool, (x, rois), no_grads=[rois])
else:
gradcheck(froipool, (x, rois), eps=1e-2, atol=1e-2)
    def _test_roialign_allclose(self, dtype=torch.float):
if not torch.cuda.is_available():
return
from mmcv.ops import roi_align
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2
for case, output in zip(inputs, outputs):
np_input = np.array(case[0])
np_rois = np.array(case[1])
np_output = np.array(output[0])
np_grad = np.array(output[1])
x = torch.tensor(
np_input, dtype=dtype, device='cuda', requires_grad=True)
rois = torch.tensor(np_rois, dtype=dtype, device='cuda')
output = roi_align(x, rois, (pool_h, pool_w), spatial_scale,
sampling_ratio, 'avg', True)
output.backward(torch.ones_like(output))
assert np.allclose(
output.data.type(torch.float).cpu().numpy(),
np_output,
atol=1e-3)
assert np.allclose(
x.grad.data.type(torch.float).cpu().numpy(),
np_grad,
atol=1e-3)
    def test_roialign_allclose(self):
        self._test_roialign_allclose(torch.float)
        self._test_roialign_allclose(torch.double)
        self._test_roialign_allclose(torch.half)
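# ---- tests for the RoIPool op (next test module) ----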
import os
import numpy as np
import torch
_USING_PARROTS = True
try:
from parrots.autograd import gradcheck
except ImportError:
from torch.autograd import gradcheck
_USING_PARROTS = False
cur_dir = os.path.dirname(os.path.abspath(__file__))
inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),
([[[[1., 2.], [3., 4.]], [[4., 3.], [2.,
1.]]]], [[0., 0., 0., 1., 1.]]),
([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
[11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]
outputs = [([[[[1., 2.], [3., 4.]]]], [[[[1., 1.], [1., 1.]]]]),
([[[[1., 2.], [3., 4.]], [[4., 3.], [2., 1.]]]], [[[[1., 1.],
[1., 1.]],
[[1., 1.],
[1., 1.]]]]),
([[[[4., 8.], [12., 16.]]]], [[[[0., 0., 0., 0.], [0., 1., 0., 1.],
[0., 0., 0., 0.], [0., 1., 0.,
1.]]]])]
class TestRoiPool(object):
def test_roipool_gradcheck(self):
if not torch.cuda.is_available():
return
from mmcv.ops import RoIPool
pool_h = 2
pool_w = 2
spatial_scale = 1.0
for case in inputs:
np_input = np.array(case[0])
np_rois = np.array(case[1])
x = torch.tensor(np_input, device='cuda', requires_grad=True)
rois = torch.tensor(np_rois, device='cuda')
froipool = RoIPool((pool_h, pool_w), spatial_scale)
if _USING_PARROTS:
pass
# gradcheck(froipool, (x, rois), no_grads=[rois])
else:
gradcheck(froipool, (x, rois), eps=1e-2, atol=1e-2)
def _test_roipool_allclose(self, dtype=torch.float):
if not torch.cuda.is_available():
return
from mmcv.ops import roi_pool
pool_h = 2
pool_w = 2
spatial_scale = 1.0
for case, output in zip(inputs, outputs):
np_input = np.array(case[0])
np_rois = np.array(case[1])
np_output = np.array(output[0])
np_grad = np.array(output[1])
x = torch.tensor(
np_input, dtype=dtype, device='cuda', requires_grad=True)
rois = torch.tensor(np_rois, dtype=dtype, device='cuda')
output = roi_pool(x, rois, (pool_h, pool_w), spatial_scale)
output.backward(torch.ones_like(output))
assert np.allclose(output.data.cpu().numpy(), np_output, 1e-3)
assert np.allclose(x.grad.data.cpu().numpy(), np_grad, 1e-3)
def test_roipool_allclose(self):
self._test_roipool_allclose(torch.double)
self._test_roipool_allclose(torch.float)
self._test_roipool_allclose(torch.half)
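# ---- tests for SyncBatchNorm (next test module; requires a 4-process Slurm job) ----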
import os
import re
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
class TestSyncBN(object):
def dist_init(self):
rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NTASKS'])
local_rank = int(os.environ['SLURM_LOCALID'])
node_list = str(os.environ['SLURM_NODELIST'])
node_parts = re.findall('[0-9]+', node_list)
host_ip = '{}.{}.{}.{}'.format(node_parts[1], node_parts[2],
node_parts[3], node_parts[4])
port = '12341'
init_method = 'tcp://{}:{}'.format(host_ip, port)
dist.init_process_group(
'nccl', init_method=init_method, world_size=world_size, rank=rank)
torch.cuda.set_device(local_rank)
def _test_syncbn_train(self, size=1, half=False):
if 'SLURM_NTASKS' not in os.environ or int(
os.environ['SLURM_NTASKS']) != 4:
            print('must be run under Slurm with 4 processes, e.g.\n'
                  'srun -p test --gres=gpu:4 -n4')
return
else:
print('Running syncbn test')
from mmcv.ops import SyncBatchNorm
assert size in (1, 2, 4)
if not dist.is_initialized():
self.dist_init()
rank = dist.get_rank()
torch.manual_seed(9)
torch.cuda.manual_seed(9)
self.x = torch.rand(16, 3, 2, 3).cuda()
self.y_bp = torch.rand(16, 3, 2, 3).cuda()
if half:
self.x = self.x.half()
self.y_bp = self.y_bp.half()
dist.broadcast(self.x, src=0)
dist.broadcast(self.y_bp, src=0)
torch.cuda.synchronize()
if size == 1:
groups = [None, None, None, None]
groups[0] = dist.new_group([0])
groups[1] = dist.new_group([1])
groups[2] = dist.new_group([2])
groups[3] = dist.new_group([3])
group = groups[rank]
elif size == 2:
groups = [None, None, None, None]
groups[0] = groups[1] = dist.new_group([0, 1])
groups[2] = groups[3] = dist.new_group([2, 3])
group = groups[rank]
elif size == 4:
group = dist.group.WORLD
syncbn = SyncBatchNorm(3, group=group).cuda()
syncbn.weight.data[0] = 0.2
syncbn.weight.data[1] = 0.5
syncbn.weight.data[2] = 0.7
syncbn.train()
bn = nn.BatchNorm2d(3).cuda()
bn.weight.data[0] = 0.2
bn.weight.data[1] = 0.5
bn.weight.data[2] = 0.7
bn.train()
sx = self.x[rank * 4:rank * 4 + 4]
sx.requires_grad_()
sy = syncbn(sx)
sy.backward(self.y_bp[rank * 4:rank * 4 + 4])
smean = syncbn.running_mean
svar = syncbn.running_var
sx_grad = sx.grad
sw_grad = syncbn.weight.grad
sb_grad = syncbn.bias.grad
if size == 1:
x = self.x[rank * 4:rank * 4 + 4]
y_bp = self.y_bp[rank * 4:rank * 4 + 4]
elif size == 2:
x = self.x[rank // 2 * 8:rank // 2 * 8 + 8]
y_bp = self.y_bp[rank // 2 * 8:rank // 2 * 8 + 8]
elif size == 4:
x = self.x
y_bp = self.y_bp
x.requires_grad_()
y = bn(x)
y.backward(y_bp)
if size == 2:
y = y[rank % 2 * 4:rank % 2 * 4 + 4]
elif size == 4:
y = y[rank * 4:rank * 4 + 4]
mean = bn.running_mean
var = bn.running_var
if size == 1:
x_grad = x.grad
w_grad = bn.weight.grad
b_grad = bn.bias.grad
elif size == 2:
x_grad = x.grad[rank % 2 * 4:rank % 2 * 4 + 4]
w_grad = bn.weight.grad / 2
b_grad = bn.bias.grad / 2
elif size == 4:
x_grad = x.grad[rank * 4:rank * 4 + 4]
w_grad = bn.weight.grad / 4
b_grad = bn.bias.grad / 4
assert np.allclose(mean.data.cpu().numpy(),
smean.data.cpu().numpy(), 1e-3)
assert np.allclose(var.data.cpu().numpy(),
svar.data.cpu().numpy(), 1e-3)
assert np.allclose(y.data.cpu().numpy(), sy.data.cpu().numpy(), 1e-3)
assert np.allclose(w_grad.data.cpu().numpy(),
sw_grad.data.cpu().numpy(), 1e-3)
assert np.allclose(b_grad.data.cpu().numpy(),
sb_grad.data.cpu().numpy(), 1e-3)
assert np.allclose(x_grad.data.cpu().numpy(),
sx_grad.data.cpu().numpy(), 1e-2)
def test_syncbn_1(self):
self._test_syncbn_train(size=1)
def test_syncbn_2(self):
self._test_syncbn_train(size=2)
def test_syncbn_4(self):
self._test_syncbn_train(size=4)
def test_syncbn_1_half(self):
self._test_syncbn_train(size=1, half=True)
def test_syncbn_2_half(self):
self._test_syncbn_train(size=2, half=True)
def test_syncbn_4_half(self):
self._test_syncbn_train(size=4, half=True)