Commit af225a8a authored by pedrofreire's avatar pedrofreire Committed by Francisco Massa
Browse files

Simplify and organize test_ops. (#1551)

* Simplify and organize test_ops.

We perform the following:

- Simplify the functions slow_roi_pooling, slow_ps_roi_pooling, slow_ps_roi_align and bilinear_interpolate (including finding and removing a semi-bug in slow_ps_roi_pooling, which used bin_w instead of bin_h);
- Wrote a slow_roi_align function, that was missing;
- Create a base class testing all combinations of forward/backward, cpu/cuda, contiguous/non-contiguous;
- Organize all testing inside the base class with _test_forward and _test_backward (which can be easily overridden if a particular op needs something different); an Op class then only needs to implement fn, get_script_fn, and expected_fn.

A few points:
- We are using the same inputs for all tests, and not trying all possible inputs in the domain of a given operation. One improvement would be to test more diverse inputs, and to personalize the inputs for some ops (e.g. different inputs for pooling ops and align ops).
- Running all tests is quite slow (~1 min only for CPU tests), so that can possibly be improved.

* Reduce input size used in gradcheck.

gradcheck can be quite costly, and it was causing OOM errors and making
the tests slow. By reducing the size of the input, the test speed is
down to 3 seconds for the CPU tests.

Other points:
- We remove an unused namedtuple;
- We inherit from object for better Python 2 compatibility;
- We remove a hardcoded pool_size from the TorchScript functions, and
add it as a parameter instead.

* Replace Tensor by torch.Tensor in type annotations.

This should fix lint errors.
parent 4897402a
from __future__ import division
import numpy as np import numpy as np
import torch import torch
from torch.autograd import gradcheck from torch.autograd import gradcheck
...@@ -8,1168 +9,305 @@ from itertools import product ...@@ -8,1168 +9,305 @@ from itertools import product
import unittest import unittest
class RoIOpTester(object):
    """Mixin driving forward/backward tests for an RoI op on CPU/CUDA,
    contiguous/non-contiguous inputs.

    Subclasses (which must also inherit ``unittest.TestCase``) implement:
      - ``fn``: run the actual torchvision op;
      - ``get_script_fn``: return a TorchScript-compiled version of the op;
      - ``expected_fn``: a slow, readable reference implementation.
    """

    @classmethod
    def setUpClass(cls):
        # float64 keeps gradcheck's finite differences accurate.
        cls.dtype = torch.float64

    def test_forward_cpu_contiguous(self):
        self._test_forward(device=torch.device('cpu'), contiguous=True)

    def test_forward_cpu_non_contiguous(self):
        self._test_forward(device=torch.device('cpu'), contiguous=False)

    def test_backward_cpu_contiguous(self):
        self._test_backward(device=torch.device('cpu'), contiguous=True)

    def test_backward_cpu_non_contiguous(self):
        self._test_backward(device=torch.device('cpu'), contiguous=False)

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_forward_cuda_contiguous(self):
        self._test_forward(device=torch.device('cuda'), contiguous=True)

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_forward_cuda_non_contiguous(self):
        self._test_forward(device=torch.device('cuda'), contiguous=False)

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_backward_cuda_contiguous(self):
        self._test_backward(device=torch.device('cuda'), contiguous=True)

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_backward_cuda_non_contiguous(self):
        self._test_backward(device=torch.device('cuda'), contiguous=False)

    def _test_forward(self, device, contiguous):
        """Compare the op's output against ``expected_fn`` on a fixed input."""
        pool_size = 5
        # n_channels % (pool_size ** 2) == 0 is required for the PS operations.
        n_channels = 2 * (pool_size ** 2)
        x = torch.rand(2, n_channels, 10, 10, dtype=self.dtype, device=device)
        if not contiguous:
            # Swapping H and W via permute yields a non-contiguous view.
            x = x.permute(0, 1, 3, 2)
        rois = torch.tensor([[0, 0, 0, 9, 9],  # format is (xyxy)
                             [0, 0, 5, 4, 9],
                             [0, 5, 5, 9, 9],
                             [1, 0, 0, 9, 9]],
                            dtype=self.dtype, device=device)

        pool_h, pool_w = pool_size, pool_size
        y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1)
        gt_y = self.expected_fn(x, rois, pool_h, pool_w, spatial_scale=1,
                                sampling_ratio=-1, device=device, dtype=self.dtype)

        self.assertTrue(torch.allclose(gt_y, y))

    def _test_backward(self, device, contiguous):
        """Run gradcheck on both the eager op and its scripted version.

        Inputs are kept small: gradcheck is O(numel) forward passes, and a
        larger input previously caused OOMs and very slow tests.
        """
        pool_size = 2
        x = torch.rand(1, 2 * (pool_size ** 2), 5, 5, dtype=self.dtype, device=device, requires_grad=True)
        if not contiguous:
            x = x.permute(0, 1, 3, 2)
        rois = torch.tensor([[0, 0, 0, 4, 4],  # format is (xyxy)
                             [0, 0, 2, 3, 4],
                             [0, 2, 2, 4, 4]],
                            dtype=self.dtype, device=device)

        def func(z):
            return self.fn(z, rois, pool_size, pool_size, spatial_scale=1, sampling_ratio=1)

        script_func = self.get_script_fn(rois, pool_size)

        self.assertTrue(gradcheck(func, (x,)))
        self.assertTrue(gradcheck(script_func, (x,)))

    # --- hooks for subclasses ------------------------------------------------
    def fn(*args, **kwargs):
        pass

    def get_script_fn(*args, **kwargs):
        pass

    def expected_fn(*args, **kwargs):
        pass
def test_roi_align_gradcheck_cpu(self):
dtype = torch.float64
device = torch.device('cpu')
m = ops.RoIAlign((5, 5), 0.5, 1).to(dtype=dtype, device=device)
x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True)
rois = self.rois.to(device=device, dtype=dtype)
class RoIPoolTester(RoIOpTester, unittest.TestCase):
    """RoIPool tests: max-pooling over integer-rounded RoI bins."""

    def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
        # sampling_ratio is accepted for interface uniformity; RoIPool ignores it.
        return ops.RoIPool((pool_h, pool_w), spatial_scale)(x, rois)

    def get_script_fn(self, rois, pool_size):
        """Return a single-tensor callable over a scripted roi_pool (for gradcheck)."""
        @torch.jit.script
        def script_fn(input, rois, pool_size):
            # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor
            return ops.roi_pool(input, rois, pool_size, 1.0)[0]
        return lambda x: script_fn(x, rois, pool_size)

    def expected_fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1,
                    device=None, dtype=torch.float64):
        """Slow reference RoIPool: per-RoI max over floor/ceil-bounded bins."""
        if device is None:
            device = torch.device("cpu")

        n_channels = x.size(1)
        y = torch.zeros(rois.size(0), n_channels, pool_h, pool_w, dtype=dtype, device=device)

        def get_slice(k, block):
            # k-th of pool_{h,w} bins over a span of `block`-sized steps.
            return slice(int(np.floor(k * block)), int(np.ceil((k + 1) * block)))

        for roi_idx, roi in enumerate(rois):
            batch_idx = int(roi[0])
            # roi layout is (batch_idx, x1, y1, x2, y2); j is width, i is height.
            j_begin, i_begin, j_end, i_end = (int(round(c.item() * spatial_scale)) for c in roi[1:])
            roi_x = x[batch_idx, :, i_begin:i_end + 1, j_begin:j_end + 1]

            roi_h, roi_w = roi_x.shape[-2:]
            bin_h = roi_h / pool_h
            bin_w = roi_w / pool_w

            for i in range(0, pool_h):
                for j in range(0, pool_w):
                    bin_x = roi_x[:, get_slice(i, bin_h), get_slice(j, bin_w)]
                    if bin_x.numel() > 0:
                        # Empty bins (possible for degenerate RoIs) stay zero.
                        y[roi_idx, :, i, j] = bin_x.reshape(n_channels, -1).max(dim=1)[0]
        return y
return m(input, rois)
class PSRoIPoolTester(RoIOpTester, unittest.TestCase):
    """PSRoIPool tests: position-sensitive average pooling over RoI bins."""

    def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
        # Pass spatial_scale through instead of hard-coding 1, matching
        # RoIPoolTester.fn (the tests currently always use scale 1, so this
        # is behavior-compatible but no longer silently ignores the argument).
        return ops.PSRoIPool((pool_h, pool_w), spatial_scale)(x, rois)

    def get_script_fn(self, rois, pool_size):
        """Return a single-tensor callable over a scripted ps_roi_pool (for gradcheck)."""
        @torch.jit.script
        def script_fn(input, rois, pool_size):
            # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor
            return ops.ps_roi_pool(input, rois, pool_size, 1.0)[0]
        return lambda x: script_fn(x, rois, pool_size)

    def expected_fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1,
                    device=None, dtype=torch.float64):
        """Slow reference PSRoIPool: each output channel averages one dedicated
        input channel inside its (i, j) bin."""
        if device is None:
            device = torch.device("cpu")
        n_input_channels = x.size(1)
        # Position-sensitive pooling maps pool_h * pool_w input channels to one output channel.
        self.assertEqual(n_input_channels % (pool_h * pool_w), 0, "input channels must be divisible by ph * pw")
        n_output_channels = int(n_input_channels / (pool_h * pool_w))
        y = torch.zeros(rois.size(0), n_output_channels, pool_h, pool_w, dtype=dtype, device=device)

        def get_slice(k, block):
            return slice(int(np.floor(k * block)), int(np.ceil((k + 1) * block)))

        for roi_idx, roi in enumerate(rois):
            batch_idx = int(roi[0])
            j_begin, i_begin, j_end, i_end = (int(round(c.item() * spatial_scale)) for c in roi[1:])
            roi_x = x[batch_idx, :, i_begin:i_end + 1, j_begin:j_end + 1]

            # Guard against zero-sized RoIs when computing bin extents.
            roi_height = max(i_end - i_begin, 1)
            roi_width = max(j_end - j_begin, 1)
            bin_h, bin_w = roi_height / float(pool_h), roi_width / float(pool_w)

            for i in range(0, pool_h):
                for j in range(0, pool_w):
                    bin_x = roi_x[:, get_slice(i, bin_h), get_slice(j, bin_w)]
                    if bin_x.numel() > 0:
                        area = bin_x.size(-2) * bin_x.size(-1)
                        for c_out in range(0, n_output_channels):
                            # Input channel dedicated to output channel c_out at bin (i, j).
                            c_in = c_out * (pool_h * pool_w) + pool_w * i + j
                            t = torch.sum(bin_x[c_in, :, :])
                            y[roi_idx, c_out, i, j] = t / area
        return y
def bilinear_interpolate(data, height, width, y, x):
    """Bilinearly interpolate a value at (y, x) from a flattened 2D grid.

    Args:
        data: flat (row-major) sequence of length height * width.
        height, width: grid dimensions.
        y, x: fractional sample coordinates.

    Returns:
        Interpolated value, or 0. when (y, x) falls outside [-1, height] x
        [-1, width] (matching the CUDA/C++ roi_align kernels).
    """
    if y < -1.0 or y > height or x < -1.0 or x > width:
        return 0.

    # Clamp into the valid index range; samples past the last cell center
    # collapse onto the border pixel.
    y = min(max(0, y), height - 1)
    x = min(max(0, x), width - 1)

    y_low = int(y)
    y_high = min(y_low + 1, height - 1)
    x_low = int(x)
    x_high = min(x_low + 1, width - 1)

    # Interpolation weights: "h" weight is the fractional part toward the
    # high neighbor, "l" weight toward the low neighbor.
    wy_h = y - y_low
    wy_l = 1 - wy_h

    wx_h = x - x_low
    wx_l = 1 - wx_h

    val = 0
    for wx, xp in zip((wx_l, wx_h), (x_low, x_high)):
        for wy, yp in zip((wy_l, wy_h), (y_low, y_high)):
            val += wx * wy * data[yp * width + xp]
    return val
class RoIAlignTester(RoIOpTester, unittest.TestCase):
    """RoIAlign tests: average of bilinearly sampled points per RoI bin."""

    def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
        return ops.RoIAlign((pool_h, pool_w), spatial_scale=spatial_scale,
                            sampling_ratio=sampling_ratio)(x, rois)

    def get_script_fn(self, rois, pool_size):
        """Return a single-tensor callable over a scripted roi_align (for gradcheck)."""
        @torch.jit.script
        def script_fn(input, rois, pool_size):
            # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor
            return ops.roi_align(input, rois, pool_size, 1.0)[0]
        return lambda x: script_fn(x, rois, pool_size)

    def expected_fn(self, in_data, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1,
                    device=None, dtype=torch.float64):
        """Slow reference RoIAlign: average bilinear samples over a grid in each bin.

        sampling_ratio > 0 fixes the grid size; otherwise it adapts to ceil(bin size).
        """
        if device is None:
            device = torch.device("cpu")
        n_channels = in_data.size(1)
        out_data = torch.zeros(rois.size(0), n_channels, pool_h, pool_w, dtype=dtype, device=device)

        for r, roi in enumerate(rois):
            batch_idx = int(roi[0])
            # (x1, y1, x2, y2) scaled into feature-map coordinates (kept fractional).
            j_begin, i_begin, j_end, i_end = (c.item() * spatial_scale for c in roi[1:])

            roi_h = i_end - i_begin
            roi_w = j_end - j_begin
            bin_h = roi_h / pool_h
            bin_w = roi_w / pool_w

            for channel in range(0, n_channels):
                # Flatten once per channel: the plane is invariant across bins
                # and sampling points, so don't rebuild it in the inner loops.
                plane = in_data[batch_idx, channel, :, :].flatten()
                for i in range(0, pool_h):
                    start_h = i_begin + i * bin_h
                    grid_h = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_h))
                    for j in range(0, pool_w):
                        start_w = j_begin + j * bin_w
                        grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_w))

                        val = 0
                        for iy in range(0, grid_h):
                            y = start_h + (iy + 0.5) * bin_h / grid_h
                            for ix in range(0, grid_w):
                                x = start_w + (ix + 0.5) * bin_w / grid_w
                                val += bilinear_interpolate(
                                    plane,
                                    in_data.size(-2),
                                    in_data.size(-1),
                                    y, x
                                )
                        val /= grid_h * grid_w

                        out_data[r, channel, i, j] = val
        return out_data
def test_ps_roi_align_basic_cpu(self):
device = torch.device('cpu')
pool_size = 3
x = torch.rand(1, 2 * (pool_size ** 2), 7, 7, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 5, 5]], # format is (xyxy)
dtype=self.dtype, device=device)
pool_h, pool_w = (pool_size, pool_size)
ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2)
y = ps_roi_align(x, rois)
gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device,
spatial_scale=1, sampling_ratio=2,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU')
y = ps_roi_align(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device,
spatial_scale=1, sampling_ratio=-1,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU')
def test_ps_roi_align_cpu(self):
device = torch.device('cpu')
pool_size = 5
x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy)
[0, 0, 5, 4, 9],
[0, 5, 5, 9, 9],
[1, 0, 0, 9, 9]],
dtype=self.dtype, device=device)
pool_h, pool_w = (pool_size, pool_size)
ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2)
y = ps_roi_align(x, rois)
gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device,
spatial_scale=1, sampling_ratio=2,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU')
y = ps_roi_align(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w,
device, spatial_scale=1, sampling_ratio=2,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU')
def test_ps_roi_align_gradient_cpu(self):
    """Backward PSRoIAlign on CPU: forward + sum().backward() on an all-ones
    input, comparing x.grad against a precomputed expected gradient."""
    device = torch.device('cpu')
    pool_size = 3
    # sampling_ratio=-1 selects the adaptive grid (ceil of the bin size).
    layer = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1,
                           sampling_ratio=-1).to(dtype=self.dtype, device=device)
    x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True)
    rois = torch.tensor([
        [0, 0, 0, 4, 4],
        [0, 0, 3, 5, 5],
        [0, 1, 0, 2, 4]],
        dtype=self.dtype, device=device)
    y = layer(x, rois)
    s = y.sum()
    s.backward()
    # Expected gradient, precomputed for the exact inputs above.
    gt_grad = torch.tensor([[[[8.125e-01, 6.875e-01, 0.0, 0.0, 0.0, ],
                              [2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ],
                              [1.0416666667e-01, 6.25e-02, 0.0, 0.0, 0.0, ],
                              [5.2083333333e-01, 3.125e-01, 0.0, 0.0, 0.0, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ]],
                             [[8.3266726847e-17, 1.125e00, 3.750e-01, 0.0, 0.0, ],
                              [2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ],
                              [0.0, 3.4722222222e-02, 9.7222222222e-02, 3.4722222222e-02, 0.0, ],
                              [0.0, 1.7361111111e-01, 4.8611111111e-01, 1.7361111111e-01, 0.0, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ]],
                             [[0.0, 5.000e-01, 4.375e-01, 5.000e-01, 6.25e-02, ],
                              [0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ],
                              [0.0, 0.0, 0.0, 6.25e-02, 1.0416666667e-01, ],
                              [0.0, 0.0, 0.0, 3.125e-01, 5.2083333333e-01, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ],
                              [5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ],
                              [3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ],
                              [3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ],
                              [5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ],
                              [0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ],
                              [0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ],
                              [0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ],
                              [0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ],
                              [0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ],
                              [7.2222222222e-01, 6.1111111111e-01, 0.0, 0.0, 0.0, ],
                              [7.1527777778e-01, 4.5138888889e-01, 0.0, 0.0, 0.0, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ],
                              [7.4014868308e-17, 1.000e00, 3.3333333333e-01, 0.0, 0.0, ],
                              [9.2518585385e-18, 3.3333333333e-01, 6.25e-01, 2.0833333333e-01, 0.0, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ],
                              [0.0, 4.4444444444e-01, 3.8888888889e-01, 4.4444444444e-01, 5.5555555556e-02, ],
                              [0.0, 5.5555555556e-02, 4.8611111111e-02, 4.3055555556e-01, 6.3194444444e-01, ]]]],
                           device=device, dtype=self.dtype)
    self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIAlign on CPU')
def test_ps_roi_align_gradcheck_cpu(self):
    """gradcheck PSRoIAlign on CPU, in both module form (contiguous and
    transposed input) and TorchScript functional form."""
    device = torch.device('cpu')
    pool_size = 5
    x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
    rois = torch.tensor([
        [0, 0, 0, 9, 9],
        [0, 0, 5, 5, 9],
        [0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
    m = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1,
                       sampling_ratio=2).to(dtype=self.dtype, device=device)

    def run_layer(inp):
        return m(inp, rois)

    for candidate in (x, x.permute(0, 1, 3, 2)):
        self.assertTrue(gradcheck(run_layer, (candidate,)),
                        'gradcheck failed for PSRoIAlign on CPU')

    @torch.jit.script
    def script_func(input, rois):
        return ops.ps_roi_align(input, rois, 5, 2.0, 1)[0]

    self.assertTrue(gradcheck(lambda inp: script_func(inp, rois), (x,)),
                    'gradcheck failed for scripted ps_roi_align on CPU')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_align_basic_cuda(self):
    """Forward PSRoIAlign on CUDA, single ROI, against the slow (CPU-math)
    reference, on contiguous and non-contiguous (transposed) input."""
    device = torch.device('cuda')
    pool_size = 3
    x = torch.rand(1, 2 * (pool_size ** 2), 7, 7, dtype=self.dtype, device=device)
    rois = torch.tensor([[0, 0, 0, 5, 5]],  # format is (xyxy)
                        dtype=self.dtype, device=device)
    pool_h, pool_w = (pool_size, pool_size)
    ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2)
    y = ps_roi_align(x, rois)
    gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device,
                                  spatial_scale=1, sampling_ratio=2,
                                  dtype=self.dtype)
    self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect')

    # Non-contiguous input. Use the layer's sampling_ratio=2 for the
    # reference (was -1, which only matched because ceil(bin) == 2 here).
    y = ps_roi_align(x.permute(0, 1, 3, 2), rois)
    gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device,
                                  spatial_scale=1, sampling_ratio=2,
                                  dtype=self.dtype)
    self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_align_cuda(self):
    """Forward PSRoIAlign on CUDA, multiple ROIs over a two-image batch,
    against the slow reference on contiguous and transposed input."""
    # The skipIf decorator already guarantees CUDA is available; the former
    # `if torch.cuda.is_available() else cpu` fallback was dead code and
    # inconsistent with the sibling CUDA tests.
    device = torch.device('cuda')
    pool_size = 5
    x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device)
    rois = torch.tensor([[0, 0, 0, 9, 9],  # format is (xyxy)
                         [0, 0, 5, 4, 9],
                         [0, 5, 5, 9, 9],
                         [1, 0, 0, 9, 9]],
                        dtype=self.dtype, device=device)
    pool_h, pool_w = (pool_size, pool_size)
    ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2)
    y = ps_roi_align(x, rois)
    gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device,
                                  spatial_scale=1, sampling_ratio=2,
                                  dtype=self.dtype)
    self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect')
    y = ps_roi_align(x.permute(0, 1, 3, 2), rois)
    gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w,
                                  device, spatial_scale=1, sampling_ratio=2,
                                  dtype=self.dtype)
    self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_align_gradient_cuda(self):
    """Backward PSRoIAlign on CUDA: forward + sum().backward() on an
    all-ones input, comparing x.grad against a precomputed expected
    gradient (same values as the CPU twin of this test)."""
    device = torch.device('cuda')
    pool_size = 3
    # sampling_ratio=-1 selects the adaptive grid (ceil of the bin size).
    layer = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1,
                           sampling_ratio=-1).to(dtype=self.dtype, device=device)
    x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True)
    rois = torch.tensor([
        [0, 0, 0, 4, 4],
        [0, 0, 3, 5, 5],
        [0, 1, 0, 2, 4]],
        dtype=self.dtype, device=device)
    y = layer(x, rois)
    s = y.sum()
    s.backward()
    # Expected gradient, precomputed for the exact inputs above.
    gt_grad = torch.tensor([[[[8.125e-01, 6.875e-01, 0.0, 0.0, 0.0, ],
                              [2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ],
                              [1.0416666667e-01, 6.25e-02, 0.0, 0.0, 0.0, ],
                              [5.2083333333e-01, 3.125e-01, 0.0, 0.0, 0.0, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ]],
                             [[8.3266726847e-17, 1.125e00, 3.750e-01, 0.0, 0.0, ],
                              [2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ],
                              [0.0, 3.4722222222e-02, 9.7222222222e-02, 3.4722222222e-02, 0.0, ],
                              [0.0, 1.7361111111e-01, 4.8611111111e-01, 1.7361111111e-01, 0.0, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ]],
                             [[0.0, 5.000e-01, 4.375e-01, 5.000e-01, 6.25e-02, ],
                              [0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ],
                              [0.0, 0.0, 0.0, 6.25e-02, 1.0416666667e-01, ],
                              [0.0, 0.0, 0.0, 3.125e-01, 5.2083333333e-01, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ],
                              [5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ],
                              [3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ],
                              [3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ],
                              [5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ],
                              [0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ],
                              [0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ],
                              [0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ],
                              [0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ],
                              [0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ],
                              [7.2222222222e-01, 6.1111111111e-01, 0.0, 0.0, 0.0, ],
                              [7.1527777778e-01, 4.5138888889e-01, 0.0, 0.0, 0.0, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ],
                              [7.4014868308e-17, 1.000e00, 3.3333333333e-01, 0.0, 0.0, ],
                              [9.2518585385e-18, 3.3333333333e-01, 6.25e-01, 2.0833333333e-01, 0.0, ]],
                             [[0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [0.0, 0.0, 0.0, 0.0, 0.0, ],
                              [0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ],
                              [0.0, 4.4444444444e-01, 3.8888888889e-01, 4.4444444444e-01, 5.5555555556e-02, ],
                              [0.0, 5.5555555556e-02, 4.8611111111e-02, 4.3055555556e-01, 6.3194444444e-01, ]]]],
                           device=device, dtype=self.dtype)
    self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIAlign')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_align_gradcheck_cuda(self):
    """gradcheck PSRoIAlign on CUDA (module and scripted functional forms).

    NOTE(review): the diff rendering fused this method with fragments of
    the replacement PSRoIAlignTester class; this body reconstructs the
    old-side method, mirroring its CPU twin.
    """
    device = torch.device('cuda')
    pool_size = 5
    x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
    rois = torch.tensor([
        [0, 0, 0, 9, 9],
        [0, 0, 5, 5, 9],
        [0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
    m = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1,
                       sampling_ratio=2).to(dtype=self.dtype, device=device)

    def func(input):
        return m(input, rois)

    self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for PSRoIAlign CUDA')
    self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for PSRoIAlign CUDA')

    @torch.jit.script
    def script_func(input, rois):
        return ops.ps_roi_align(input, rois, 5, 2.0, 1)[0]

    self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)),
                    'gradcheck failed for scripted ps_roi_align on CUDA')
class PSRoIPoolTester(unittest.TestCase):
    # Tests for position-sensitive ROI pooling (ops.PSRoIPool / ops.ps_roi_pool).

    @classmethod
    def setUpClass(cls):
        # float64 so gradcheck's finite-difference comparisons are accurate.
        cls.dtype = torch.float64
def slow_ps_roi_pooling(self, x, rois, pool_h, pool_w, device, spatial_scale=1,
                        dtype=torch.float64):
    """Slow reference implementation of position-sensitive ROI pooling.

    Each output cell (c_out, j, i) averages a distinct input channel
    c_in = c_out * pool_h * pool_w + j * pool_w + i over its bin.

    :param x: input features of shape (N, C, H, W); C must be divisible
        by pool_h * pool_w.
    :param rois: tensor of shape (R, 5) with rows (batch_idx, x1, y1, x2, y2).
    :param device: device for the output tensor (None -> CPU).
    :returns: tensor of shape (R, C // (pool_h * pool_w), pool_h, pool_w).
    """
    # Reconstructed from the diff-garbled original; fixes the semi-bug
    # where end_h was computed from bin_w instead of bin_h.
    if device is None:
        device = torch.device("cpu")
    num_input_channels = x.size(1)
    self.assertEqual(num_input_channels % (pool_h * pool_w), 0, "input channels must be divisible by ph * pw")
    num_output_channels = int(num_input_channels / (pool_h * pool_w))
    y = torch.zeros(rois.size(0), num_output_channels, pool_h, pool_w, dtype=dtype, device=device)
    rois = torch.round(rois * spatial_scale).int()
    for n in range(0, x.size(0)):
        for r, roi in enumerate(rois):
            if roi[0] != n:  # ROI belongs to a different batch element
                continue
            c_in = 0
            for c_out in range(0, num_output_channels):
                roi_height = max(roi[4].item() - roi[2].item(), 1)
                roi_width = max(roi[3].item() - roi[1].item(), 1)
                bin_h, bin_w = roi_height / float(pool_h), roi_width / float(pool_w)

                for j in range(0, pool_h):
                    start_h = int(np.floor(j * bin_h)) + roi[2].item()
                    # was bin_w here -- the vertical bin end must use bin_h
                    end_h = int(np.ceil((j + 1) * bin_h)) + roi[2].item()

                    # range-check: clamp to the feature map
                    start_h = min(max(start_h, 0), x.size(2))
                    end_h = min(max(end_h, 0), x.size(2))

                    for i in range(0, pool_w):
                        start_w = int(np.floor(i * bin_w)) + roi[1].item()
                        end_w = int(np.ceil((i + 1) * bin_w)) + roi[1].item()

                        # range-check: clamp to the feature map
                        start_w = min(max(start_w, 0), x.size(3))
                        end_w = min(max(end_w, 0), x.size(3))

                        is_empty = (end_h <= start_h) or (end_w <= start_w)
                        area = (end_h - start_h) * (end_w - start_w)

                        if not is_empty:
                            t = torch.sum(x[n, c_in, slice(start_h, end_h), slice(start_w, end_w)])
                            y[r, c_out, j, i] = t / area
                        c_in += 1  # next output pixel uses the next input channel
    return y
def test_ps_roi_pool_basic_cpu(self):
    """Forward PSRoIPool on CPU, single ROI, against the slow reference,
    for contiguous and transposed (non-contiguous) input."""
    device = torch.device('cpu')
    pool_size = 3
    pool_h = pool_w = pool_size
    feats = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device)
    boxes = torch.tensor([[0, 0, 0, 4, 4]],  # format is (xyxy)
                         dtype=self.dtype, device=device)
    layer = ops.PSRoIPool((pool_h, pool_w), 1)
    for inp in (feats, feats.permute(0, 1, 3, 2)):
        out = layer(inp, boxes)
        ref = self.slow_ps_roi_pooling(inp, boxes, pool_h, pool_w, device, dtype=self.dtype)
        self.assertTrue(torch.allclose(ref, out), 'PSRoIPool layer incorrect on CPU')
def test_ps_roi_pool_cpu(self):
    """Forward PSRoIPool on CPU, several ROIs over a two-image batch,
    against the slow reference on contiguous and transposed input."""
    device = torch.device('cpu')
    pool_size = 5
    pool_h = pool_w = pool_size
    feats = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device)
    boxes = torch.tensor([[0, 0, 0, 9, 9],  # format is (xyxy)
                          [0, 0, 5, 4, 9],
                          [0, 5, 5, 9, 9],
                          [1, 0, 0, 9, 9]],
                         dtype=self.dtype, device=device)
    layer = ops.PSRoIPool((pool_h, pool_w), 1)
    for inp in (feats, feats.permute(0, 1, 3, 2)):
        out = layer(inp, boxes)
        ref = self.slow_ps_roi_pooling(inp, boxes, pool_h, pool_w, device, dtype=self.dtype)
        self.assertTrue(torch.allclose(ref, out), 'PSRoIPool layer incorrect on CPU')
def test_ps_roi_pool_gradient_cpu(self):
    """Backward PSRoIPool on CPU: forward + sum().backward() on an all-ones
    input, comparing x.grad against a precomputed expected gradient."""
    device = torch.device('cpu')
    pool_size = 3
    layer = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device)
    x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True)
    rois = torch.tensor([
        [0, 0, 0, 4, 4],
        [0, 0, 3, 5, 5],
        [0, 1, 0, 2, 4]],
        dtype=self.dtype, device=device)
    y = layer(x, rois)
    s = y.sum()
    s.backward()
    # Expected gradient, precomputed for the exact inputs above.
    gt_grad = torch.tensor([[[[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
                             [[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
                             [[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.5000, 0.5000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.2500, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.2500, 0.0000, 0.0000, 0.0000]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000],
                              [0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.2500, 0.2500],
                              [0.0000, 0.0000, 0.0000, 0.2500, 0.2500]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.5000, 0.5000, 0.0000, 0.0000, 0.0000]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.5000, 0.5000]]]],
                           device=device, dtype=self.dtype)
    self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIPool on CPU')
def test_ps_roi_pool_gradcheck_cpu(self):
    """gradcheck PSRoIPool on CPU, in both module form (contiguous and
    transposed input) and TorchScript functional form."""
    device = torch.device('cpu')
    pool_size = 5
    x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
    rois = torch.tensor([
        [0, 0, 0, 9, 9],
        [0, 0, 5, 5, 9],
        [0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
    m = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device)

    def run_layer(inp):
        return m(inp, rois)

    for candidate in (x, x.permute(0, 1, 3, 2)):
        self.assertTrue(gradcheck(run_layer, (candidate,)),
                        'gradcheck failed for PSRoIPool on CPU')

    @torch.jit.script
    def script_func(input, rois):
        return ops.ps_roi_pool(input, rois, 5, 1.0)[0]

    self.assertTrue(gradcheck(lambda inp: script_func(inp, rois), (x,)),
                    'gradcheck failed for scripted ps_roi_pool on CPU')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_pool_basic_cuda(self):
    """Forward PSRoIPool on CUDA, single ROI, against the slow reference,
    for contiguous and transposed (non-contiguous) input."""
    device = torch.device('cuda')
    pool_size = 3
    pool_h = pool_w = pool_size
    feats = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device)
    boxes = torch.tensor([[0, 0, 0, 4, 4]],  # format is (xyxy)
                         dtype=self.dtype, device=device)
    layer = ops.PSRoIPool((pool_h, pool_w), 1)
    for inp in (feats, feats.permute(0, 1, 3, 2)):
        out = layer(inp, boxes)
        ref = self.slow_ps_roi_pooling(inp, boxes, pool_h, pool_w, device, dtype=self.dtype)
        self.assertTrue(torch.allclose(ref.cuda(), out), 'PSRoIPool layer incorrect')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_pool_cuda(self):
    """Forward PSRoIPool on CUDA, several ROIs over a two-image batch,
    against the slow reference on contiguous and transposed input."""
    # The skipIf decorator already guarantees CUDA is available; the former
    # `if torch.cuda.is_available() else cpu` fallback was dead code and
    # inconsistent with the sibling CUDA tests.
    device = torch.device('cuda')
    pool_size = 5
    x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device)
    rois = torch.tensor([[0, 0, 0, 9, 9],  # format is (xyxy)
                         [0, 0, 5, 4, 9],
                         [0, 5, 5, 9, 9],
                         [1, 0, 0, 9, 9]],
                        dtype=self.dtype, device=device)
    pool_h, pool_w = (pool_size, pool_size)
    ps_roi_pool = ops.PSRoIPool((pool_h, pool_w), 1)
    y = ps_roi_pool(x, rois)
    gt_y = self.slow_ps_roi_pooling(x, rois, pool_h, pool_w, device, dtype=self.dtype)
    self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIPool layer incorrect')
    y = ps_roi_pool(x.permute(0, 1, 3, 2), rois)
    gt_y = self.slow_ps_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, dtype=self.dtype)
    self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIPool layer incorrect')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_pool_gradient_cuda(self):
    """Backward PSRoIPool on CUDA: forward + sum().backward() on an
    all-ones input, comparing x.grad against a precomputed expected
    gradient (same values as the CPU twin of this test)."""
    device = torch.device('cuda')
    pool_size = 3
    layer = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device)
    x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True)
    rois = torch.tensor([
        [0, 0, 0, 4, 4],
        [0, 0, 3, 5, 5],
        [0, 1, 0, 2, 4]],
        dtype=self.dtype, device=device)
    y = layer(x, rois)
    s = y.sum()
    s.backward()
    # Expected gradient, precomputed for the exact inputs above.
    gt_grad = torch.tensor([[[[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
                             [[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
                             [[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.5000, 0.5000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.2500, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.2500, 0.0000, 0.0000, 0.0000]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000],
                              [0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.2500, 0.2500],
                              [0.0000, 0.0000, 0.0000, 0.2500, 0.2500]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
                              [0.5000, 0.5000, 0.0000, 0.0000, 0.0000]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
                              [0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000]],
                             [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                              [0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
                              [0.0000, 0.0000, 0.0000, 0.5000, 0.5000]]]],
                           device=device, dtype=self.dtype)
    self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIPool')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_pool_gradcheck_cuda(self):
    """gradcheck PSRoIPool on CUDA, in both module form (contiguous and
    transposed input) and TorchScript functional form."""
    device = torch.device('cuda')
    pool_size = 5
    x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
    rois = torch.tensor([
        [0, 0, 0, 9, 9],
        [0, 0, 5, 5, 9],
        [0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
    m = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device)

    def run_layer(inp):
        return m(inp, rois)

    for candidate in (x, x.permute(0, 1, 3, 2)):
        self.assertTrue(gradcheck(run_layer, (candidate,)),
                        'gradcheck failed for PSRoIPool CUDA')

    @torch.jit.script
    def script_func(input, rois):
        return ops.ps_roi_pool(input, rois, 5, 1.0)[0]

    self.assertTrue(gradcheck(lambda inp: script_func(inp, rois), (x,)),
                    'gradcheck failed for scripted ps_roi_pool on CUDA')
class NMSTester(unittest.TestCase): class NMSTester(unittest.TestCase):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment