Commit af225a8a authored by pedrofreire, committed by Francisco Massa

Simplify and organize test_ops. (#1551)

* Simplify and organize test_ops.

We perform the following:

- Simplify the functions slow_roi_pooling, slow_ps_roi_pooling, slow_ps_roi_align and bilinear_interpolate (including finding and removing a semi-bug in slow_ps_roi_pooling, which used bin_w instead of bin_h);
- Write a slow_roi_align function, which was missing;
- Create a base class testing all combinations of forward/backward, cpu/cuda, contiguous/non-contiguous;
- Organize all testing inside the base class with _test_forward and _test_backward (which can easily be overridden if a particular op needs something different); an Op class then only needs to implement fn, get_script_fn, and expected_fn, as in the sketch below.
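
Roughly, the new layout looks like this (a simplified sketch, not the verbatim code; RoIPoolTester is shown, and the other testers follow the same shape):

    class RoIOpTester(object):
        def test_forward_cpu_contiguous(self):
            self._test_forward(device=torch.device('cpu'), contiguous=True)

        # ... the remaining forward/backward x cpu/cuda x contiguous/non-contiguous
        # combinations, plus _test_forward (compares fn against expected_fn) and
        # _test_backward (runs gradcheck on fn and on get_script_fn).

    class RoIPoolTester(RoIOpTester, unittest.TestCase):
        def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
            return ops.RoIPool((pool_h, pool_w), spatial_scale)(x, rois)

        def get_script_fn(self, rois, pool_size):
            @torch.jit.script
            def script_fn(input, rois, pool_size):
                # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor
                return ops.roi_pool(input, rois, pool_size, 1.0)[0]
            return lambda x: script_fn(x, rois, pool_size)

        def expected_fn(self, x, rois, pool_h, pool_w, **kwargs):
            ...  # slow, readable reference implementation to compare against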

A few points:
- We use the same inputs for all tests, rather than exercising the full input domain of each operation. One improvement would be to test more diverse inputs, and to tailor the inputs to particular ops (e.g. different inputs for pooling ops and align ops).
- Running all tests is still quite slow (~1 min for the CPU tests alone), so there is room to improve that.

* Reduce input size used in gradcheck.

gradcheck can be quite costly, and it was causing OOM errors and making
the tests slow. By reducing the size of the input, the test time is
down to 3 seconds for the CPU tests; a sketch of the resulting backward test follows.
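
Roughly, with the reduced sizes (RoIPool shown; func and m are illustrative names, and gradcheck needs float64 inputs with requires_grad=True):

    pool_size = 2
    x = torch.rand(1, 2 * (pool_size ** 2), 5, 5, dtype=torch.float64, requires_grad=True)
    rois = torch.tensor([[0, 0, 0, 4, 4]], dtype=torch.float64)  # format is (xyxy)
    m = ops.RoIPool((pool_size, pool_size), 1)

    def func(z):
        return m(z, rois)

    assert gradcheck(func, (x,))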

Other points:
- We remove an unused namedtuple;
- We inherit from object for better Python 2 compatibility;
- We remove a hardcoded pool_size from the TorchScript functions, and
add it as a parameter instead.

* Replace Tensor with torch.Tensor in type annotations.

This should fix lint errors.
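
For example, the scripted helpers are now annotated as:

    @torch.jit.script
    def script_fn(input, rois, pool_size):
        # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor
        return ops.roi_pool(input, rois, pool_size, 1.0)[0]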
parent 4897402a
from __future__ import division
import numpy as np
import torch
from torch.autograd import gradcheck
@@ -8,1168 +9,305 @@ from itertools import product
import unittest
class RoIPoolTester(unittest.TestCase):
class RoIOpTester(object):
@classmethod
def setUpClass(cls):
cls.dtype = torch.float64
def slow_roi_pooling(self, x, rois, pool_h, pool_w, spatial_scale=1,
device=None, dtype=torch.float64):
if device is None:
device = torch.device("cpu")
c = x.size(1)
y = torch.zeros(rois.size(0), c, pool_h, pool_w, dtype=dtype, device=device)
rois = torch.round(rois * spatial_scale)
for n in range(0, y.size(0)):
for r, roi in enumerate(rois):
if roi[0] == n:
start_h, end_h = int(roi[2].item()), int(roi[4].item()) + 1
start_w, end_w = int(roi[1].item()), int(roi[3].item()) + 1
roi_x = x[roi[0].long(), :, start_h:end_h, start_w:end_w]
bin_h, bin_w = roi_x.size(-2) / float(pool_h), roi_x.size(-1) / float(pool_w)
for j in range(0, pool_h):
cj = slice(int(np.floor(j * bin_h)), int(np.ceil((j + 1) * bin_h)))
for i in range(0, pool_w):
ci = slice(int(np.floor(i * bin_w)), int(np.ceil((i + 1) * bin_w)))
t = roi_x[:, cj, ci].reshape(c, -1)
if t.numel() > 0:
y[r, :, j, i] = torch.max(t, 1)[0]
return y
def test_roi_pool_basic_cpu(self):
device = torch.device('cpu')
x = torch.rand(1, 1, 10, 10, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 4, 4]], # format is (xyxy)
dtype=self.dtype, device=device)
pool_h, pool_w = (5, 5)
roi_pool = ops.RoIPool((pool_h, pool_w), 1)
y = roi_pool(x, rois)
gt_y = self.slow_roi_pooling(x, rois, pool_h, pool_w, device=device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU')
# non-contiguous
y = roi_pool(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device=device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU')
def test_roi_pool_cpu(self):
device = torch.device('cpu')
x = torch.rand(2, 1, 10, 10, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy)
[0, 0, 5, 4, 9],
[0, 5, 5, 9, 9],
[1, 0, 0, 9, 9]],
dtype=self.dtype, device=device)
pool_h, pool_w = (5, 5)
roi_pool = ops.RoIPool((pool_h, pool_w), 1)
y = roi_pool(x, rois)
gt_y = self.slow_roi_pooling(x, rois, pool_h, pool_w, device=device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU for batch > 1')
# non-contiguous
y = roi_pool(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device=device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU for batch > 1')
def test_roi_pool_cpu_empty_rois(self):
device = torch.device('cpu')
x = torch.tensor(
[[[[0.1767, 1.2851, 4.2325, 4.8645, 7.1496]],
[[2.5916, 4.3361, 3.8143, 6.1329, 2.0230]],
[[1.4492, 3.3384, 4.0816, 6.3116, 5.1068]]]],
dtype=self.dtype, device=device)
rois = torch.tensor(
[[0., 1., 0., 4., 0.],
[0., 2., 0., 3., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 2., 0., 2., 0.]],
dtype=self.dtype, device=device)
pool_h, pool_w = (1, 2)
roi_pool = ops.RoIPool((pool_h, pool_w), 1)
y = roi_pool(x, rois)
        gt_y = self.slow_roi_pooling(x, rois, pool_h, pool_w, device=device, dtype=self.dtype)
        self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU empty rois')

        # non-contiguous
        y = roi_pool(x.permute(0, 1, 3, 2), rois)
        gt_y = self.slow_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device=device, dtype=self.dtype)
        self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU for empty rois non-contiguous')

    def test_forward_cpu_contiguous(self):
        self._test_forward(device=torch.device('cpu'), contiguous=True)

    def test_forward_cpu_non_contiguous(self):
        self._test_forward(device=torch.device('cpu'), contiguous=False)

    def test_backward_cpu_contiguous(self):
        self._test_backward(device=torch.device('cpu'), contiguous=True)
def test_roi_pool_gradient_cpu(self):
device = torch.device('cpu')
x = torch.ones(1, 1, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
rois = torch.tensor([
[0, 0, 0, 9, 9],
[0, 0, 5, 4, 9],
[0, 0, 0, 4, 4]],
dtype=self.dtype, device=device)
layer = ops.RoIPool((5, 5), 1).to(dtype=self.dtype, device=device)
y = layer(x, rois)
s = y.sum()
s.backward()
gt_grad = torch.tensor([[[[2., 1., 2., 1., 2., 0., 1., 0., 1., 0.],
[1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
[2., 1., 2., 1., 2., 0., 1., 0., 1., 0.],
[1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
[2., 1., 2., 1., 2., 0., 1., 0., 1., 0.],
[1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
[2., 1., 2., 1., 2., 0., 1., 0., 1., 0.],
[1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
[2., 1., 2., 1., 2., 0., 1., 0., 1., 0.],
[1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]]]],
device=device, dtype=self.dtype)
self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for roi_pool')
def test_roi_pool_align_non_cont_grad_cpu(self):
devices = ['cpu']
if torch.cuda.is_available():
devices.append('cuda')
for d in devices:
device = torch.device(d)
rois = torch.tensor([
[0, 0, 0, 9, 9],
[0, 0, 5, 5, 9],
[0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
grad_cont = torch.rand(3, 1, 5, 5, dtype=self.dtype, device=device)
grad = grad_cont.permute(2, 1, 3, 0).contiguous().permute(3, 1, 0, 2)
for op in ['RoIPool', 'RoIAlign']:
x = torch.rand(1, 1, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
kwargs = {}
if op == 'RoIAlign':
kwargs['sampling_ratio'] = 1
m = getattr(ops, op)((5, 5), 1, **kwargs)
y = m(x, rois)
y.backward(grad_cont)
g1 = x.grad.detach().clone()
del x.grad
y = m(x, rois)
y.backward(grad)
g2 = x.grad.detach().clone()
del x.grad
self.assertTrue(torch.allclose(g1, g2), 'gradient incorrect for {}'.format(op))
def test_roi_pool_gradcheck_cpu(self):
device = torch.device('cpu')
x = torch.rand(1, 1, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
rois = torch.tensor([
[0, 0, 0, 9, 9],
[0, 0, 5, 5, 9],
[0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
m = ops.RoIPool((5, 5), 1).to(dtype=self.dtype, device=device)
def func(input):
return m(input, rois)
self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for roi_pool CPU')
self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for roi_pool CPU')
@torch.jit.script
def script_func(input, rois):
return ops.roi_pool(input, rois, 5, 1.0)[0]
self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)), 'gradcheck failed for scripted roi_pool')
    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_roi_pool_basic_cuda(self):
        device = torch.device('cuda')
        x = torch.rand(1, 1, 10, 10, dtype=self.dtype, device=device)
        rois = torch.tensor([[0, 0, 0, 4, 4]],  # format is (xyxy)
                            dtype=self.dtype, device=device)
        pool_h, pool_w = (5, 5)
        roi_pool = ops.RoIPool((pool_h, pool_w), 1)
        y = roi_pool(x, rois)
        gt_y = self.slow_roi_pooling(x, rois, pool_h, pool_w, device=device, dtype=self.dtype)
        self.assertTrue(torch.allclose(gt_y.cuda(), y), 'RoIPool layer incorrect')
        y = roi_pool(x.permute(0, 1, 3, 2), rois)
        gt_y = self.slow_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device=device, dtype=self.dtype)
        self.assertTrue(torch.allclose(gt_y.cuda(), y), 'RoIPool layer incorrect')

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_roi_pool_cuda(self):
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        x = torch.rand(2, 1, 10, 10, dtype=self.dtype, device=device)

    def test_backward_cpu_non_contiguous(self):
        self._test_backward(device=torch.device('cpu'), contiguous=False)

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_forward_cuda_contiguous(self):
        self._test_forward(device=torch.device('cuda'), contiguous=True)

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_forward_cuda_non_contiguous(self):
        self._test_forward(device=torch.device('cuda'), contiguous=False)

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_backward_cuda_contiguous(self):
        self._test_backward(device=torch.device('cuda'), contiguous=True)

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_backward_cuda_non_contiguous(self):
        self._test_backward(device=torch.device('cuda'), contiguous=False)
def _test_forward(self, device, contiguous):
pool_size = 5
        # n_channels % (pool_size ** 2) == 0 required for PS operations.
n_channels = 2 * (pool_size ** 2)
x = torch.rand(2, n_channels, 10, 10, dtype=self.dtype, device=device)
if not contiguous:
x = x.permute(0, 1, 3, 2)
rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy)
[0, 0, 5, 4, 9],
[0, 5, 5, 9, 9],
[1, 0, 0, 9, 9]],
dtype=self.dtype, device=device)
        pool_h, pool_w = pool_size, pool_size
        y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1)
        gt_y = self.expected_fn(x, rois, pool_h, pool_w, spatial_scale=1,
                                sampling_ratio=-1, device=device, dtype=self.dtype)
        self.assertTrue(torch.allclose(gt_y, y))

    def _test_backward(self, device, contiguous):
        pool_size = 2
        x = torch.rand(1, 2 * (pool_size ** 2), 5, 5, dtype=self.dtype, device=device, requires_grad=True)
        if not contiguous:
            x = x.permute(0, 1, 3, 2)
        rois = torch.tensor([[0, 0, 0, 4, 4],  # format is (xyxy)
                             [0, 0, 2, 3, 4],
                             [0, 2, 2, 4, 4]],
                            dtype=self.dtype, device=device)

        def func(z):
            return self.fn(z, rois, pool_size, pool_size, spatial_scale=1, sampling_ratio=1)

        script_func = self.get_script_fn(rois, pool_size)

        self.assertTrue(gradcheck(func, (x,)))
        self.assertTrue(gradcheck(script_func, (x,)))
        return

    def fn(*args, **kwargs):
        pass

    def get_script_fn(*args, **kwargs):
        pass

    def expected_fn(*args, **kwargs):
        pass

        pool_h, pool_w = (5, 5)
        roi_pool = ops.RoIPool((pool_h, pool_w), 1)
        y = roi_pool(x, rois)
        gt_y = self.slow_roi_pooling(x, rois, pool_h, pool_w, device=device, dtype=self.dtype)
        self.assertTrue(torch.allclose(gt_y.cuda(), y), 'RoIPool layer incorrect')
        y = roi_pool(x.permute(0, 1, 3, 2), rois)
        gt_y = self.slow_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device=device, dtype=self.dtype)
        self.assertTrue(torch.allclose(gt_y.cuda(), y), 'RoIPool layer incorrect')

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_roi_pool_gradient_cuda(self):
        device = torch.device('cuda')
        layer = ops.RoIPool((5, 5), 1).to(dtype=self.dtype, device=device)
        x = torch.ones(1, 1, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
        rois = torch.tensor([
            [0, 0, 0, 9, 9],
            [0, 0, 5, 4, 9],
            [0, 0, 0, 4, 4]],
            dtype=self.dtype, device=device)
        y = layer(x, rois)
        s = y.sum()
        s.backward()
        gt_grad = torch.tensor([[[[2., 1., 2., 1., 2., 0., 1., 0., 1., 0.],
                                  [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
                                  [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.],
                                  [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
                                  [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.],
                                  [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
                                  [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.],
                                  [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
                                  [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.],
                                  [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]]]],
                               device=device, dtype=self.dtype)
        self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for roi_pool')

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_roi_pool_gradcheck_cuda(self):
        device = torch.device('cuda')
        x = torch.rand(1, 1, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
        rois = torch.tensor([
            [0, 0, 0, 9, 9],
            [0, 0, 5, 5, 9],
            [0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
        m = ops.RoIPool((5, 5), 1).to(dtype=self.dtype, device=device)

        def func(input):
            return m(input, rois)

        self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for roi_pool CUDA')
        self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for roi_pool CUDA')

        @torch.jit.script
        def script_func(input, rois):
            return ops.roi_pool(input, rois, 5, 1.0)[0]

        self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)),
                        'gradcheck failed for scripted roi_pool on CUDA')
class RoIAlignTester(unittest.TestCase):
@classmethod
def setUpClass(cls):
torch.manual_seed(123)
cls.dtype = torch.float32
cls.x = torch.rand(1, 1, 10, 10, dtype=cls.dtype)
cls.single_roi = torch.tensor([[0, 0, 0, 4, 4]], # format is (xyxy)
dtype=cls.dtype)
cls.rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy)
[0, 0, 5, 4, 9],
[0, 5, 5, 9, 9]],
dtype=cls.dtype)
cls.gt_y_single = torch.tensor(
[[[[0.41617328, 0.5040753, 0.25266218, 0.4296828, 0.29928464],
[0.5210769, 0.57222337, 0.2524979, 0.32063985, 0.32635176],
[0.73108256, 0.6114335, 0.62033176, 0.8188273, 0.5562218],
[0.83115816, 0.70803946, 0.7084047, 0.74928707, 0.7769296],
[0.54266506, 0.45964524, 0.5780159, 0.80522037, 0.7321807]]]], dtype=cls.dtype)
cls.gt_y_multiple = torch.tensor(
[[[[0.49311584, 0.35972416, 0.40843594, 0.3638034, 0.49751836],
[0.70881474, 0.75481665, 0.5826779, 0.34767765, 0.46865487],
[0.4740328, 0.69306874, 0.3617804, 0.47145438, 0.66130304],
[0.6861706, 0.17634538, 0.47194335, 0.42473823, 0.37930614],
[0.62666404, 0.49973848, 0.37911576, 0.5842756, 0.7176864]]],
[[[0.67499936, 0.6607055, 0.42656037, 0.46134934, 0.42144877],
[0.7471722, 0.7235433, 0.14512213, 0.13031253, 0.289369],
[0.8443615, 0.6659734, 0.23614208, 0.14719573, 0.4268827],
[0.69429564, 0.5621515, 0.5019923, 0.40678093, 0.34556213],
[0.51315194, 0.7177093, 0.6494485, 0.6775592, 0.43865064]]],
[[[0.24465509, 0.36108392, 0.64635646, 0.4051828, 0.33956185],
[0.49006107, 0.42982674, 0.34184104, 0.15493104, 0.49633422],
[0.54400194, 0.5265246, 0.22381854, 0.3929715, 0.6757667],
[0.32961223, 0.38482672, 0.68877804, 0.71822757, 0.711909],
[0.561259, 0.71047884, 0.84651315, 0.8541089, 0.644432]]]], dtype=cls.dtype)
cls.x_grad = torch.tensor(
[[[[0.075625, 0.15125, 0.15124999, 0.15125002, 0.15812504,
0.15812503, 0.15124999, 0.15124999, 0.15125006, 0.0756249],
[0.15125, 0.30250007, 0.3025, 0.30250007, 0.31625012,
0.31625003, 0.3025, 0.3025, 0.30250013, 0.1512498],
[0.15124999, 0.3025, 0.30249995, 0.3025, 0.31625006,
0.31625, 0.30249995, 0.30249995, 0.30250007, 0.15124978],
[0.15125002, 0.30250007, 0.3025, 0.30250007, 0.31625012,
0.3162501, 0.3025, 0.3025, 0.30250013, 0.15124981],
[0.15812504, 0.31625012, 0.31625006, 0.31625012, 0.33062524,
0.3306251, 0.31625006, 0.31625006, 0.3162502, 0.15812483],
[0.5181251, 1.0962502, 1.0362502, 1.0962503, 0.69062525, 0.6906252,
1.0962502, 1.0362502, 1.0962503, 0.5181248],
[0.93125, 1.9925, 1.8624997, 1.9925, 1.0962502, 1.0962502,
1.9925, 1.8624998, 1.9925, 0.9312496],
[0.8712501, 1.8625, 1.7425002, 1.8625001, 1.0362502, 1.0362502,
1.8625, 1.7425001, 1.8625002, 0.8712497],
[0.93125004, 1.9925, 1.8625002, 1.9925, 1.0962503, 1.0962503,
1.9925001, 1.8625001, 1.9925001, 0.93124974],
[0.43562484, 0.9312497, 0.8712497, 0.9312497, 0.5181249, 0.5181248,
0.9312496, 0.8712497, 0.93124974, 0.43562466]]]], dtype=cls.dtype)
def test_roi_align_basic_cpu(self):
device = torch.device('cpu')
x = self.x.to(device)
single_roi = self.single_roi.to(device)
gt_y_single = self.gt_y_single.to(device)
pool_h, pool_w = (5, 5)
roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device)
y = roi_align(x, single_roi)
self.assertTrue(torch.allclose(gt_y_single, y), 'RoIAlign layer incorrect for single ROI on CPU')
y = roi_align(x.transpose(2, 3).contiguous().transpose(2, 3), single_roi)
self.assertTrue(torch.allclose(gt_y_single, y), 'RoIAlign layer incorrect for single ROI on CPU')
def test_roi_align_cpu(self):
device = torch.device('cpu')
x = self.x.to(device)
rois = self.rois.to(device)
gt_y_multiple = self.gt_y_multiple.to(device)
pool_h, pool_w = (5, 5)
roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device)
y = roi_align(x, rois)
self.assertTrue(torch.allclose(gt_y_multiple, y), 'RoIAlign layer incorrect for multiple ROIs on CPU')
y = roi_align(x.transpose(2, 3).contiguous().transpose(2, 3), rois)
self.assertTrue(torch.allclose(gt_y_multiple, y), 'RoIAlign layer incorrect for multiple ROIs on CPU')
class RoIPoolTester(RoIOpTester, unittest.TestCase):
def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
return ops.RoIPool((pool_h, pool_w), spatial_scale)(x, rois)
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_roi_align_basic_cuda(self):
device = torch.device('cuda')
x = self.x.to(device)
single_roi = self.single_roi.to(device)
gt_y_single = self.gt_y_single.to(device)
pool_h, pool_w = (5, 5)
roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device)
y = roi_align(x, single_roi)
self.assertTrue(torch.allclose(gt_y_single, y), 'RoIAlign layer incorrect for single ROI on CUDA')
y = roi_align(x.transpose(2, 3).contiguous().transpose(2, 3), single_roi)
self.assertTrue(torch.allclose(gt_y_single, y), 'RoIAlign layer incorrect for single ROI on CUDA')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_roi_align_cuda(self):
device = torch.device('cuda')
x = self.x.to(device)
rois = self.rois.to(device)
gt_y_multiple = self.gt_y_multiple.to(device)
pool_h, pool_w = (5, 5)
roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device)
y = roi_align(x, rois)
self.assertTrue(torch.allclose(gt_y_multiple, y), 'RoIAlign layer incorrect for multiple ROIs on CUDA')
        y = roi_align(x.transpose(2, 3).contiguous().transpose(2, 3), rois)
        self.assertTrue(torch.allclose(gt_y_multiple, y), 'RoIAlign layer incorrect for multiple ROIs on CUDA')

    def get_script_fn(self, rois, pool_size):
        @torch.jit.script
        def script_fn(input, rois, pool_size):
            # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor
            return ops.roi_pool(input, rois, pool_size, 1.0)[0]
        return lambda x: script_fn(x, rois, pool_size)
    def test_roi_align_gradient_cpu(self):
        """
        Compute gradients for RoIAlign with multiple bounding boxes on CPU
        """
        device = torch.device('cpu')
        pool_h, pool_w = (5, 5)
        roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device)

        x = self.x.to(device).clone()
        rois = self.rois.to(device)
        gt_grad = self.x_grad.to(device)

        x.requires_grad = True
        y = roi_align(x, rois)
        s = y.sum()
        s.backward()

        self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for RoIAlign CPU')

    def test_roi_align_gradcheck_cpu(self):
        dtype = torch.float64
        device = torch.device('cpu')
        m = ops.RoIAlign((5, 5), 0.5, 1).to(dtype=dtype, device=device)
        x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True)
        rois = self.rois.to(device=device, dtype=dtype)

        def func(input):
            return m(input, rois)

        self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for RoIAlign CPU')
        self.assertTrue(gradcheck(func, (x.transpose(2, 3),)), 'gradcheck failed for RoIAlign CPU')

    def expected_fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1,
                    device=None, dtype=torch.float64):
        if device is None:
            device = torch.device("cpu")

        n_channels = x.size(1)
        y = torch.zeros(rois.size(0), n_channels, pool_h, pool_w, dtype=dtype, device=device)

        def get_slice(k, block):
            return slice(int(np.floor(k * block)), int(np.ceil((k + 1) * block)))

        for roi_idx, roi in enumerate(rois):
            batch_idx = int(roi[0])
            j_begin, i_begin, j_end, i_end = (int(round(x.item() * spatial_scale)) for x in roi[1:])
            roi_x = x[batch_idx, :, i_begin:i_end + 1, j_begin:j_end + 1]

            roi_h, roi_w = roi_x.shape[-2:]
            bin_h = roi_h / pool_h
            bin_w = roi_w / pool_w

            for i in range(0, pool_h):
                for j in range(0, pool_w):
                    bin_x = roi_x[:, get_slice(i, bin_h), get_slice(j, bin_w)]
                    if bin_x.numel() > 0:
                        y[roi_idx, :, i, j] = bin_x.reshape(n_channels, -1).max(dim=1)[0]
        return y
class PSRoIPoolTester(RoIOpTester, unittest.TestCase):
def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
return ops.PSRoIPool((pool_h, pool_w), 1)(x, rois)
def get_script_fn(self, rois, pool_size):
@torch.jit.script
def script_func(input, rois):
return ops.roi_align(input, rois, 5, 0.5, 1)[0]
def script_fn(input, rois, pool_size):
# type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor
return ops.ps_roi_pool(input, rois, pool_size, 1.0)[0]
return lambda x: script_fn(x, rois, pool_size)
self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)), 'gradcheck failed for scripted roi_align')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_roi_align_gradient_cuda(self):
"""
Compute gradients for RoIAlign with multiple bounding boxes on the GPU
"""
device = torch.device('cuda')
pool_h, pool_w = (5, 5)
roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device)
x = self.x.to(device).clone()
rois = self.rois.to(device)
gt_grad = self.x_grad.to(device)
x.requires_grad = True
y = roi_align(x, rois)
s = y.sum()
s.backward()
self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for RoIAlign CUDA')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_roi_align_gradcheck_cuda(self):
dtype = torch.float64
device = torch.device('cuda')
m = ops.RoIAlign((5, 5), 0.5, 1).to(dtype=dtype, device=device)
x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True)
rois = self.rois.to(device=device, dtype=dtype)
        def func(input):
            return m(input, rois)

        self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for RoIAlign CUDA')
        self.assertTrue(gradcheck(func, (x.transpose(2, 3),)), 'gradcheck failed for RoIAlign CUDA')

        @torch.jit.script
        def script_func(input, rois):
            return ops.roi_align(input, rois, 5, 0.5, 1)[0]

        self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)),
                        'gradcheck failed for scripted roi_align on CUDA')

    def expected_fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1,
                    device=None, dtype=torch.float64):
        if device is None:
            device = torch.device("cpu")
        n_input_channels = x.size(1)
        self.assertEqual(n_input_channels % (pool_h * pool_w), 0, "input channels must be divisible by ph * pw")
        n_output_channels = int(n_input_channels / (pool_h * pool_w))
        y = torch.zeros(rois.size(0), n_output_channels, pool_h, pool_w, dtype=dtype, device=device)

        def get_slice(k, block):
            return slice(int(np.floor(k * block)), int(np.ceil((k + 1) * block)))

        for roi_idx, roi in enumerate(rois):
            batch_idx = int(roi[0])
            j_begin, i_begin, j_end, i_end = (int(round(x.item() * spatial_scale)) for x in roi[1:])
            roi_x = x[batch_idx, :, i_begin:i_end + 1, j_begin:j_end + 1]

            roi_height = max(i_end - i_begin, 1)
            roi_width = max(j_end - j_begin, 1)
            bin_h, bin_w = roi_height / float(pool_h), roi_width / float(pool_w)

            for i in range(0, pool_h):
                for j in range(0, pool_w):
                    bin_x = roi_x[:, get_slice(i, bin_h), get_slice(j, bin_w)]
                    if bin_x.numel() > 0:
                        area = bin_x.size(-2) * bin_x.size(-1)
                        for c_out in range(0, n_output_channels):
                            c_in = c_out * (pool_h * pool_w) + pool_w * i + j
                            t = torch.sum(bin_x[c_in, :, :])
                            y[roi_idx, c_out, i, j] = t / area
        return y
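# Illustrative note on the position-sensitive mapping above: each output
# channel gathers one input channel per bin position, via
# c_in = c_out * (pool_h * pool_w) + pool_w * i + j.
# E.g. with pool_h = pool_w = 2: (c_out=0, i=1, j=0) -> c_in = 0 * 4 + 2 * 1 + 0 = 2.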
def bilinear_interpolate(data, height, width, y, x):
    if y < -1.0 or y > height or x < -1.0 or x > width:
        return 0.

    y = min(max(0, y), height - 1)
    x = min(max(0, x), width - 1)

    y_low = int(y)
    y_high = min(y_low + 1, height - 1)

    x_low = int(x)
    x_high = min(x_low + 1, width - 1)

    wy_h = y - y_low
    wy_l = 1 - wy_h

    wx_h = x - x_low
    wx_l = 1 - wx_h

    val = 0
    for wx, x in zip((wx_l, wx_h), (x_low, x_high)):
        for wy, y in zip((wy_l, wy_h), (y_low, y_high)):
            val += wx * wy * data[y * width + x]
    return val
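# Quick illustrative check of the helper above: for a flattened 2x2 image
# data = [0., 1., 2., 3.], the centre point weights all four neighbours
# equally, so bilinear_interpolate(data, 2, 2, 0.5, 0.5) == 1.5.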
class RoIAlignTester(RoIOpTester, unittest.TestCase):
def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
return ops.RoIAlign((pool_h, pool_w), spatial_scale=spatial_scale,
sampling_ratio=sampling_ratio)(x, rois)
class PSRoIAlignTester(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.dtype = torch.float64
def get_script_fn(self, rois, pool_size):
@torch.jit.script
def script_fn(input, rois, pool_size):
# type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor
return ops.roi_align(input, rois, pool_size, 1.0)[0]
return lambda x: script_fn(x, rois, pool_size)
def slow_ps_roi_align(self, in_data, rois, pool_h, pool_w, device, spatial_scale=1,
sampling_ratio=-1, dtype=torch.float64):
def expected_fn(self, in_data, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1,
device=None, dtype=torch.float64):
if device is None:
device = torch.device("cpu")
num_input_channels = in_data.size(1)
self.assertEqual(num_input_channels % (pool_h * pool_w), 0, "input channels must be divisible by ph * pw")
num_output_channels = int(num_input_channels / (pool_h * pool_w))
out_data = torch.zeros(rois.size(0), num_output_channels, pool_h, pool_w, dtype=dtype, device=device)
n_channels = in_data.size(1)
out_data = torch.zeros(rois.size(0), n_channels, pool_h, pool_w, dtype=dtype, device=device)
for n in range(0, in_data.size(0)):
for r, roi in enumerate(rois):
if roi[0] != n:
continue
roi[1:] = (roi[1:] * spatial_scale) - 0.5
c_in = 0
roi_height = float(roi[4].item() - roi[2].item())
roi_width = float(roi[3].item() - roi[1].item())
bin_h, bin_w = roi_height / float(pool_h), roi_width / float(pool_w)
for c_out in range(0, num_output_channels):
for j in range(0, pool_h):
start_h = float(j) * bin_h + roi[2].item()
for i in range(0, pool_w):
start_w = float(i) * bin_w + roi[1].item()
roi_bin_grid_h = sampling_ratio if sampling_ratio > 0 else int(np.ceil(roi_height / pool_h))
roi_bin_grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(roi_width / pool_w))
val = 0.
for iy in range(0, roi_bin_grid_h):
y = start_h + (iy + 0.5) * bin_h / float(roi_bin_grid_h)
for ix in range(0, roi_bin_grid_w):
x = start_w + (ix + 0.5) * bin_w / float(roi_bin_grid_w)
batch_idx = int(roi[0])
j_begin, i_begin, j_end, i_end = (x.item() * spatial_scale for x in roi[1:])
roi_h = i_end - i_begin
roi_w = j_end - j_begin
bin_h = roi_h / pool_h
bin_w = roi_w / pool_w
for i in range(0, pool_h):
start_h = i_begin + i * bin_h
grid_h = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_h))
for j in range(0, pool_w):
start_w = j_begin + j * bin_w
grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_w))
for channel in range(0, n_channels):
val = 0
for iy in range(0, grid_h):
y = start_h + (iy + 0.5) * bin_h / grid_h
for ix in range(0, grid_w):
x = start_w + (ix + 0.5) * bin_w / grid_w
val += bilinear_interpolate(
in_data[n, c_in, :, :].flatten(),
in_data[batch_idx, channel, :, :].flatten(),
in_data.size(-2),
in_data.size(-1),
y, x
)
count = roi_bin_grid_h * roi_bin_grid_w
out_data[r, c_out, j, i] = val / count
c_in += 1
return out_data
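# Illustrative note on the sampling grid above: with sampling_ratio = -1 the
# grid adapts to the bin size, e.g. bin_h = 1.8 gives grid_h = ceil(1.8) = 2,
# placing samples at offsets (0.5) * 1.8 / 2 = 0.45 and (1.5) * 1.8 / 2 = 1.35
# from the bin start.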
def test_ps_roi_align_basic_cpu(self):
device = torch.device('cpu')
pool_size = 3
x = torch.rand(1, 2 * (pool_size ** 2), 7, 7, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 5, 5]], # format is (xyxy)
dtype=self.dtype, device=device)
pool_h, pool_w = (pool_size, pool_size)
ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2)
y = ps_roi_align(x, rois)
gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device,
spatial_scale=1, sampling_ratio=2,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU')
y = ps_roi_align(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device,
spatial_scale=1, sampling_ratio=-1,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU')
def test_ps_roi_align_cpu(self):
device = torch.device('cpu')
pool_size = 5
x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy)
[0, 0, 5, 4, 9],
[0, 5, 5, 9, 9],
[1, 0, 0, 9, 9]],
dtype=self.dtype, device=device)
pool_h, pool_w = (pool_size, pool_size)
ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2)
y = ps_roi_align(x, rois)
gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device,
spatial_scale=1, sampling_ratio=2,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU')
y = ps_roi_align(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w,
device, spatial_scale=1, sampling_ratio=2,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU')
def test_ps_roi_align_gradient_cpu(self):
device = torch.device('cpu')
pool_size = 3
layer = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1,
sampling_ratio=-1).to(dtype=self.dtype, device=device)
x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True)
rois = torch.tensor([
[0, 0, 0, 4, 4],
[0, 0, 3, 5, 5],
[0, 1, 0, 2, 4]],
dtype=self.dtype, device=device)
y = layer(x, rois)
s = y.sum()
s.backward()
gt_grad = torch.tensor([[[[8.125e-01, 6.875e-01, 0.0, 0.0, 0.0, ],
[2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ],
[1.0416666667e-01, 6.25e-02, 0.0, 0.0, 0.0, ],
[5.2083333333e-01, 3.125e-01, 0.0, 0.0, 0.0, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ]],
[[8.3266726847e-17, 1.125e00, 3.750e-01, 0.0, 0.0, ],
[2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ],
[0.0, 3.4722222222e-02, 9.7222222222e-02, 3.4722222222e-02, 0.0, ],
[0.0, 1.7361111111e-01, 4.8611111111e-01, 1.7361111111e-01, 0.0, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ]],
[[0.0, 5.000e-01, 4.375e-01, 5.000e-01, 6.25e-02, ],
[0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ],
[0.0, 0.0, 0.0, 6.25e-02, 1.0416666667e-01, ],
[0.0, 0.0, 0.0, 3.125e-01, 5.2083333333e-01, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ],
[5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ],
[3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ],
[3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ],
[5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ],
[0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ],
[0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ],
[0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ],
[0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ],
[0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ],
[2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ],
[7.2222222222e-01, 6.1111111111e-01, 0.0, 0.0, 0.0, ],
[7.1527777778e-01, 4.5138888889e-01, 0.0, 0.0, 0.0, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ],
[2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ],
[7.4014868308e-17, 1.000e00, 3.3333333333e-01, 0.0, 0.0, ],
[9.2518585385e-18, 3.3333333333e-01, 6.25e-01, 2.0833333333e-01, 0.0, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ],
[0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ],
[0.0, 4.4444444444e-01, 3.8888888889e-01, 4.4444444444e-01, 5.5555555556e-02, ],
[0.0, 5.5555555556e-02, 4.8611111111e-02, 4.3055555556e-01, 6.3194444444e-01, ]]]],
device=device, dtype=self.dtype)
self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIAlign on CPU')
def test_ps_roi_align_gradcheck_cpu(self):
device = torch.device('cpu')
pool_size = 5
x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
rois = torch.tensor([
[0, 0, 0, 9, 9],
[0, 0, 5, 5, 9],
[0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
m = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1,
sampling_ratio=2).to(dtype=self.dtype, device=device)
def func(input):
return m(input, rois)
self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for PSRoIAlign on CPU')
self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for PSRoIAlign on CPU')
@torch.jit.script
def script_func(input, rois):
return ops.ps_roi_align(input, rois, 5, 2.0, 1)[0]
self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)),
'gradcheck failed for scripted ps_roi_align on CPU')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_align_basic_cuda(self):
device = torch.device('cuda')
pool_size = 3
x = torch.rand(1, 2 * (pool_size ** 2), 7, 7, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 5, 5]], # format is (xyxy)
dtype=self.dtype, device=device)
pool_h, pool_w = (pool_size, pool_size)
ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2)
y = ps_roi_align(x, rois)
gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device,
spatial_scale=1, sampling_ratio=2,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect')
y = ps_roi_align(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device,
spatial_scale=1, sampling_ratio=-1,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_align_cuda(self):
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
pool_size = 5
x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy)
[0, 0, 5, 4, 9],
[0, 5, 5, 9, 9],
[1, 0, 0, 9, 9]],
dtype=self.dtype, device=device)
pool_h, pool_w = (pool_size, pool_size)
ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2)
y = ps_roi_align(x, rois)
gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device,
spatial_scale=1, sampling_ratio=2,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect')
y = ps_roi_align(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w,
device, spatial_scale=1, sampling_ratio=2,
dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_align_gradient_cuda(self):
device = torch.device('cuda')
pool_size = 3
layer = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1,
sampling_ratio=-1).to(dtype=self.dtype, device=device)
x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True)
rois = torch.tensor([
[0, 0, 0, 4, 4],
[0, 0, 3, 5, 5],
[0, 1, 0, 2, 4]],
dtype=self.dtype, device=device)
val /= grid_h * grid_w
y = layer(x, rois)
s = y.sum()
s.backward()
gt_grad = torch.tensor([[[[8.125e-01, 6.875e-01, 0.0, 0.0, 0.0, ],
[2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ],
[1.0416666667e-01, 6.25e-02, 0.0, 0.0, 0.0, ],
[5.2083333333e-01, 3.125e-01, 0.0, 0.0, 0.0, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ]],
[[8.3266726847e-17, 1.125e00, 3.750e-01, 0.0, 0.0, ],
[2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ],
[0.0, 3.4722222222e-02, 9.7222222222e-02, 3.4722222222e-02, 0.0, ],
[0.0, 1.7361111111e-01, 4.8611111111e-01, 1.7361111111e-01, 0.0, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ]],
[[0.0, 5.000e-01, 4.375e-01, 5.000e-01, 6.25e-02, ],
[0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ],
[0.0, 0.0, 0.0, 6.25e-02, 1.0416666667e-01, ],
[0.0, 0.0, 0.0, 3.125e-01, 5.2083333333e-01, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ],
[5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ],
[3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ],
[3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ],
[5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ],
[0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ],
[0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ],
[0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ],
[0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ],
[0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ],
[2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ],
[7.2222222222e-01, 6.1111111111e-01, 0.0, 0.0, 0.0, ],
[7.1527777778e-01, 4.5138888889e-01, 0.0, 0.0, 0.0, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ],
[2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ],
[7.4014868308e-17, 1.000e00, 3.3333333333e-01, 0.0, 0.0, ],
[9.2518585385e-18, 3.3333333333e-01, 6.25e-01, 2.0833333333e-01, 0.0, ]],
[[0.0, 0.0, 0.0, 0.0, 0.0, ],
[0.0, 0.0, 0.0, 0.0, 0.0, ],
[0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ],
[0.0, 4.4444444444e-01, 3.8888888889e-01, 4.4444444444e-01, 5.5555555556e-02, ],
[0.0, 5.5555555556e-02, 4.8611111111e-02, 4.3055555556e-01, 6.3194444444e-01, ]]]],
device=device, dtype=self.dtype)
self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIAlign')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_align_gradcheck_cuda(self):
device = torch.device('cuda')
pool_size = 5
x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
rois = torch.tensor([
[0, 0, 0, 9, 9],
[0, 0, 5, 5, 9],
[0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
m = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1,
sampling_ratio=2).to(dtype=self.dtype, device=device)
out_data[r, channel, i, j] = val
return out_data
def func(input):
return m(input, rois)
self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for PSRoIAlign CUDA')
self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for PSRoIAlign CUDA')
class PSRoIAlignTester(RoIOpTester, unittest.TestCase):
def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
return ops.PSRoIAlign((pool_h, pool_w), spatial_scale=spatial_scale,
sampling_ratio=sampling_ratio)(x, rois)
    def get_script_fn(self, rois, pool_size):
        @torch.jit.script
        def script_fn(input, rois, pool_size):
            # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor
            return ops.ps_roi_align(input, rois, pool_size, 1.0)[0]
        return lambda x: script_fn(x, rois, pool_size)

        @torch.jit.script
        def script_func(input, rois):
            return ops.ps_roi_align(input, rois, 5, 2.0, 1)[0]

        self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)),
                        'gradcheck failed for scripted ps_roi_align on CUDA')
class PSRoIPoolTester(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.dtype = torch.float64
    def expected_fn(self, in_data, rois, pool_h, pool_w, device, spatial_scale=1,
                    sampling_ratio=-1, dtype=torch.float64):
        if device is None:
            device = torch.device("cpu")
        n_input_channels = in_data.size(1)
        self.assertEqual(n_input_channels % (pool_h * pool_w), 0, "input channels must be divisible by ph * pw")
        n_output_channels = int(n_input_channels / (pool_h * pool_w))
        out_data = torch.zeros(rois.size(0), n_output_channels, pool_h, pool_w, dtype=dtype, device=device)

    def slow_ps_roi_pooling(self, x, rois, pool_h, pool_w, device, spatial_scale=1,
                            dtype=torch.float64):
        num_input_channels = x.size(1)
        self.assertEqual(num_input_channels % (pool_h * pool_w), 0, "input channels must be divisible by ph * pw")
        num_output_channels = int(num_input_channels / (pool_h * pool_w))
        y = torch.zeros(rois.size(0), num_output_channels, pool_h, pool_w, dtype=dtype, device=device)
rois = torch.round(rois * spatial_scale).int()
for n in range(0, x.size(0)):
for r, roi in enumerate(rois):
if roi[0] != n:
continue
c_in = 0
for c_out in range(0, num_output_channels):
roi_height = max(roi[4].item() - roi[2].item(), 1)
roi_width = max(roi[3].item() - roi[1].item(), 1)
bin_h, bin_w = roi_height / float(pool_h), roi_width / float(pool_w)
for j in range(0, pool_h):
start_h = int(np.floor(j * bin_h)) + roi[2].item()
end_h = int(np.ceil((j + 1) * bin_w)) + roi[2].item()
# range-check
start_h = min(max(start_h, 0), x.size(2))
end_h = min(max(end_h, 0), x.size(2))
for i in range(0, pool_w):
start_w = int(np.floor(i * bin_w)) + roi[1].item()
end_w = int(np.ceil((i + 1) * bin_w)) + roi[1].item()
# range-check
start_w = min(max(start_w, 0), x.size(3))
end_w = min(max(end_w, 0), x.size(3))
is_empty = (end_h <= start_h) or (end_w <= start_w)
area = (end_h - start_h) * (end_w - start_w)
if not is_empty:
t = torch.sum(x[n, c_in, slice(start_h, end_h), slice(start_w, end_w)])
y[r, c_out, j, i] = t / area
c_in += 1
return y
def test_ps_roi_pool_basic_cpu(self):
device = torch.device('cpu')
pool_size = 3
x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 4, 4]], # format is (xyxy)
dtype=self.dtype, device=device)
pool_h, pool_w = (pool_size, pool_size)
ps_roi_pool = ops.PSRoIPool((pool_h, pool_w), 1)
y = ps_roi_pool(x, rois)
gt_y = self.slow_ps_roi_pooling(x, rois, pool_h, pool_w, device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIPool layer incorrect on CPU')
y = ps_roi_pool(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_ps_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIPool layer incorrect on CPU')
def test_ps_roi_pool_cpu(self):
device = torch.device('cpu')
pool_size = 5
x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy)
[0, 0, 5, 4, 9],
[0, 5, 5, 9, 9],
[1, 0, 0, 9, 9]],
dtype=self.dtype, device=device)
pool_h, pool_w = (pool_size, pool_size)
ps_roi_pool = ops.PSRoIPool((pool_h, pool_w), 1)
y = ps_roi_pool(x, rois)
gt_y = self.slow_ps_roi_pooling(x, rois, pool_h, pool_w, device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIPool layer incorrect on CPU')
y = ps_roi_pool(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_ps_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y), 'PSRoIPool layer incorrect on CPU')
def test_ps_roi_pool_gradient_cpu(self):
device = torch.device('cpu')
pool_size = 3
layer = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device)
x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True)
rois = torch.tensor([
[0, 0, 0, 4, 4],
[0, 0, 3, 5, 5],
[0, 1, 0, 2, 4]],
dtype=self.dtype, device=device)
y = layer(x, rois)
s = y.sum()
s.backward()
gt_grad = torch.tensor([[[[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.5000, 0.5000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.2500, 0.2500, 0.0000, 0.0000, 0.0000],
[0.2500, 0.2500, 0.0000, 0.0000, 0.0000]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000],
[0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.0000, 0.0000, 0.2500, 0.2500],
[0.0000, 0.0000, 0.0000, 0.2500, 0.2500]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.5000, 0.5000, 0.0000, 0.0000, 0.0000]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.0000, 0.0000, 0.5000, 0.5000]]]],
device=device, dtype=self.dtype)
self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIPool on CPU')
def test_ps_roi_pool_gradcheck_cpu(self):
device = torch.device('cpu')
pool_size = 5
x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
rois = torch.tensor([
[0, 0, 0, 9, 9],
[0, 0, 5, 5, 9],
[0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
m = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device)
def func(input):
return m(input, rois)
self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for PSRoIPool on CPU')
self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for PSRoIPool on CPU')
@torch.jit.script
def script_func(input, rois):
return ops.ps_roi_pool(input, rois, 5, 1.0)[0]
self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)),
'gradcheck failed for scripted ps_roi_pool on CPU')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_pool_basic_cuda(self):
device = torch.device('cuda')
pool_size = 3
x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 4, 4]], # format is (xyxy)
dtype=self.dtype, device=device)
pool_h, pool_w = (pool_size, pool_size)
ps_roi_pool = ops.PSRoIPool((pool_h, pool_w), 1)
y = ps_roi_pool(x, rois)
gt_y = self.slow_ps_roi_pooling(x, rois, pool_h, pool_w, device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIPool layer incorrect')
y = ps_roi_pool(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_ps_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIPool layer incorrect')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_pool_cuda(self):
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
pool_size = 5
x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device)
rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy)
[0, 0, 5, 4, 9],
[0, 5, 5, 9, 9],
[1, 0, 0, 9, 9]],
dtype=self.dtype, device=device)
pool_h, pool_w = (pool_size, pool_size)
ps_roi_pool = ops.PSRoIPool((pool_h, pool_w), 1)
y = ps_roi_pool(x, rois)
gt_y = self.slow_ps_roi_pooling(x, rois, pool_h, pool_w, device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIPool layer incorrect')
y = ps_roi_pool(x.permute(0, 1, 3, 2), rois)
gt_y = self.slow_ps_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIPool layer incorrect')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_pool_gradient_cuda(self):
device = torch.device('cuda')
pool_size = 3
layer = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device)
x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True)
rois = torch.tensor([
[0, 0, 0, 4, 4],
[0, 0, 3, 5, 5],
[0, 1, 0, 2, 4]],
dtype=self.dtype, device=device)
y = layer(x, rois)
s = y.sum()
s.backward()
gt_grad = torch.tensor([[[[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.5000, 0.5000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.2500, 0.2500, 0.0000, 0.0000, 0.0000],
[0.2500, 0.2500, 0.0000, 0.0000, 0.0000]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000],
[0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.0000, 0.0000, 0.2500, 0.2500],
[0.0000, 0.0000, 0.0000, 0.2500, 0.2500]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.2500, 0.7500, 0.0000, 0.0000, 0.0000],
[0.5000, 0.5000, 0.0000, 0.0000, 0.0000]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 0.7500, 0.2500, 0.0000, 0.0000],
[0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000]],
[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.5000, 0.2500, 0.2500, 0.0000],
[0.0000, 0.0000, 0.0000, 0.5000, 0.5000]]]],
device=device, dtype=self.dtype)
self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIPool')
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_ps_roi_pool_gradcheck_cuda(self):
device = torch.device('cuda')
pool_size = 5
x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True)
rois = torch.tensor([
[0, 0, 0, 9, 9],
[0, 0, 5, 5, 9],
[0, 5, 5, 9, 9]], dtype=self.dtype, device=device)
m = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device)
def func(input):
return m(input, rois)
self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for PSRoIPool CUDA')
self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for PSRoIPool CUDA')
        @torch.jit.script
        def script_func(input, rois):
            return ops.ps_roi_pool(input, rois, 5, 1.0)[0]

        self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)),
                        'gradcheck failed for scripted ps_roi_pool on CUDA')

            batch_idx = int(roi[0])
            j_begin, i_begin, j_end, i_end = (x.item() * spatial_scale - 0.5 for x in roi[1:])
            roi_h = i_end - i_begin
            roi_w = j_end - j_begin
            bin_h = roi_h / pool_h
            bin_w = roi_w / pool_w
            for i in range(0, pool_h):
                start_h = i_begin + i * bin_h
                grid_h = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_h))
                for j in range(0, pool_w):
                    start_w = j_begin + j * bin_w
                    grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_w))
                    for c_out in range(0, n_output_channels):
                        c_in = c_out * (pool_h * pool_w) + pool_w * i + j
                        val = 0
                        for iy in range(0, grid_h):
                            y = start_h + (iy + 0.5) * bin_h / grid_h
                            for ix in range(0, grid_w):
                                x = start_w + (ix + 0.5) * bin_w / grid_w
                                val += bilinear_interpolate(
                                    in_data[batch_idx, c_in, :, :].flatten(),
                                    in_data.size(-2),
                                    in_data.size(-1),
                                    y, x
                                )
                        val /= grid_h * grid_w
                        out_data[r, c_out, i, j] = val
        return out_data
class NMSTester(unittest.TestCase):
......