Commit 3d3b2fdc authored by Nikhila Ravi's avatar Nikhila Ravi
Browse files

Re-sync with internal repository

parent 2480723a
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from typing import Optional
import torch
from pytorch3d import _C
from pytorch3d.renderer.mesh.rasterize_meshes import pix_to_ndc
# TODO(jcjohns): Support non-square images
def rasterize_points(
pointclouds,
image_size: int = 256,
radius: float = 0.01,
points_per_pixel: int = 8,
bin_size: Optional[int] = None,
max_points_per_bin: Optional[int] = None,
):
"""
Pointcloud rasterization
Args:
pointclouds: A Pointclouds object representing a batch of point clouds to be
rasterized. This is a batch of N pointclouds, where each point cloud
can have a different number of points; the coordinates of each point
are (x, y, z). The coordinates are expected to
be in normalized device coordinates (NDC): [-1, 1]^3 with the camera at
(0, 0, 0); the x-axis goes from left-to-right, the y-axis goes from
top-to-bottom, and the z-axis goes from back-to-front.
image_size: Integer giving the resolution of the rasterized image
radius (Optional): Float giving the radius (in NDC units) of the disk to
be rasterized for each point.
points_per_pixel (Optional): We will keep track of this many points per
pixel, returning the nearest points_per_pixel points along the z-axis
bin_size: Size of bins to use for coarse-to-fine rasterization. Setting
bin_size=0 uses naive rasterization; setting bin_size=None attempts to
set it heuristically based on the shape of the input. This should not
affect the output, but can affect the speed of the forward pass.
points_per_bin: Only applicable when using coarse-to-fine rasterization
(bin_size > 0); this is the maxiumum number of points allowed within each
bin. If more than this many points actually fall into a bin, an error
will be raised. This should not affect the output values, but can affect
the memory usage in the forward pass.
Returns:
3-element tuple containing
- **idx**: int32 Tensor of shape (N, image_size, image_size, points_per_pixel)
giving the indices of the nearest points at each pixel, in ascending
z-order. Concretely `idx[n, y, x, k] = p` means that `points[p]` is the kth
closest point (along the z-direction) to pixel (y, x) - note that points
represents the packed points of shape (P, 3).
Pixels that are hit by fewer than points_per_pixel are padded with -1.
- **zbuf**: Tensor of shape (N, image_size, image_size, points_per_pixel)
giving the z-coordinates of the nearest points at each pixel, sorted in
z-order. Concretely, if `idx[n, y, x, k] = p` then
`zbuf[n, y, x, k] = points[n, p, 2]`. Pixels hit by fewer than
points_per_pixel are padded with -1
- **dists2**: Tensor of shape (N, image_size, image_size, points_per_pixel)
giving the squared Euclidean distance (in NDC units) in the x/y plane
for each point closest to the pixel. Concretely if `idx[n, y, x, k] = p`
then `dists[n, y, x, k]` is the squared distance between the pixel (y, x)
and the point `(points[n, p, 0], points[n, p, 1])`. Pixels hit with fewer
than points_per_pixel are padded with -1.
"""
points_packed = pointclouds.points_packed()
cloud_to_packed_first_idx = pointclouds.cloud_to_packed_first_idx()
num_points_per_cloud = pointclouds.num_points_per_cloud()
if bin_size is None:
if not points_packed.is_cuda:
# Binned CPU rasterization not fully implemented
bin_size = 0
else:
# TODO: These heuristics are not well-thought out!
if image_size <= 64:
bin_size = 8
elif image_size <= 256:
bin_size = 16
elif image_size <= 512:
bin_size = 32
elif image_size <= 1024:
bin_size = 64
if max_points_per_bin is None:
max_points_per_bin = int(max(10000, points_packed.shape[0] / 5))
# Function.apply cannot take keyword args, so we handle defaults in this
# wrapper and call apply with positional args only
return _RasterizePoints.apply(
points_packed,
cloud_to_packed_first_idx,
num_points_per_cloud,
image_size,
radius,
points_per_pixel,
bin_size,
max_points_per_bin,
)
class _RasterizePoints(torch.autograd.Function):
    """Autograd wrapper around the C++/CUDA point rasterization kernels.

    Positional-only by construction: Function.apply does not accept keyword
    arguments, so callers (see rasterize_points) pass everything positionally.
    """

    @staticmethod
    def forward(
        ctx,
        points,  # (P, 3)
        cloud_to_packed_first_idx,
        num_points_per_cloud,
        image_size: int = 256,
        radius: float = 0.01,
        points_per_pixel: int = 8,
        bin_size: int = 0,
        max_points_per_bin: int = 0,
    ):
        # TODO: Add better error handling for when there are more than
        # max_points_per_bin in any bin.
        idx, zbuf, dists = _C.rasterize_points(
            points,
            cloud_to_packed_first_idx,
            num_points_per_cloud,
            image_size,
            radius,
            points_per_pixel,
            bin_size,
            max_points_per_bin,
        )
        # Only `points` and the pixel->point index map are needed for backward.
        ctx.save_for_backward(points, idx)
        return idx, zbuf, dists

    @staticmethod
    def backward(ctx, grad_idx, grad_zbuf, grad_dists):
        points, idx = ctx.saved_tensors
        grad_points = _C.rasterize_points_backward(
            points, idx, grad_zbuf, grad_dists
        )
        # One gradient slot per forward input; only `points` is differentiable,
        # the remaining seven inputs (indices / config ints / floats) get None.
        return (grad_points,) + (None,) * 7
def rasterize_points_python(
    pointclouds,
    image_size: int = 256,
    radius: float = 0.01,
    points_per_pixel: int = 8,
):
    """
    Naive pure PyTorch implementation of pointcloud rasterization.

    Reference implementation used to validate the C++/CUDA kernels: for every
    pixel of every cloud it scans all points, keeps those whose disk of the
    given radius covers the pixel, and records the points_per_pixel nearest
    (smallest z) hits.

    Inputs / Outputs: Same as above
    """
    N = len(pointclouds)
    S, K = image_size, points_per_pixel
    device = pointclouds.device
    points_packed = pointclouds.points_packed()
    cloud_to_packed_first_idx = pointclouds.cloud_to_packed_first_idx()
    num_points_per_cloud = pointclouds.num_points_per_cloud()
    # Initialize output tensors; -1 marks "no point hit this pixel slot".
    point_idxs = torch.full(
        (N, S, S, K), fill_value=-1, dtype=torch.int32, device=device
    )
    zbuf = torch.full(
        (N, S, S, K), fill_value=-1, dtype=torch.float32, device=device
    )
    pix_dists = torch.full(
        (N, S, S, K), fill_value=-1, dtype=torch.float32, device=device
    )
    # NDC is from [-1, 1]. Get pixel size using specified image size.
    radius2 = radius * radius
    # Iterate through the batch of point clouds.
    for n in range(N):
        # This cloud's slice of the packed (P, 3) points tensor.
        point_start_idx = cloud_to_packed_first_idx[n]
        point_stop_idx = point_start_idx + num_points_per_cloud[n]
        # Iterate through the horizontal lines of the image from top to bottom.
        for yi in range(S):
            # Y coordinate of one end of the image. Reverse the ordering
            # of yi so that +Y is pointing up in the image.
            yfix = S - 1 - yi
            yf = pix_to_ndc(yfix, S)
            # Iterate through pixels on this horizontal line, left to right.
            for xi in range(S):
                # X coordinate of one end of the image. Reverse the ordering
                # of xi so that +X is pointing to the left in the image.
                xfix = S - 1 - xi
                xf = pix_to_ndc(xfix, S)
                top_k_points = []
                # Check whether each point in the batch affects this pixel.
                for p in range(point_start_idx, point_stop_idx):
                    px, py, pz = points_packed[p, :]
                    # Points behind the camera (negative z) are skipped.
                    if pz < 0:
                        continue
                    dx = px - xf
                    dy = py - yf
                    dist2 = dx * dx + dy * dy
                    if dist2 < radius2:
                        top_k_points.append((pz, p, dist2))
                # Sorting (pz, p, dist2) tuples orders hits by ascending z,
                # breaking ties by packed point index.
                top_k_points.sort()
                if len(top_k_points) > K:
                    top_k_points = top_k_points[:K]
                for k, (pz, p, dist2) in enumerate(top_k_points):
                    zbuf[n, yi, xi, k] = pz
                    point_idxs[n, yi, xi, k] = p
                    pix_dists[n, yi, xi, k] = dist2
    return point_idxs, zbuf, pix_dists
This diff is collapsed.
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
from itertools import product
from fvcore.common.benchmark import benchmark
from test_pointclouds import TestPointclouds
def bm_compute_packed_padded_pointclouds() -> None:
    """Benchmark packed and padded computation over a grid of cloud sizes."""
    kwargs_list = [
        {"num_clouds": n, "max_p": p, "features": f}
        for n, p, f in product([32, 128], [100, 10000], [1, 10, 300])
    ]
    for fn, label in (
        (TestPointclouds.compute_packed_with_init, "COMPUTE_PACKED"),
        (TestPointclouds.compute_padded_with_init, "COMPUTE_PADDED"),
    ):
        benchmark(fn, label, kwargs_list, warmup_iters=1)
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
import torch
from fvcore.common.benchmark import benchmark
from pytorch3d.renderer.points.rasterize_points import (
rasterize_points,
rasterize_points_python,
)
from pytorch3d.structures.pointclouds import Pointclouds
def _bm_python_with_init(N, P, img_size=32, radius=0.1, pts_per_pxl=3):
    """Return a thunk running the pure-Python rasterizer on N random clouds."""
    torch.manual_seed(231)
    clouds = Pointclouds(points=torch.randn(N, P, 3))
    return lambda: rasterize_points_python(clouds, img_size, radius, pts_per_pxl)
def _bm_cpu_with_init(N, P, img_size=32, radius=0.1, pts_per_pxl=3):
    """Return a thunk running the C++ CPU rasterizer on N random clouds."""
    torch.manual_seed(231)
    clouds = Pointclouds(points=torch.randn(N, P, 3))
    return lambda: rasterize_points(clouds, img_size, radius, pts_per_pxl)
def _bm_cuda_with_init(N, P, img_size=32, radius=0.1, pts_per_pxl=3):
    """Return a thunk running the CUDA rasterizer on N random clouds."""
    torch.manual_seed(231)
    pts = torch.randn(N, P, 3, device=torch.device("cuda"))
    clouds = Pointclouds(points=pts)
    return lambda: rasterize_points(clouds, img_size, radius, pts_per_pxl)
def bm_python_vs_cpu() -> None:
    """Benchmark the Python, CPU and CUDA rasterization paths."""
    # Small cases only: the pure-Python path is far too slow for large inputs.
    small_cases = [
        {"N": 1, "P": 32, "img_size": 32, "radius": 0.1, "pts_per_pxl": 3},
        {"N": 2, "P": 32, "img_size": 32, "radius": 0.1, "pts_per_pxl": 3},
    ]
    benchmark(
        _bm_python_with_init, "RASTERIZE_PYTHON", small_cases, warmup_iters=1
    )
    benchmark(_bm_cpu_with_init, "RASTERIZE_CPU", small_cases, warmup_iters=1)
    # Larger cases for the compiled paths.
    big_cases = [
        {"N": 2, "P": 32, "img_size": 32, "radius": 0.1, "pts_per_pxl": 3},
        {"N": 4, "P": 1024, "img_size": 128, "radius": 0.05, "pts_per_pxl": 5},
    ]
    benchmark(_bm_cpu_with_init, "RASTERIZE_CPU", big_cases, warmup_iters=1)
    benchmark(_bm_cuda_with_init, "RASTERIZE_CUDA", big_cases, warmup_iters=1)
This diff is collapsed.
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
import numpy as np
import unittest
import torch
from pytorch3d import _C
from pytorch3d.renderer.points.rasterize_points import (
rasterize_points,
rasterize_points_python,
)
from pytorch3d.structures.pointclouds import Pointclouds
from common_testing import TestCaseMixin
class TestRasterizePoints(TestCaseMixin, unittest.TestCase):
    """Tests comparing the pure-Python, C++ CPU and CUDA point rasterizers.

    Convention used by the helpers below: bin_size=-1 selects the pure-Python
    path (which takes no bin_size argument), bin_size=0 the naive (non-binned)
    C++/CUDA path, and bin_size>0 the coarse-to-fine binned path.
    """

    def test_python_simple_cpu(self):
        self._simple_test_case(
            rasterize_points_python, torch.device("cpu"), bin_size=-1
        )

    def test_naive_simple_cpu(self):
        device = torch.device("cpu")
        self._simple_test_case(rasterize_points, device)

    def test_naive_simple_cuda(self):
        device = torch.device("cuda")
        self._simple_test_case(rasterize_points, device, bin_size=0)

    def test_python_behind_camera(self):
        self._test_behind_camera(
            rasterize_points_python, torch.device("cpu"), bin_size=-1
        )

    def test_cpu_behind_camera(self):
        self._test_behind_camera(rasterize_points, torch.device("cpu"))

    def test_cuda_behind_camera(self):
        self._test_behind_camera(
            rasterize_points, torch.device("cuda"), bin_size=0
        )

    def test_cpp_vs_naive_vs_binned(self):
        # Make sure that the backward pass runs for all pathways, and that
        # all three pathways agree on outputs and gradients.
        N = 2
        P = 1000
        image_size = 32
        radius = 0.1
        points_per_pixel = 3
        points1 = torch.randn(P, 3, requires_grad=True)
        points2 = torch.randn(int(P / 2), 3, requires_grad=True)
        pointclouds = Pointclouds(points=[points1, points2])
        grad_zbuf = torch.randn(N, image_size, image_size, points_per_pixel)
        grad_dists = torch.randn(N, image_size, image_size, points_per_pixel)
        # Option I: CPU, naive
        idx1, zbuf1, dists1 = rasterize_points(
            pointclouds, image_size, radius, points_per_pixel, bin_size=0
        )
        loss = (zbuf1 * grad_zbuf).sum() + (dists1 * grad_dists).sum()
        loss.backward()
        grad1 = points1.grad.data.clone()
        # Option II: CUDA, naive
        points1_cuda = points1.cuda().detach().clone().requires_grad_(True)
        points2_cuda = points2.cuda().detach().clone().requires_grad_(True)
        pointclouds = Pointclouds(points=[points1_cuda, points2_cuda])
        grad_zbuf = grad_zbuf.cuda()
        grad_dists = grad_dists.cuda()
        idx2, zbuf2, dists2 = rasterize_points(
            pointclouds, image_size, radius, points_per_pixel, bin_size=0
        )
        loss = (zbuf2 * grad_zbuf).sum() + (dists2 * grad_dists).sum()
        loss.backward()
        idx2 = idx2.data.cpu().clone()
        zbuf2 = zbuf2.data.cpu().clone()
        dists2 = dists2.data.cpu().clone()
        grad2 = points1_cuda.grad.data.cpu().clone()
        # Option III: CUDA, binned
        points1_cuda = points1.cuda().detach().clone().requires_grad_(True)
        points2_cuda = points2.cuda().detach().clone().requires_grad_(True)
        pointclouds = Pointclouds(points=[points1_cuda, points2_cuda])
        idx3, zbuf3, dists3 = rasterize_points(
            pointclouds, image_size, radius, points_per_pixel, bin_size=32
        )
        loss = (zbuf3 * grad_zbuf).sum() + (dists3 * grad_dists).sum()
        points1.grad.data.zero_()
        loss.backward()
        idx3 = idx3.data.cpu().clone()
        zbuf3 = zbuf3.data.cpu().clone()
        dists3 = dists3.data.cpu().clone()
        grad3 = points1_cuda.grad.data.cpu().clone()
        # Make sure everything was the same: indices and zbuf should match
        # exactly, distances and gradients up to float tolerance.
        idx12_same = (idx1 == idx2).all().item()
        idx13_same = (idx1 == idx3).all().item()
        zbuf12_same = (zbuf1 == zbuf2).all().item()
        zbuf13_same = (zbuf1 == zbuf3).all().item()
        dists12_diff = (dists1 - dists2).abs().max().item()
        dists13_diff = (dists1 - dists3).abs().max().item()
        self.assertTrue(idx12_same)
        self.assertTrue(idx13_same)
        self.assertTrue(zbuf12_same)
        self.assertTrue(zbuf13_same)
        self.assertTrue(dists12_diff < 1e-6)
        self.assertTrue(dists13_diff < 1e-6)
        diff12 = (grad1 - grad2).abs().max().item()
        diff13 = (grad1 - grad3).abs().max().item()
        diff23 = (grad2 - grad3).abs().max().item()
        self.assertTrue(diff12 < 5e-6)
        self.assertTrue(diff13 < 5e-6)
        self.assertTrue(diff23 < 5e-6)

    def test_python_vs_cpu_naive(self):
        torch.manual_seed(231)
        image_size = 32
        radius = 0.1
        points_per_pixel = 3
        # Test a batch of homogeneous point clouds.
        N = 2
        P = 17
        points = torch.randn(N, P, 3, requires_grad=True)
        pointclouds = Pointclouds(points=points)
        args = (pointclouds, image_size, radius, points_per_pixel)
        self._compare_impls(
            rasterize_points_python,
            rasterize_points,
            args,
            args,
            points,
            points,
            compare_grads=True,
        )
        # Test a batch of heterogeneous point clouds.
        P2 = 10
        points1 = torch.randn(P, 3, requires_grad=True)
        points2 = torch.randn(P2, 3)
        pointclouds = Pointclouds(points=[points1, points2])
        args = (pointclouds, image_size, radius, points_per_pixel)
        self._compare_impls(
            rasterize_points_python,
            rasterize_points,
            args,
            args,
            points1,  # check gradients for first element in batch
            points1,
            compare_grads=True,
        )

    def test_cpu_vs_cuda_naive(self):
        torch.manual_seed(231)
        image_size = 64
        radius = 0.1
        points_per_pixel = 5
        # Test homogeneous point cloud batch.
        N = 2
        P = 1000
        bin_size = 0
        points_cpu = torch.rand(N, P, 3, requires_grad=True)
        points_cuda = points_cpu.cuda().detach().requires_grad_(True)
        pointclouds_cpu = Pointclouds(points=points_cpu)
        pointclouds_cuda = Pointclouds(points=points_cuda)
        args_cpu = (
            pointclouds_cpu,
            image_size,
            radius,
            points_per_pixel,
            bin_size,
        )
        args_cuda = (
            pointclouds_cuda,
            image_size,
            radius,
            points_per_pixel,
            bin_size,
        )
        self._compare_impls(
            rasterize_points,
            rasterize_points,
            args_cpu,
            args_cuda,
            points_cpu,
            points_cuda,
            compare_grads=True,
        )

    def _compare_impls(
        self,
        fn1,
        fn2,
        args1,
        args2,
        grad_var1=None,
        grad_var2=None,
        compare_grads=False,
    ):
        # Run both rasterizer implementations on their respective args and
        # assert that outputs (and optionally gradients w.r.t. grad_var*)
        # agree. Results are moved to CPU before comparison so mixed
        # CPU/CUDA pairs can be checked.
        idx1, zbuf1, dist1 = fn1(*args1)
        torch.manual_seed(231)
        grad_zbuf = torch.randn_like(zbuf1)
        grad_dist = torch.randn_like(dist1)
        loss = (zbuf1 * grad_zbuf).sum() + (dist1 * grad_dist).sum()
        if compare_grads:
            loss.backward()
            grad_points1 = grad_var1.grad.data.clone().cpu()
        idx2, zbuf2, dist2 = fn2(*args2)
        grad_zbuf = grad_zbuf.to(zbuf2)
        grad_dist = grad_dist.to(dist2)
        loss = (zbuf2 * grad_zbuf).sum() + (dist2 * grad_dist).sum()
        if compare_grads:
            # clear points1.grad in case args1 and args2 reused the same tensor
            grad_var1.grad.data.zero_()
            loss.backward()
            grad_points2 = grad_var2.grad.data.clone().cpu()
        self.assertEqual((idx1.cpu() == idx2.cpu()).all().item(), 1)
        self.assertEqual((zbuf1.cpu() == zbuf2.cpu()).all().item(), 1)
        self.assertClose(dist1.cpu(), dist2.cpu())
        if compare_grads:
            self.assertTrue(
                torch.allclose(grad_points1, grad_points2, atol=2e-6)
            )

    def _test_behind_camera(self, rasterize_points_fn, device, bin_size=None):
        # Test case where all points are behind the camera -- nothing should
        # get rasterized
        N = 2
        P = 32
        xy = torch.randn(N, P, 2)
        z = torch.randn(N, P, 1).abs().mul(-1)  # Make them all negative
        points = torch.cat([xy, z], dim=2).to(device)
        image_size = 16
        points_per_pixel = 3
        radius = 0.2
        # All outputs should keep their -1 padding value everywhere.
        idx_expected = torch.full(
            (N, 16, 16, 3), fill_value=-1, dtype=torch.int32, device=device
        )
        zbuf_expected = torch.full(
            (N, 16, 16, 3), fill_value=-1, dtype=torch.float32, device=device
        )
        dists_expected = zbuf_expected.clone()
        pointclouds = Pointclouds(points=points)
        if bin_size == -1:
            # simple python case with no binning
            idx, zbuf, dists = rasterize_points_fn(
                pointclouds, image_size, radius, points_per_pixel
            )
        else:
            idx, zbuf, dists = rasterize_points_fn(
                pointclouds, image_size, radius, points_per_pixel, bin_size
            )
        idx_same = (idx == idx_expected).all().item() == 1
        zbuf_same = (zbuf == zbuf_expected).all().item() == 1
        self.assertTrue(idx_same)
        self.assertTrue(zbuf_same)
        self.assertTrue(torch.allclose(dists, dists_expected))

    def _simple_test_case(self, rasterize_points_fn, device, bin_size=0):
        # Create two pointclouds with different numbers of points.
        # fmt: off
        points1 = torch.tensor(
            [
                [0.0, 0.0,  0.0],  # noqa: E241
                [0.4, 0.0,  0.1],  # noqa: E241
                [0.0, 0.4,  0.2],  # noqa: E241
                [0.0, 0.0, -0.1],  # noqa: E241 Points with negative z should be skipped
            ],
            device=device,
        )
        points2 = torch.tensor(
            [
                [0.0, 0.0,  0.0],  # noqa: E241
                [0.4, 0.0,  0.1],  # noqa: E241
                [0.0, 0.4,  0.2],  # noqa: E241
                [0.0, 0.0, -0.1],  # noqa: E241 Points with negative z should be skipped
                [0.0, 0.0, -0.7],  # noqa: E241 Points with negative z should be skipped
            ],
            device=device,
        )
        # fmt: on
        pointclouds = Pointclouds(points=[points1, points2])
        image_size = 5
        points_per_pixel = 2
        radius = 0.5
        # The expected output values. Note that in the outputs, the world space
        # +Y is up, and the world space +X is left.
        idx1_expected = torch.full(
            (1, 5, 5, 2), fill_value=-1, dtype=torch.int32, device=device
        )
        # fmt: off
        idx1_expected[0, :, :, 0] = torch.tensor([
            [-1, -1,  2, -1, -1],  # noqa: E241
            [-1,  1,  0,  2, -1],  # noqa: E241
            [ 1,  0,  0,  0, -1],  # noqa: E241 E201
            [-1,  1,  0, -1, -1],  # noqa: E241
            [-1, -1, -1, -1, -1],  # noqa: E241
        ], device=device)
        idx1_expected[0, :, :, 1] = torch.tensor([
            [-1, -1, -1, -1, -1],  # noqa: E241
            [-1,  2,  2, -1, -1],  # noqa: E241
            [-1,  1,  1, -1, -1],  # noqa: E241
            [-1, -1, -1, -1, -1],  # noqa: E241
            [-1, -1, -1, -1, -1],  # noqa: E241
        ], device=device)
        # fmt: on
        zbuf1_expected = torch.full(
            (1, 5, 5, 2), fill_value=100, dtype=torch.float32, device=device
        )
        # fmt: off
        zbuf1_expected[0, :, :, 0] = torch.tensor([
            [-1.0, -1.0,  0.2, -1.0, -1.0],  # noqa: E241
            [-1.0,  0.1,  0.0,  0.2, -1.0],  # noqa: E241
            [ 0.1,  0.0,  0.0,  0.0, -1.0],  # noqa: E241 E201
            [-1.0,  0.1,  0.0, -1.0, -1.0],  # noqa: E241
            [-1.0, -1.0, -1.0, -1.0, -1.0]   # noqa: E241
        ], device=device)
        zbuf1_expected[0, :, :, 1] = torch.tensor([
            [-1.0, -1.0, -1.0, -1.0, -1.0],  # noqa: E241
            [-1.0,  0.2,  0.2, -1.0, -1.0],  # noqa: E241
            [-1.0,  0.1,  0.1, -1.0, -1.0],  # noqa: E241
            [-1.0, -1.0, -1.0, -1.0, -1.0],  # noqa: E241
            [-1.0, -1.0, -1.0, -1.0, -1.0],  # noqa: E241
        ], device=device)
        # fmt: on
        dists1_expected = torch.full(
            (1, 5, 5, 2), fill_value=0.0, dtype=torch.float32, device=device
        )
        # fmt: off
        dists1_expected[0, :, :, 0] = torch.tensor([
            [-1.00, -1.00,  0.16, -1.00, -1.00],  # noqa: E241
            [-1.00,  0.16,  0.16,  0.16, -1.00],  # noqa: E241
            [ 0.16,  0.16,  0.00,  0.16, -1.00],  # noqa: E241 E201
            [-1.00,  0.16,  0.16, -1.00, -1.00],  # noqa: E241
            [-1.00, -1.00, -1.00, -1.00, -1.00],  # noqa: E241
        ], device=device)
        dists1_expected[0, :, :, 1] = torch.tensor([
            [-1.00, -1.00, -1.00, -1.00, -1.00],  # noqa: E241
            [-1.00,  0.16,  0.00, -1.00, -1.00],  # noqa: E241
            [-1.00,  0.00,  0.16, -1.00, -1.00],  # noqa: E241
            [-1.00, -1.00, -1.00, -1.00, -1.00],  # noqa: E241
            [-1.00, -1.00, -1.00, -1.00, -1.00],  # noqa: E241
        ], device=device)
        # fmt: on
        if bin_size == -1:
            # simple python case with no binning
            idx, zbuf, dists = rasterize_points_fn(
                pointclouds, image_size, radius, points_per_pixel
            )
        else:
            idx, zbuf, dists = rasterize_points_fn(
                pointclouds, image_size, radius, points_per_pixel, bin_size
            )
        # check first point cloud
        idx_same = (idx[0, ...] == idx1_expected).all().item() == 1
        if idx_same == 0:
            # Print the mismatching layers to aid debugging on failure.
            print(idx[0, :, :, 0])
            print(idx[0, :, :, 1])
        zbuf_same = (zbuf[0, ...] == zbuf1_expected).all().item() == 1
        dist_same = torch.allclose(dists[0, ...], dists1_expected)
        self.assertTrue(idx_same)
        self.assertTrue(zbuf_same)
        self.assertTrue(dist_same)
        # Check second point cloud - the indices in idx refer to points in the
        # pointclouds.points_packed() tensor. In the second point cloud,
        # two points are behind the screen - the expected indices are the same
        # the first pointcloud but offset by the number of points in the
        # first pointcloud.
        num_points_per_cloud = pointclouds.num_points_per_cloud()
        idx1_expected[idx1_expected >= 0] += num_points_per_cloud[0]
        idx_same = (idx[1, ...] == idx1_expected).all().item() == 1
        zbuf_same = (zbuf[1, ...] == zbuf1_expected).all().item() == 1
        self.assertTrue(idx_same)
        self.assertTrue(zbuf_same)
        self.assertTrue(torch.allclose(dists[1, ...], dists1_expected))

    def test_coarse_cpu(self):
        return self._test_coarse_rasterize(torch.device("cpu"))

    def test_coarse_cuda(self):
        return self._test_coarse_rasterize(torch.device("cuda"))

    def test_compare_coarse_cpu_vs_cuda(self):
        torch.manual_seed(231)
        N = 3
        max_P = 1000
        image_size = 64
        radius = 0.1
        bin_size = 16
        max_points_per_bin = 500
        # create heterogeneous point clouds
        points = []
        for _ in range(N):
            p = np.random.choice(max_P)
            points.append(torch.randn(p, 3))
        pointclouds = Pointclouds(points=points)
        points_packed = pointclouds.points_packed()
        cloud_to_packed_first_idx = pointclouds.cloud_to_packed_first_idx()
        num_points_per_cloud = pointclouds.num_points_per_cloud()
        args = (
            points_packed,
            cloud_to_packed_first_idx,
            num_points_per_cloud,
            image_size,
            radius,
            bin_size,
            max_points_per_bin,
        )
        bp_cpu = _C._rasterize_points_coarse(*args)
        pointclouds_cuda = pointclouds.to("cuda:0")
        points_packed = pointclouds_cuda.points_packed()
        cloud_to_packed_first_idx = pointclouds_cuda.cloud_to_packed_first_idx()
        num_points_per_cloud = pointclouds_cuda.num_points_per_cloud()
        args = (
            points_packed,
            cloud_to_packed_first_idx,
            num_points_per_cloud,
            image_size,
            radius,
            bin_size,
            max_points_per_bin,
        )
        bp_cuda = _C._rasterize_points_coarse(*args)
        # Bin points might not be the same: CUDA version might write them in
        # any order. But if we sort the non-(-1) elements of the CUDA output
        # then they should be the same.
        for n in range(N):
            for by in range(bp_cpu.shape[1]):
                for bx in range(bp_cpu.shape[2]):
                    K = (bp_cpu[n, by, bx] != -1).sum().item()
                    idxs_cpu = bp_cpu[n, by, bx].tolist()
                    idxs_cuda = bp_cuda[n, by, bx].tolist()
                    idxs_cuda[:K] = sorted(idxs_cuda[:K])
                    self.assertEqual(idxs_cpu, idxs_cuda)

    def _test_coarse_rasterize(self, device):
        #
        # Note that +Y is up and +X is left in the diagram below.
        #
        #   (4)            |2
        #                  |
        #                  |
        #                  |
        #                  |1
        #                  |
        #  (1)             |
        #                  |   (2)
        # ____________(0)__(5)___________________
        # 2        1       |      -1          -2
        #                  |
        #     (3)          |
        #                  |
        #                  |-1
        #                  |
        #
        # Locations of the points are shown by their index in parentheses.
        # The screen bounding box is between [-1, 1] in both the x and y
        # directions.
        #
        # These points are interesting because:
        # (0) Falls into two bins;
        # (1) and (2) fall into one bin;
        # (3) is out-of-bounds, but its disk is in-bounds;
        # (4) is out-of-bounds, and its entire disk is also out-of-bounds
        # (5) has a negative z-value, so it should be skipped
        # fmt: off
        points = torch.tensor(
            [
                [ 0.5,  0.0,  0.0],  # noqa: E241, E201
                [ 0.5,  0.5,  0.1],  # noqa: E241, E201
                [-0.3,  0.4,  0.0],  # noqa: E241
                [ 1.1, -0.5,  0.2],  # noqa: E241, E201
                [ 2.0,  2.0,  0.3],  # noqa: E241, E201
                [ 0.0,  0.0, -0.1],  # noqa: E241, E201
            ],
            device=device
        )
        # fmt: on
        image_size = 16
        radius = 0.2
        bin_size = 8
        max_points_per_bin = 5
        bin_points_expected = -1 * torch.ones(
            1, 2, 2, 5, dtype=torch.int32, device=device
        )
        # Note that the order is only deterministic here for CUDA if all points
        # fit in one chunk. This will be the case for this small example, but
        # to properly exercise coordinated writes among multiple chunks we need
        # to use a bigger test case.
        bin_points_expected[0, 1, 0, :2] = torch.tensor([0, 3])
        bin_points_expected[0, 0, 1, 0] = torch.tensor([2])
        bin_points_expected[0, 0, 0, :2] = torch.tensor([0, 1])
        pointclouds = Pointclouds(points=[points])
        args = (
            pointclouds.points_packed(),
            pointclouds.cloud_to_packed_first_idx(),
            pointclouds.num_points_per_cloud(),
            image_size,
            radius,
            bin_size,
            max_points_per_bin,
        )
        bin_points = _C._rasterize_points_coarse(*args)
        bin_points_same = (bin_points == bin_points_expected).all()
        self.assertTrue(bin_points_same.item() == 1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment