Commit b19fe1de authored by Christoph Lassner's avatar Christoph Lassner Committed by Facebook GitHub Bot
Browse files

pulsar integration.

Summary:
This diff integrates the pulsar renderer source code into PyTorch3D as an alternative backend for the PyTorch3D point renderer. This diff is the first of a series of three diffs to complete that migration and focuses on the packaging and integration of the source code.

For more information about the pulsar backend, see the release notes and the paper (https://arxiv.org/abs/2004.07484). For information on how to use the backend, see the point cloud rendering notebook and the examples in the folder `docs/examples`.

Tasks addressed in the following diffs:
* Add the PyTorch3D interface,
* Add notebook examples and documentation (or adapt the existing ones to feature both interfaces).

Reviewed By: nikhilaravi

Differential Revision: D23947736

fbshipit-source-id: a5e77b53e6750334db22aefa89b4c079cda1b443
parent d5650323
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
"""
Minimal demo of the direct pulsar sphere renderer interface.

Ten spheres with random positions, colors and radii are rendered from a
fixed camera and the result is saved as `basic.png`.
"""
from os import path
import imageio
import torch
from pytorch3d.renderer.points.pulsar import Renderer

n_points = 10
width = 1_000
height = 1_000
device = torch.device("cuda")
renderer = Renderer(width, height, n_points).to(device)
# Random scene content: positions uniform in a 10x10x10 cube, shifted so
# the spheres sit in front of the camera (z >= 25, x/y centered on 0).
vert_pos = torch.rand(n_points, 3, dtype=torch.float32, device=device) * 10.0
vert_pos[:, 2] += 25.0
vert_pos[:, :2] -= 5.0
# Colors and radii uniform in [0, 1).
vert_col = torch.rand(n_points, 3, dtype=torch.float32, device=device)
vert_rad = torch.rand(n_points, dtype=torch.float32, device=device)
# Camera: position (x, y, z), axis-angle rotation (x, y, z), focal length
# and sensor size, both in world units.
cam_params = torch.tensor(
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 2.0],
    dtype=torch.float32,
    device=device,
)
# Render with blending parameter gamma (valid range [1., 1e-5]) and a
# maximum depth of 45 world units.
image = renderer(
    vert_pos,
    vert_col,
    vert_rad,
    cam_params,
    1.0e-1,
    45.0,
)
print("Writing image to `%s`." % (path.abspath("basic.png")))
imageio.imsave("basic.png", (image.cpu().detach() * 255.0).to(torch.uint8).numpy())
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
"""
This example demonstrates camera parameter optimization with the plain
pulsar interface. For this, a reference image has been pre-generated
(you can find it at `../../tests/pulsar/reference/examples_TestRenderer_test_cam.png`).
The same scene parameterization is loaded and the camera parameters
distorted. Gradient-based optimization is used to converge towards the
original camera parameters.
"""
from os import path
import cv2
import imageio
import numpy as np
import torch
from pytorch3d.renderer.points.pulsar import Renderer
from torch import nn, optim
n_points = 20  # Number of spheres in the scene.
width = 1_000  # Rendered image width in pixels.
height = 1_000  # Rendered image height in pixels.
device = torch.device("cuda")  # Model and tensors are placed on the GPU.
class SceneModel(nn.Module):
    """
    Scene description for camera-parameter optimization with pulsar.

    Sphere positions (vert_pos), colors (vert_col) and radii (vert_rad)
    are fixed, non-trainable parameters. The camera position (cam_pos),
    its rotation in the 6D representation (cam_rot) and the sensor
    parameters (cam_sensor) are trainable. `forward` renders the scene
    with the current camera estimate.
    """

    def __init__(self):
        super().__init__()
        self.gamma = 0.1
        # Fixed scene content, reproducible through the fixed seed.
        torch.manual_seed(1)
        positions = torch.rand(n_points, 3, dtype=torch.float32) * 10.0
        positions[:, 2] += 25.0
        positions[:, :2] -= 5.0
        self.vert_pos = nn.Parameter(positions, requires_grad=False)
        self.vert_col = nn.Parameter(
            torch.rand(n_points, 3, dtype=torch.float32), requires_grad=False
        )
        self.vert_rad = nn.Parameter(
            torch.rand(n_points, dtype=torch.float32), requires_grad=False
        )
        # Distorted camera start values; the optimization recovers the
        # original parameters from here.
        self.cam_pos = nn.Parameter(
            torch.tensor([0.1, 0.1, 0.0], dtype=torch.float32), requires_grad=True
        )
        # We're using the 6D rotation representation for better gradients.
        self.cam_rot = nn.Parameter(
            torch.tensor(
                [0.9995, 0.0300445, -0.0098482, -0.0299445, 0.9995, 0.0101482],
                dtype=torch.float32,
            ),
            requires_grad=True,
        )
        self.cam_sensor = nn.Parameter(
            torch.tensor([4.8, 1.8], dtype=torch.float32), requires_grad=True
        )
        self.renderer = Renderer(width, height, n_points)

    def forward(self):
        """Render the scene with the current camera parameters."""
        camera = torch.cat([self.cam_pos, self.cam_rot, self.cam_sensor])
        return self.renderer.forward(
            self.vert_pos,
            self.vert_col,
            self.vert_rad,
            camera,
            self.gamma,
            45.0,
        )
# Load the pre-rendered reference image and normalize it to [0, 1].
ref = (
    torch.from_numpy(
        imageio.imread(
            "../../tests/pulsar/reference/examples_TestRenderer_test_cam.png"
        )
    ).to(torch.float32)
    / 255.0
).to(device)
# Set up model.
model = SceneModel().to(device)
# Plain SGD with per-parameter-group learning rates; only the camera
# parameters are registered for optimization.
optimizer = optim.SGD(
    [
        {"params": [model.cam_pos], "lr": 1e-4},
        {"params": [model.cam_rot], "lr": 5e-6},
        {"params": [model.cam_sensor], "lr": 1e-4},
    ]
)
print("Writing video to `%s`." % (path.abspath("cam.gif")))
writer = imageio.get_writer("cam.gif", format="gif", fps=25)
# Optimization loop: render, visualize, step towards the reference.
for i in range(300):
    optimizer.zero_grad()
    result = model()
    # Visualize the current rendering (OpenCV expects BGR, hence ::-1).
    result_im = (result.cpu().detach().numpy() * 255).astype(np.uint8)
    cv2.imshow("opt", result_im[:, :, ::-1])
    writer.append_data(result_im)
    # 50/50 blend of rendering and reference with the step number overlaid.
    overlay_img = np.ascontiguousarray(
        ((result * 0.5 + ref * 0.5).cpu().detach().numpy() * 255).astype(np.uint8)[
            :, :, ::-1
        ]
    )
    overlay_img = cv2.putText(
        overlay_img,
        "Step %d" % (i),
        (10, 40),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 0, 0),
        2,
        cv2.LINE_AA,
        False,
    )
    cv2.imshow("overlay", overlay_img)
    cv2.waitKey(1)
    # Summed squared error against the reference image.
    loss = ((result - ref) ** 2).sum()
    print("loss {}: {}".format(i, loss.item()))
    loss.backward()
    optimizer.step()
writer.close()
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
"""
This example demonstrates multiview 3D reconstruction using the plain
pulsar interface. For this, reference images have been pre-generated
(you can find them at `../../tests/pulsar/reference/examples_TestRenderer_test_multiview_%d.png`).
The camera parameters are assumed given. The scene is initialized with
random spheres. Gradient-based optimization is used to optimize sphere
parameters and prune spheres to converge to a 3D representation.
"""
from os import path
import cv2
import imageio
import numpy as np
import torch
from pytorch3d.renderer.points.pulsar import Renderer
from torch import nn, optim
n_points = 400_000  # Number of candidate spheres for the reconstruction.
width = 1_000  # Rendered image width in pixels.
height = 1_000  # Rendered image height in pixels.
visualize_ids = [0, 1]  # NOTE(review): appears unused in this script — confirm.
device = torch.device("cuda")  # Model and tensors are placed on the GPU.
class SceneModel(nn.Module):
    """
    A simple scene model to demonstrate use of pulsar in PyTorch modules.
    The scene model is parameterized with sphere locations (vert_pos),
    channel content (vert_col), radiuses (vert_rad), camera position (cam_pos),
    camera rotation (cam_rot) and sensor focal length and width (cam_sensor).
    The forward method of the model renders this scene description. Any
    of these parameters could instead be passed as inputs to the forward
    method and come from a different model. Optionally, camera parameters can
    be provided to the forward method in which case the scene is rendered
    using those parameters.
    """

    def __init__(self):
        super(SceneModel, self).__init__()
        # Blending parameter for the soft sphere rendering.
        self.gamma = 1.0
        # Points: reproducible random initialization with a leading batch
        # dimension of 1 (expanded to n_views in forward).
        torch.manual_seed(1)
        vert_pos = torch.rand((1, n_points, 3), dtype=torch.float32) * 10.0
        vert_pos[:, :, 2] += 25.0
        vert_pos[:, :, :2] -= 5.0
        self.register_parameter("vert_pos", nn.Parameter(vert_pos, requires_grad=True))
        # All spheres start as mid-gray.
        self.register_parameter(
            "vert_col",
            nn.Parameter(
                torch.ones(1, n_points, 3, dtype=torch.float32) * 0.5,
                requires_grad=True,
            ),
        )
        self.register_parameter(
            "vert_rad",
            nn.Parameter(
                torch.ones(1, n_points, dtype=torch.float32) * 0.05, requires_grad=True
            ),
        )
        # NOTE(review): vert_opy (opacity) is registered but never used in
        # forward — presumably reserved for later use; confirm.
        self.register_parameter(
            "vert_opy",
            nn.Parameter(
                torch.ones(1, n_points, dtype=torch.float32), requires_grad=True
            ),
        )
        # Eight fixed cameras on an arc around the scene. Each row is
        # (pos_x, pos_y, pos_z, rot_x, rot_y, rot_z, focal_length,
        # sensor_size) — same layout as the other pulsar examples.
        self.register_buffer(
            "cam_params",
            torch.tensor(
                [
                    [
                        np.sin(angle) * 35.0,
                        0.0,
                        30.0 - np.cos(angle) * 35.0,
                        0.0,
                        -angle,
                        0.0,
                        5.0,
                        2.0,
                    ]
                    for angle in [-1.5, -0.8, -0.4, -0.1, 0.1, 0.4, 0.8, 1.5]
                ],
                dtype=torch.float32,
            ),
        )
        self.renderer = Renderer(width, height, n_points)

    def forward(self, cam=None):
        """Render all 8 stored views, or a single view if `cam` is given."""
        if cam is None:
            cam = self.cam_params
            n_views = 8
        else:
            n_views = 1
        return self.renderer.forward(
            self.vert_pos.expand(n_views, -1, -1),
            self.vert_col.expand(n_views, -1, -1),
            self.vert_rad.expand(n_views, -1),
            cam,
            self.gamma,
            45.0,
        )
# Load the 8 pre-rendered reference views and normalize them to [0, 1].
ref = torch.stack(
    [
        torch.from_numpy(
            imageio.imread(
                "../../tests/pulsar/reference/examples_TestRenderer_test_multiview_%d.png"
                % idx
            )
        ).to(torch.float32)
        / 255.0
        for idx in range(8)
    ]
).to(device)
# Set up model.
model = SceneModel().to(device)
# Optimize sphere colors, radii and positions with per-group learning rates.
optimizer = optim.SGD(
    [
        {"params": [model.vert_col], "lr": 1e-1},
        {"params": [model.vert_rad], "lr": 1e-3},
        {"params": [model.vert_pos], "lr": 1e-3},
    ]
)
# Angle of the rotating visualization camera.
angle = 0.0
# Fixed: the message used to announce `multiview.avi` while the writer
# actually produces `multiview.gif`.
print("Writing video to `%s`." % (path.abspath("multiview.gif")))
writer = imageio.get_writer("multiview.gif", format="gif", fps=25)
# Optimization loop.
for i in range(300):
    optimizer.zero_grad()
    result = model()
    # Show the first view (OpenCV expects BGR, hence ::-1).
    result_im = (result.cpu().detach().numpy() * 255).astype(np.uint8)
    cv2.imshow("opt", result_im[0, :, :, ::-1])
    # 50/50 blend with the reference, annotated with the step number.
    overlay_img = np.ascontiguousarray(
        ((result * 0.5 + ref * 0.5).cpu().detach().numpy() * 255).astype(np.uint8)[
            0, :, :, ::-1
        ]
    )
    overlay_img = cv2.putText(
        overlay_img,
        "Step %d" % (i),
        (10, 40),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 0, 0),
        2,
        cv2.LINE_AA,
        False,
    )
    cv2.imshow("overlay", overlay_img)
    cv2.waitKey(1)
    # Summed squared error over all views.
    loss = ((result - ref) ** 2).sum()
    print("loss {}: {}".format(i, loss.item()))
    loss.backward()
    optimizer.step()
    # Cleanup/pruning outside of autograd.
    with torch.no_grad():
        model.vert_col.data = torch.clamp(model.vert_col.data, 0.0, 1.0)
        # Move vanishing spheres far out of view and reset their radius.
        model.vert_pos.data[model.vert_rad < 0.001, :] = -1000.0
        model.vert_rad.data[model.vert_rad < 0.001] = 0.0001
        # Remove near-white spheres (distance to white <= 0.2).
        vd = (
            (model.vert_col - torch.ones(1, 1, 3, dtype=torch.float32).to(device))
            .abs()
            .sum(dim=2)
        )
        model.vert_pos.data[vd <= 0.2] = -1000.0
    # Render one frame from the rotating visualization camera.
    cam_control = torch.tensor(
        [
            [
                np.sin(angle) * 35.0,
                0.0,
                30.0 - np.cos(angle) * 35.0,
                0.0,
                -angle,
                0.0,
                5.0,
                2.0,
            ]
        ],
        dtype=torch.float32,
    ).to(device)
    with torch.no_grad():
        result = model.forward(cam=cam_control)[0]
        result_im = (result.cpu().detach().numpy() * 255).astype(np.uint8)
        cv2.imshow("vis", result_im[:, :, ::-1])
        writer.append_data(result_im)
    angle += 0.05
writer.close()
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
"""
This example demonstrates scene optimization with the plain
pulsar interface. For this, a reference image has been pre-generated
(you can find it at `../../tests/pulsar/reference/examples_TestRenderer_test_smallopt.png`).
The scene is initialized with random spheres. Gradient-based
optimization is used to converge towards a faithful
scene representation.
"""
import cv2
import imageio
import numpy as np
import torch
from pytorch3d.renderer.points.pulsar import Renderer
from torch import nn, optim
n_points = 10_000  # Number of spheres used to represent the scene.
width = 1_000  # Rendered image width in pixels.
height = 1_000  # Rendered image height in pixels.
device = torch.device("cuda")  # Model and tensors are placed on the GPU.
class SceneModel(nn.Module):
    """
    Scene model for single-view scene optimization with pulsar.

    Sphere positions (vert_pos), colors (vert_col) and radii (vert_rad)
    are trainable parameters; the camera is stored as a fixed buffer.
    `forward` renders the scene and additionally returns the forward
    info structure.
    """

    def __init__(self):
        super().__init__()
        self.gamma = 1.0
        # Reproducible random initialization of the sphere cloud.
        torch.manual_seed(1)
        positions = torch.rand(n_points, 3, dtype=torch.float32) * 10.0
        positions[:, 2] += 25.0
        positions[:, :2] -= 5.0
        self.vert_pos = nn.Parameter(positions, requires_grad=True)
        # Start from mid-gray spheres of radius 0.3.
        self.vert_col = nn.Parameter(
            torch.ones(n_points, 3, dtype=torch.float32) * 0.5, requires_grad=True
        )
        self.vert_rad = nn.Parameter(
            torch.ones(n_points, dtype=torch.float32) * 0.3, requires_grad=True
        )
        # Static camera: position, axis-angle rotation, focal length and
        # sensor size.
        self.register_buffer(
            "cam_params",
            torch.tensor(
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 2.0], dtype=torch.float32
            ),
        )
        # The volumetric optimization works better with a higher number of
        # tracked intersections per ray.
        self.renderer = Renderer(width, height, n_points, n_track=32)

    def forward(self):
        """Render the scene; returns the image and the forward info."""
        return self.renderer.forward(
            self.vert_pos,
            self.vert_col,
            self.vert_rad,
            self.cam_params,
            self.gamma,
            45.0,
            return_forward_info=True,
        )
# Load the pre-rendered reference image and normalize it to [0, 1].
ref = (
    torch.from_numpy(
        imageio.imread(
            "../../tests/pulsar/reference/examples_TestRenderer_test_smallopt.png"
        )
    ).to(torch.float32)
    / 255.0
).to(device)
# Set up model.
model = SceneModel().to(device)
# Per-parameter-group learning rates for colors, radii and positions.
optimizer = optim.SGD(
    [
        {"params": [model.vert_col], "lr": 1e0},
        {"params": [model.vert_rad], "lr": 5e-3},
        {"params": [model.vert_pos], "lr": 1e-2},
    ]
)
# Optimization loop.
for i in range(500):
    optimizer.zero_grad()
    # The model returns (image, forward_info); result_info is unused here.
    result, result_info = model()
    # Visualize the current rendering (OpenCV expects BGR, hence ::-1).
    result_im = (result.cpu().detach().numpy() * 255).astype(np.uint8)
    cv2.imshow("opt", result_im[:, :, ::-1])
    # 50/50 blend with the reference, annotated with the step number.
    overlay_img = np.ascontiguousarray(
        ((result * 0.5 + ref * 0.5).cpu().detach().numpy() * 255).astype(np.uint8)[
            :, :, ::-1
        ]
    )
    overlay_img = cv2.putText(
        overlay_img,
        "Step %d" % (i),
        (10, 40),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 0, 0),
        2,
        cv2.LINE_AA,
        False,
    )
    cv2.imshow("overlay", overlay_img)
    cv2.waitKey(1)
    # Summed squared error against the reference image.
    loss = ((result - ref) ** 2).sum()
    print("loss {}: {}".format(i, loss.item()))
    loss.backward()
    optimizer.step()
    # Cleanup/pruning outside of autograd.
    with torch.no_grad():
        model.vert_col.data = torch.clamp(model.vert_col.data, 0.0, 1.0)
        # Move vanishing spheres far out of view and reset their radius.
        model.vert_pos.data[model.vert_rad < 0.001, :] = -1000.0
        model.vert_rad.data[model.vert_rad < 0.001] = 0.0001
        # Remove near-white spheres (distance to white <= 0.2).
        vd = (
            (model.vert_col - torch.ones(3, dtype=torch.float32).to(device))
            .abs()
            .sum(dim=1)
        )
        model.vert_pos.data[vd <= 0.2] = -1000.0
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
// clang-format off
#include "./pulsar/global.h" // Include before <torch/extension.h>.
#include <torch/extension.h>
// clang-format on
#include "./pulsar/pytorch/renderer.h"
#include "./pulsar/pytorch/tensor_util.h"
#include "blending/sigmoid_alpha_blend.h"
#include "compositing/alpha_composite.h"
#include "compositing/norm_weighted_sum.h"
......@@ -65,4 +70,90 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("face_point_dist_backward", &FacePointDistanceBackward);
m.def("point_face_array_dist_forward", &PointFaceArrayDistanceForward);
m.def("point_face_array_dist_backward", &PointFaceArrayDistanceBackward);
  // Pulsar.
#ifdef PULSAR_LOGGING_ENABLED
  // Surface log INFO output when pulsar logging was compiled in.
  c10::ShowLogInfoToStderr();
#endif
  // Bind the pulsar renderer; held by shared_ptr so Python and C++ can
  // share ownership of the object.
  py::class_<
      pulsar::pytorch::Renderer,
      std::shared_ptr<pulsar::pytorch::Renderer>>(m, "PulsarRenderer")
      .def(py::init<
           const uint&,
           const uint&,
           const uint&,
           const bool&,
           const bool&,
           const float&,
           const uint&,
           const uint&>())
      .def(
          "__eq__",
          [](const pulsar::pytorch::Renderer& a,
             const pulsar::pytorch::Renderer& b) { return a == b; },
          py::is_operator())
      .def(
          "__ne__",
          [](const pulsar::pytorch::Renderer& a,
             const pulsar::pytorch::Renderer& b) { return !(a == b); },
          py::is_operator())
      .def(
          "__repr__",
          [](const pulsar::pytorch::Renderer& self) {
            std::stringstream ss;
            ss << self;
            return ss.str();
          })
      // Rendering entry point; the trailing arguments carry defaults on
      // the C++ side where noted.
      .def(
          "forward",
          &pulsar::pytorch::Renderer::forward,
          py::arg("vert_pos"),
          py::arg("vert_col"),
          py::arg("vert_radii"),
          py::arg("cam_pos"),
          py::arg("pixel_0_0_center"),
          py::arg("pixel_vec_x"),
          py::arg("pixel_vec_y"),
          py::arg("focal_length"),
          py::arg("principal_point_offsets"),
          py::arg("gamma"),
          py::arg("max_depth"),
          py::arg("min_depth") /* = 0.f*/,
          py::arg(
              "bg_col") /* = at::nullopt not exposed properly in pytorch 1.1. */
          ,
          py::arg("opacity") /* = at::nullopt ... */,
          py::arg("percent_allowed_difference") = 0.01f,
          py::arg("max_n_hits") = MAX_UINT,
          py::arg("mode") = 0)
      .def("backward", &pulsar::pytorch::Renderer::backward)
      // device_tracker is exposed read/write so the Python side can
      // track/update the device placement of the renderer.
      .def_property(
          "device_tracker",
          [](const pulsar::pytorch::Renderer& self) {
            return self.device_tracker;
          },
          [](pulsar::pytorch::Renderer& self, const torch::Tensor& val) {
            self.device_tracker = val;
          })
      // Read-only configuration introspection.
      .def_property_readonly("width", &pulsar::pytorch::Renderer::width)
      .def_property_readonly("height", &pulsar::pytorch::Renderer::height)
      .def_property_readonly(
          "max_num_balls", &pulsar::pytorch::Renderer::max_num_balls)
      .def_property_readonly(
          "orthogonal", &pulsar::pytorch::Renderer::orthogonal)
      .def_property_readonly(
          "right_handed", &pulsar::pytorch::Renderer::right_handed)
      .def_property_readonly("n_track", &pulsar::pytorch::Renderer::n_track);
  m.def(
      "pulsar_sphere_ids_from_result_info_nograd",
      &pulsar::pytorch::sphere_ids_from_result_info_nograd);
  // Constants.
  m.attr("EPS") = py::float_(EPS);
  m.attr("MAX_FLOAT") = py::float_(MAX_FLOAT);
  m.attr("MAX_INT") = py::int_(MAX_INT);
  m.attr("MAX_UINT") = py::int_(MAX_UINT);
  m.attr("MAX_USHORT") = py::int_(MAX_USHORT);
  m.attr("PULSAR_MAX_GRAD_SPHERES") = py::int_(MAX_GRAD_SPHERES);
}
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#ifndef PULSAR_NATIVE_CONSTANTS_H_
#define PULSAR_NATIVE_CONSTANTS_H_
// Numeric epsilon and limit constants shared across the pulsar sources.
#define EPS 1E-6 // Generic (double-typed) epsilon.
#define FEPS 1E-6f // Single-precision epsilon.
#define MAX_FLOAT 3.4E38f // Close to FLT_MAX.
#define MAX_INT 2147483647 // 2^31 - 1.
#define MAX_UINT 4294967295u // 2^32 - 1.
#define MAX_USHORT 65535u // 2^16 - 1.
#endif
# CUDA device compilation units
This folder contains `.cu` files to create compilation units
for device-specific functions. See `../include/README.md` for
more information.
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#ifndef PULSAR_NATIVE_CUDA_COMMANDS_H_
#define PULSAR_NATIVE_CUDA_COMMANDS_H_
// Definitions for GPU commands.
#include <cooperative_groups.h>
#include <cub/cub.cuh>
namespace cg = cooperative_groups;
// DEVICE_RESET resets the device only when the CUDA driver types are
// available; otherwise it expands to nothing.
#ifdef __DRIVER_TYPES_H__
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif
#else
#ifndef DEVICE_RESET
#define DEVICE_RESET
#endif
#endif
// Currently a pass-through wrapper; the commented alternative routes
// every call through handleCudaError for checking.
#define HANDLECUDA(CMD) CMD
// handleCudaError((CMD), __FILE__, __LINE__)
// Report a CUDA error with file/line context and terminate the process.
// NOTE(review): the reporting body is compiled out under __NVCC__, so
// errors are silently ignored in device compilation units — confirm
// this is intended.
inline void
handleCudaError(const cudaError_t err, const char* file, const int line) {
  if (err != cudaSuccess) {
#ifndef __NVCC__
    fprintf(
        stderr,
        "%s(%i) : getLastCudaError() CUDA error :"
        " (%d) %s.\n",
        file,
        line,
        static_cast<int>(err),
        cudaGetErrorString(err));
    DEVICE_RESET
    exit(1);
#endif
  }
}
// Check the sticky CUDA error state; if an error is pending, print the
// caller-supplied tag and delegate to handleCudaError.
inline void
getLastCudaError(const char* errorMessage, const char* file, const int line) {
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "Error: %s.", errorMessage);
    handleCudaError(err, file, line);
  }
}
// Alignment and warp-primitive abstractions (CUDA backend spellings).
#define ALIGN(VAL) __align__(VAL)
// Fixed: previously expanded to `HANDLECUDE(...)` — a typo that would
// fail to compile at any use site of SYNC().
#define SYNC() HANDLECUDA(cudaDeviceSynchronize())
#define THREADFENCE_B() __threadfence_block()
#define SHFL_SYNC(a, b, c) __shfl_sync((a), (b), (c))
#define SHARED __shared__
#define ACTIVEMASK() __activemask()
#define BALLOT(mask, val) __ballot_sync((mask), val)
/**
 * Find the cumulative sum within a warp up to the current
 * thread lane, with each mask thread contributing base.
 */
template <typename T>
DEVICE T
WARP_CUMSUM(const cg::coalesced_group& group, const uint& mask, const T& base) {
  T ret = base;
  T shfl_val;
  // Log-step accumulation: each step pulls the partial sum from the lane
  // `delta` positions higher; the rank guard zeroes contributions that
  // would be shuffled in from out-of-range lanes.
  shfl_val = __shfl_down_sync(mask, ret, 1u); // Deactivate the rightmost lane.
  ret += (group.thread_rank() < 31) * shfl_val;
  shfl_val = __shfl_down_sync(mask, ret, 2u);
  ret += (group.thread_rank() < 30) * shfl_val;
  shfl_val = __shfl_down_sync(mask, ret, 4u); // ...4
  ret += (group.thread_rank() < 28) * shfl_val;
  shfl_val = __shfl_down_sync(mask, ret, 8u); // ...8
  ret += (group.thread_rank() < 24) * shfl_val;
  shfl_val = __shfl_down_sync(mask, ret, 16u); // ...16
  ret += (group.thread_rank() < 16) * shfl_val;
  return ret;
}
// Warp-level max reduction over the lanes selected by `mask` via a
// shfl_down butterfly; the complete result ends up in the lowest lane.
// NOTE(review): `group` is accepted but unused here — confirm.
template <typename T>
DEVICE T
WARP_MAX(const cg::coalesced_group& group, const uint& mask, const T& base) {
  T ret = base;
  ret = max(ret, __shfl_down_sync(mask, ret, 16u));
  ret = max(ret, __shfl_down_sync(mask, ret, 8u));
  ret = max(ret, __shfl_down_sync(mask, ret, 4u));
  ret = max(ret, __shfl_down_sync(mask, ret, 2u));
  ret = max(ret, __shfl_down_sync(mask, ret, 1u));
  return ret;
}
// Warp-level sum reduction over the lanes selected by `mask` via a
// shfl_down butterfly; the complete result ends up in the lowest lane.
// NOTE(review): `group` is accepted but unused here — confirm.
template <typename T>
DEVICE T
WARP_SUM(const cg::coalesced_group& group, const uint& mask, const T& base) {
  T ret = base;
  ret = ret + __shfl_down_sync(mask, ret, 16u);
  ret = ret + __shfl_down_sync(mask, ret, 8u);
  ret = ret + __shfl_down_sync(mask, ret, 4u);
  ret = ret + __shfl_down_sync(mask, ret, 2u);
  ret = ret + __shfl_down_sync(mask, ret, 1u);
  return ret;
}
// Component-wise warp sum for float3, built on WARP_SUM.
INLINE DEVICE float3 WARP_SUM_FLOAT3(
    const cg::coalesced_group& group,
    const uint& mask,
    const float3& base) {
  float3 ret = base;
  ret.x = WARP_SUM(group, mask, base.x);
  ret.y = WARP_SUM(group, mask, base.y);
  ret.z = WARP_SUM(group, mask, base.z);
  return ret;
}
// Floating point.
// Single-precision math wrappers. The commented-out lines show the
// exact-rounding intrinsic alternatives; FEXP/FLN map to the fast
// approximations fasterexp/fasterlog and FPOW to __powf.
// #define FMUL(a, b) __fmul_rn((a), (b))
#define FMUL(a, b) ((a) * (b))
#define FDIV(a, b) __fdiv_rn((a), (b))
// #define FSUB(a, b) __fsub_rn((a), (b))
#define FSUB(a, b) ((a) - (b))
#define FADD(a, b) __fadd_rn((a), (b))
#define FSQRT(a) __fsqrt_rn(a)
#define FEXP(a) fasterexp(a)
#define FLN(a) fasterlog(a)
#define FPOW(a, b) __powf((a), (b))
#define FMAX(a, b) fmax((a), (b))
#define FMIN(a, b) fmin((a), (b))
#define FCEIL(a) ceilf(a)
#define FFLOOR(a) floorf(a)
#define FROUND(x) nearbyintf(x)
#define FSATURATE(x) __saturatef(x)
#define FABS(a) abs(a)
// Bit-level reinterpretation between int and float.
#define IASF(a, loc) (loc) = __int_as_float(a)
#define FASI(a, loc) (loc) = __float_as_int(a)
// True iff the distance between a and b is within c.
// NOTE(review): one branch compares with `<=`, the other with `<` —
// confirm the asymmetry is intended.
#define FABSLEQAS(a, b, c) \
  ((a) <= (b) ? FSUB((b), (a)) <= (c) : FSUB((a), (b)) < (c))
/** Calculates x*y+z. */
#define FMA(x, y, z) __fmaf_rn((x), (y), (z))
#define I2F(a) __int2float_rn(a)
#define FRCP(x) __frcp_rn(x)
// Atomic max for floats, emulated with an atomicCAS loop on the int bit
// pattern. Returns the previous value stored at `address`.
__device__ static float atomicMax(float* address, float val) {
  int* address_as_i = (int*)address;
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    // Retry until no other thread modified the slot between read and CAS.
    old = ::atomicCAS(
        address_as_i,
        assumed,
        __float_as_int(::fmaxf(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
// Atomic min for floats, emulated with an atomicCAS loop on the int bit
// pattern. Returns the previous value stored at `address`.
__device__ static float atomicMin(float* address, float val) {
  int* address_as_i = (int*)address;
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    // Retry until no other thread modified the slot between read and CAS.
    old = ::atomicCAS(
        address_as_i,
        assumed,
        __float_as_int(::fminf(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
// Double-precision wrappers (mapped onto the float variants here).
#define DMAX(a, b) FMAX(a, b)
#define DMIN(a, b) FMIN(a, b)
#define DSQRT(a) sqrt(a)
#define DSATURATE(a) DMIN(1., DMAX(0., (a)))
// half
#define HADD(a, b) __hadd((a), (b))
#define HSUB2(a, b) __hsub2((a), (b))
#define HMUL2(a, b) __hmul2((a), (b))
#define HSQRT(a) hsqrt(a)
// uint.
#define CLZ(VAL) __clz(VAL)
#define POPC(a) __popc(a)
//
//
//
//
//
//
//
//
//
// Atomic adds; the _F3 variant adds a float3 component-wise.
#define ATOMICADD(PTR, VAL) atomicAdd((PTR), (VAL))
#define ATOMICADD_F3(PTR, VAL) \
  ATOMICADD(&((PTR)->x), VAL.x); \
  ATOMICADD(&((PTR)->y), VAL.y); \
  ATOMICADD(&((PTR)->z), VAL.z);
// Block-scoped atomicAdd is only available from CUDA 10 on; fall back to
// the device-wide version on older toolkits.
#if (CUDART_VERSION >= 10000)
#define ATOMICADD_B(PTR, VAL) atomicAdd_block((PTR), (VAL))
#else
#define ATOMICADD_B(PTR, VAL) ATOMICADD(PTR, VAL)
#endif
//
//
//
//
// int.
#define IMIN(a, b) min((a), (b))
#define IMAX(a, b) max((a), (b))
#define IABS(a) abs(a)
// Checks.
// NOTE(review): THCudaCheck/THArgCheck come from the legacy THC API.
#define CHECKOK THCudaCheck
#define ARGCHECK THArgCheck
// Math.
#define NORM3DF(x, y, z) norm3df(x, y, z)
#define RNORM3DF(x, y, z) rnorm3df(x, y, z)
// High level.
// Issue an L1 prefetch for the given global-memory address.
INLINE DEVICE void prefetch_l1(unsigned long addr) {
  asm(" prefetch.global.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
}
#define PREFETCH(PTR) prefetch_l1((unsigned long)(PTR))
// cub workspace-size queries: calling the cub routine with a NULL
// workspace pointer only writes the required temp-storage byte count to
// *(RES_PTR) without performing the operation.
#define GET_SORT_WS_SIZE(RES_PTR, KEY_TYPE, VAL_TYPE, NUM_OBJECTS) \
  cub::DeviceRadixSort::SortPairsDescending( \
      (void*)NULL, \
      *(RES_PTR), \
      reinterpret_cast<KEY_TYPE*>(NULL), \
      reinterpret_cast<KEY_TYPE*>(NULL), \
      reinterpret_cast<VAL_TYPE*>(NULL), \
      reinterpret_cast<VAL_TYPE*>(NULL), \
      (NUM_OBJECTS));
#define GET_REDUCE_WS_SIZE(RES_PTR, TYPE, REDUCE_OP, NUM_OBJECTS) \
  { \
    TYPE init = TYPE(); \
    cub::DeviceReduce::Reduce( \
        (void*)NULL, \
        *(RES_PTR), \
        (TYPE*)NULL, \
        (TYPE*)NULL, \
        (NUM_OBJECTS), \
        (REDUCE_OP), \
        init); \
  }
#define GET_SELECT_WS_SIZE( \
    RES_PTR, TYPE_SELECTOR, TYPE_SELECTION, NUM_OBJECTS) \
  { \
    cub::DeviceSelect::Flagged( \
        (void*)NULL, \
        *(RES_PTR), \
        (TYPE_SELECTION*)NULL, \
        (TYPE_SELECTOR*)NULL, \
        (TYPE_SELECTION*)NULL, \
        (int*)NULL, \
        (NUM_OBJECTS)); \
  }
#define GET_SUM_WS_SIZE(RES_PTR, TYPE_SUM, NUM_OBJECTS) \
  { \
    cub::DeviceReduce::Sum( \
        (void*)NULL, \
        *(RES_PTR), \
        (TYPE_SUM*)NULL, \
        (TYPE_SUM*)NULL, \
        NUM_OBJECTS); \
  }
// NOTE(review): `init` is declared but unused in this macro — confirm.
#define GET_MM_WS_SIZE(RES_PTR, TYPE, NUM_OBJECTS) \
  { \
    TYPE init = TYPE(); \
    cub::DeviceReduce::Max( \
        (void*)NULL, *(RES_PTR), (TYPE*)NULL, (TYPE*)NULL, (NUM_OBJECTS)); \
  }
// Self-contained descending sort-pairs: queries the workspace size,
// allocates, sorts, and frees. TMPN1 names the temporary workspace
// variable declared by the macro at the expansion site.
#define SORT_DESCENDING( \
    TMPN1, SORT_PTR, SORTED_PTR, VAL_PTR, VAL_SORTED_PTR, NUM_OBJECTS) \
  void* TMPN1 = NULL; \
  size_t TMPN1##_bytes = 0; \
  cub::DeviceRadixSort::SortPairsDescending( \
      TMPN1, \
      TMPN1##_bytes, \
      (SORT_PTR), \
      (SORTED_PTR), \
      (VAL_PTR), \
      (VAL_SORTED_PTR), \
      (NUM_OBJECTS)); \
  HANDLECUDA(cudaMalloc(&TMPN1, TMPN1##_bytes)); \
  cub::DeviceRadixSort::SortPairsDescending( \
      TMPN1, \
      TMPN1##_bytes, \
      (SORT_PTR), \
      (SORTED_PTR), \
      (VAL_PTR), \
      (VAL_SORTED_PTR), \
      (NUM_OBJECTS)); \
  HANDLECUDA(cudaFree(TMPN1));
// Descending sort-pairs into a caller-provided workspace.
#define SORT_DESCENDING_WS( \
    TMPN1, \
    SORT_PTR, \
    SORTED_PTR, \
    VAL_PTR, \
    VAL_SORTED_PTR, \
    NUM_OBJECTS, \
    WORKSPACE_PTR, \
    WORKSPACE_BYTES) \
  cub::DeviceRadixSort::SortPairsDescending( \
      (WORKSPACE_PTR), \
      (WORKSPACE_BYTES), \
      (SORT_PTR), \
      (SORTED_PTR), \
      (VAL_PTR), \
      (VAL_SORTED_PTR), \
      (NUM_OBJECTS));
// Ascending sort-pairs into a caller-provided workspace; sorts over the
// full key width (bits 0 .. sizeof(key)*8) on the given stream.
#define SORT_ASCENDING_WS( \
    SORT_PTR, \
    SORTED_PTR, \
    VAL_PTR, \
    VAL_SORTED_PTR, \
    NUM_OBJECTS, \
    WORKSPACE_PTR, \
    WORKSPACE_BYTES, \
    STREAM) \
  cub::DeviceRadixSort::SortPairs( \
      (WORKSPACE_PTR), \
      (WORKSPACE_BYTES), \
      (SORT_PTR), \
      (SORTED_PTR), \
      (VAL_PTR), \
      (VAL_SORTED_PTR), \
      (NUM_OBJECTS), \
      0, \
      sizeof(*(SORT_PTR)) * 8, \
      (STREAM));
// Device-wide sum into a caller-provided workspace.
#define SUM_WS( \
    SUM_PTR, OUT_PTR, NUM_OBJECTS, WORKSPACE_PTR, WORKSPACE_BYTES, STREAM) \
  cub::DeviceReduce::Sum( \
      (WORKSPACE_PTR), \
      (WORKSPACE_BYTES), \
      (SUM_PTR), \
      (OUT_PTR), \
      (NUM_OBJECTS), \
      (STREAM));
// Device-wide minimum into a caller-provided workspace.
#define MIN_WS( \
    MIN_PTR, OUT_PTR, NUM_OBJECTS, WORKSPACE_PTR, WORKSPACE_BYTES, STREAM) \
  cub::DeviceReduce::Min( \
      (WORKSPACE_PTR), \
      (WORKSPACE_BYTES), \
      (MIN_PTR), \
      (OUT_PTR), \
      (NUM_OBJECTS), \
      (STREAM));
// Device-wide maximum into a caller-provided workspace.
// Fixed: this macro previously invoked cub::DeviceReduce::Min (a
// copy-paste of MIN_WS), silently computing a minimum where a maximum
// is named. GET_MM_WS_SIZE already sizes the workspace with Max.
#define MAX_WS( \
    MAX_PTR, OUT_PTR, NUM_OBJECTS, WORKSPACE_PTR, WORKSPACE_BYTES, STREAM) \
  cub::DeviceReduce::Max( \
      (WORKSPACE_PTR), \
      (WORKSPACE_BYTES), \
      (MAX_PTR), \
      (OUT_PTR), \
      (NUM_OBJECTS), \
      (STREAM));
//
//
//
// TODO: rewrite using nested contexts instead of temporary names.
// Self-contained generic reduce (size query, malloc, reduce, free).
// NOTE(review): this macro uses, mallocs and frees TMPN1 without
// declaring it — it appears to rely on a prior declaration (e.g. by
// SORT_DESCENDING) at the expansion site; confirm.
#define REDUCE(REDUCE_PTR, RESULT_PTR, NUM_ITEMS, REDUCE_OP, REDUCE_INIT) \
  cub::DeviceReduce::Reduce( \
      TMPN1, \
      TMPN1##_bytes, \
      (REDUCE_PTR), \
      (RESULT_PTR), \
      (NUM_ITEMS), \
      (REDUCE_OP), \
      (REDUCE_INIT)); \
  HANDLECUDA(cudaMalloc(&TMPN1, TMPN1##_bytes)); \
  cub::DeviceReduce::Reduce( \
      TMPN1, \
      TMPN1##_bytes, \
      (REDUCE_PTR), \
      (RESULT_PTR), \
      (NUM_ITEMS), \
      (REDUCE_OP), \
      (REDUCE_INIT)); \
  HANDLECUDA(cudaFree(TMPN1));
// Generic reduce into a caller-provided workspace.
#define REDUCE_WS( \
    REDUCE_PTR, \
    RESULT_PTR, \
    NUM_ITEMS, \
    REDUCE_OP, \
    REDUCE_INIT, \
    WORKSPACE_PTR, \
    WORSPACE_BYTES, \
    STREAM) \
  cub::DeviceReduce::Reduce( \
      (WORKSPACE_PTR), \
      (WORSPACE_BYTES), \
      (REDUCE_PTR), \
      (RESULT_PTR), \
      (NUM_ITEMS), \
      (REDUCE_OP), \
      (REDUCE_INIT), \
      (STREAM));
// Stream compaction by boolean flags into a caller-provided workspace.
// NOTE(review): the last argument expands to `stream = (STREAM)`, which
// assigns to a variable named `stream` expected at the expansion site
// rather than passing (STREAM) directly — confirm this is intended.
#define SELECT_FLAGS_WS( \
    FLAGS_PTR, \
    ITEM_PTR, \
    OUT_PTR, \
    NUM_SELECTED_PTR, \
    NUM_ITEMS, \
    WORKSPACE_PTR, \
    WORSPACE_BYTES, \
    STREAM) \
  cub::DeviceSelect::Flagged( \
      (WORKSPACE_PTR), \
      (WORSPACE_BYTES), \
      (ITEM_PTR), \
      (FLAGS_PTR), \
      (OUT_PTR), \
      (NUM_SELECTED_PTR), \
      (NUM_ITEMS), \
      stream = (STREAM));
// Synchronous copies, sized in elements of TYPE.
#define COPY_HOST_DEV(PTR_D, PTR_H, TYPE, SIZE) \
  HANDLECUDA(cudaMemcpy( \
      (PTR_D), (PTR_H), sizeof(TYPE) * (SIZE), cudaMemcpyHostToDevice))
#define COPY_DEV_HOST(PTR_H, PTR_D, TYPE, SIZE) \
  HANDLECUDA(cudaMemcpy( \
      (PTR_H), (PTR_D), sizeof(TYPE) * (SIZE), cudaMemcpyDeviceToHost))
#define COPY_DEV_DEV(PTR_T, PTR_S, TYPE, SIZE) \
  HANDLECUDA(cudaMemcpy( \
      (PTR_T), (PTR_S), sizeof(TYPE) * (SIZE), cudaMemcpyDeviceToDevice))
//
// We *must* use cudaMallocManaged for pointers on device that should
// interact with pytorch. However, this comes at a significant speed penalty.
// We're using plain CUDA pointers for the rendering operations and
// explicitly copy results to managed pointers wrapped for pytorch (see
// pytorch/util.h).
#define MALLOC(VAR, TYPE, SIZE) cudaMalloc(&(VAR), sizeof(TYPE) * (SIZE))
#define FREE(PTR) HANDLECUDA(cudaFree(PTR))
#define MEMSET(VAR, VAL, TYPE, SIZE, STREAM) \
  HANDLECUDA(cudaMemsetAsync((VAR), (VAL), sizeof(TYPE) * (SIZE), (STREAM)))
// Launch FUNC over N elements in 1D using the device's maximum block size.
#define LAUNCH_MAX_PARALLEL_1D(FUNC, N, STREAM, ...) \
  { \
    int64_t max_threads = \
        at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; \
    uint num_threads = min((N), max_threads); \
    uint num_blocks = iDivCeil((N), num_threads); \
    FUNC<<<num_blocks, num_threads, 0, (STREAM)>>>(__VA_ARGS__); \
  }
// Launch FUNC over N elements in 1D with at most TN threads per block.
#define LAUNCH_PARALLEL_1D(FUNC, N, TN, STREAM, ...) \
  { \
    uint num_threads = min(static_cast<int>(N), static_cast<int>(TN)); \
    uint num_blocks = iDivCeil((N), num_threads); \
    FUNC<<<num_blocks, num_threads, 0, (STREAM)>>>(__VA_ARGS__); \
  }
// 2D launch with a square block derived from the device thread limit.
#define LAUNCH_MAX_PARALLEL_2D(FUNC, NX, NY, STREAM, ...) \
  { \
    int64_t max_threads = \
        at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; \
    int64_t max_threads_sqrt = static_cast<int64_t>(sqrt(max_threads)); \
    dim3 num_threads, num_blocks; \
    num_threads.x = min((NX), max_threads_sqrt); \
    num_blocks.x = iDivCeil((NX), num_threads.x); \
    num_threads.y = min((NY), max_threads_sqrt); \
    num_blocks.y = iDivCeil((NY), num_threads.y); \
    num_threads.z = 1; \
    num_blocks.z = 1; \
    FUNC<<<num_blocks, num_threads, 0, (STREAM)>>>(__VA_ARGS__); \
  }
// 2D launch with explicit block dimensions TX x TY.
#define LAUNCH_PARALLEL_2D(FUNC, NX, NY, TX, TY, STREAM, ...) \
  { \
    dim3 num_threads, num_blocks; \
    num_threads.x = min((NX), (TX)); \
    num_blocks.x = iDivCeil((NX), num_threads.x); \
    num_threads.y = min((NY), (TY)); \
    num_blocks.y = iDivCeil((NY), num_threads.y); \
    num_threads.z = 1; \
    num_blocks.z = 1; \
    FUNC<<<num_blocks, num_threads, 0, (STREAM)>>>(__VA_ARGS__); \
  }
// Compute the flat 1D thread index and early-out past the range end.
#define GET_PARALLEL_IDX_1D(VARNAME, N) \
  const uint VARNAME = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; \
  if (VARNAME >= (N)) { \
    return; \
  }
// Compute 2D coordinates and early-out outside the width/height range.
#define GET_PARALLEL_IDS_2D(VAR_X, VAR_Y, WIDTH, HEIGHT) \
  const uint VAR_X = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; \
  const uint VAR_Y = __mul24(blockIdx.y, blockDim.y) + threadIdx.y; \
  if (VAR_X >= (WIDTH) || VAR_Y >= (HEIGHT)) \
    return;
// No-ops in this CUDA variant of the abstraction layer.
#define END_PARALLEL()
#define END_PARALLEL_NORET()
#define END_PARALLEL_2D_NORET()
#define END_PARALLEL_2D()
#define RETURN_PARALLEL() return
#define CHECKLAUNCH() THCudaCheck(cudaGetLastError());
#define ISONDEVICE true
#define SYNCDEVICE() HANDLECUDA(cudaDeviceSynchronize())
// CUDA-event-based timing helpers: START_TIME/STOP_TIME bracket a
// region tagged TN; GET_TIME writes the elapsed milliseconds to TOPTR.
#define START_TIME(TN) \
  cudaEvent_t __time_start_##TN, __time_stop_##TN; \
  cudaEventCreate(&__time_start_##TN); \
  cudaEventCreate(&__time_stop_##TN); \
  cudaEventRecord(__time_start_##TN);
#define STOP_TIME(TN) cudaEventRecord(__time_stop_##TN);
#define GET_TIME(TN, TOPTR) \
  cudaEventSynchronize(__time_stop_##TN); \
  cudaEventElapsedTime((TOPTR), __time_start_##TN, __time_stop_##TN);
// Fixed: START_TIME_CU previously expanded to START_TIME(CN), creating
// events named after the literal token `CN` instead of the argument TN,
// which breaks the matching STOP_TIME_CU/GET_TIME_CU pair.
#define START_TIME_CU(TN) START_TIME(TN)
#define STOP_TIME_CU(TN) STOP_TIME(TN)
#define GET_TIME_CU(TN, TOPTR) GET_TIME(TN, TOPTR)
#endif
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.backward.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.backward_dbg.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.calc_gradients.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.calc_signature.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.construct.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.create_selector.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.destruct.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.fill_bg.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.forward.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.norm_cam_gradients.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.norm_sphere_gradients.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.render.instantiate.h"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment