pulsar integration.

Summary: This diff integrates the pulsar renderer source code into PyTorch3D as an alternative backend for the PyTorch3D point renderer. This diff is the first of a series of three diffs to complete that migration and focuses on the packaging and integration of the source code. For more information about the pulsar backend, see the release notes and the paper (https://arxiv.org/abs/2004.07484). For information on how to use the backend, see the point cloud rendering notebook and the examples in the folder `docs/examples`. Tasks addressed in the following diffs: * Add the PyTorch3D interface, * Add notebook examples and documentation (or adapt the existing ones to feature both interfaces). Reviewed By: nikhilaravi Differential Revision: D23947736 fbshipit-source-id: a5e77b53e6750334db22aefa89b4c079cda1b443

pulsar integration.
Summary: This diff integrates the pulsar renderer source code into PyTorch3D as an alternative backend for the PyTorch3D point renderer. This diff is the first of a series of three diffs to complete that migration and focuses on the packaging and integration of the source code. For more information about the pulsar backend, see the release notes and the paper (https://arxiv.org/abs/2004.07484). For information on how to use the backend, see the point cloud rendering notebook and the examples in the folder `docs/examples`. Tasks addressed in the following diffs: * Add the PyTorch3D interface, * Add notebook examples and documentation (or adapt the existing ones to feature both interfaces). Reviewed By: nikhilaravi Differential Revision: D23947736 fbshipit-source-id: a5e77b53e6750334db22aefa89b4c079cda1b443
b19fe1de · Christoph Lassner · Facebook GitHub Bot · d5650323 · b19fe1de · b19fe1de
Commit b19fe1de authored Nov 03, 2020 by Christoph Lassner Committed by Facebook GitHub Bot Nov 03, 2020
20 changed files
--- a/pytorch3d/csrc/pulsar/include/fastermath.h
+++ b/pytorch3d/csrc/pulsar/include/fastermath.h
+#ifndef PULSAR_NATIVE_INCLUDE_FASTERMATH_H_
+#define PULSAR_NATIVE_INCLUDE_FASTERMATH_H_
+/*=====================================================================*
+ *                   Copyright (C) 2011 Paul Mineiro                   *
+ * All rights reserved.                                                *
+ *                                                                     *
+ * Redistribution and use in source and binary forms, with             *
+ * or without modification, are permitted provided that the            *
+ * following conditions are met:                                       *
+ *                                                                     *
+ *     * Redistributions of source code must retain the                *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer.                                       *
+ *                                                                     *
+ *     * Redistributions in binary form must reproduce the             *
+ *     above copyright notice, this list of conditions and             *
+ *     the following disclaimer in the documentation and/or            *
+ *     other materials provided with the distribution.                 *
+ *                                                                     *
+ *     * Neither the name of Paul Mineiro nor the names                *
+ *     of other contributors may be used to endorse or promote         *
+ *     products derived from this software without specific            *
+ *     prior written permission.                                       *
+ *                                                                     *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND              *
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,         *
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES               *
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE             *
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER               *
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                 *
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES            *
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE           *
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR                *
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF          *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT           *
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY              *
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             *
+ * POSSIBILITY OF SUCH DAMAGE.                                         *
+ *                                                                     *
+ * Contact: Paul Mineiro <paul@mineiro.com>                            *
+ *=====================================================================*/
+#include <stdint.h>
+#include "./commands.h"
+#ifdef __cplusplus
+#define cast_uint32_t static_cast<uint32_t>
+#else
+#define cast_uint32_t (uint32_t)
+#endif
+IHD float fasterlog2(float x) {
+  union {
+    float f;
+    uint32_t i;
+  } vx = {x};
+  float y = vx.i;
+  y *= 1.1920928955078125e-7f;
+  return y - 126.94269504f;
+}
+IHD float fasterlog(float x) {
+  //  return 0.69314718f * fasterlog2 (x);
+  union {
+    float f;
+    uint32_t i;
+  } vx = {x};
+  float y = vx.i;
+  y *= 8.2629582881927490e-8f;
+  return y - 87.989971088f;
+}
+IHD float fasterpow2(float p) {
+  float clipp = (p < -126) ? -126.0f : p;
+  union {
+    uint32_t i;
+    float f;
+  } v = {cast_uint32_t((1 << 23) * (clipp + 126.94269504f))};
+  return v.f;
+}
+IHD float fasterexp(float p) {
+  return fasterpow2(1.442695040f * p);
+}
+#endif
--- a/pytorch3d/csrc/pulsar/include/math.h
+++ b/pytorch3d/csrc/pulsar/include/math.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_IMPL_MATH_H_
+#define PULSAR_NATIVE_IMPL_MATH_H_
+#include "./camera.h"
+#include "./commands.h"
+#include "./fastermath.h"
+/**
+ * Get the direction of val.
+ *
+ * Returns +1 if val is positive, -1 if val is zero or negative.
+ */
+IHD int sign_dir(const int& val) {
+  return -(static_cast<int>((val <= 0)) << 1) + 1;
+};
+/**
+ * Get the direction of val.
+ *
+ * Returns +1 if val is positive, -1 if val is zero or negative.
+ */
+IHD float sign_dir(const float& val) {
+  return static_cast<float>(1 - (static_cast<int>((val <= 0)) << 1));
+};
+/**
+ * Integer ceil division.
+ */
+IHD uint iDivCeil(uint a, uint b) {
+  return (a % b != 0) ? (a / b + 1) : (a / b);
+}
+IHD float3 outer_product_sum(const float3& a) {
+  return make_float3(
+      a.x * a.x + a.x * a.y + a.x * a.z,
+      a.x * a.y + a.y * a.y + a.y * a.z,
+      a.x * a.z + a.y * a.z + a.z * a.z);
+}
+// TODO: put intrinsics here.
+IHD float3 operator+(const float3& a, const float3& b) {
+  return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+IHD void operator+=(float3& a, const float3& b) {
+  a.x += b.x;
+  a.y += b.y;
+  a.z += b.z;
+}
+IHD void operator-=(float3& a, const float3& b) {
+  a.x -= b.x;
+  a.y -= b.y;
+  a.z -= b.z;
+}
+IHD void operator/=(float3& a, const float& b) {
+  a.x /= b;
+  a.y /= b;
+  a.z /= b;
+}
+IHD void operator*=(float3& a, const float& b) {
+  a.x *= b;
+  a.y *= b;
+  a.z *= b;
+}
+IHD float3 operator/(const float3& a, const float& b) {
+  return make_float3(a.x / b, a.y / b, a.z / b);
+}
+IHD float3 operator-(const float3& a, const float3& b) {
+  return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+IHD float3 operator*(const float3& a, const float& b) {
+  return make_float3(a.x * b, a.y * b, a.z * b);
+}
+IHD float3 operator*(const float3& a, const float3& b) {
+  return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+IHD float3 operator*(const float& a, const float3& b) {
+  return b * a;
+}
+IHD float length(const float3& v) {
+  // TODO: benchmark what's faster.
+  return NORM3DF(v.x, v.y, v.z);
+  // return __fsqrt_rn(v.x * v.x + v.y * v.y + v.z * v.z);
+}
+/**
+ * Left-hand multiplication of the constructed rotation matrix with the vector.
+ */
+IHD float3 rotate(
+    const float3& v,
+    const float3& dir_x,
+    const float3& dir_y,
+    const float3& dir_z) {
+  return make_float3(
+      dir_x.x * v.x + dir_x.y * v.y + dir_x.z * v.z,
+      dir_y.x * v.x + dir_y.y * v.y + dir_y.z * v.z,
+      dir_z.x * v.x + dir_z.y * v.y + dir_z.z * v.z);
+}
+IHD float3 normalize(const float3& v) {
+  return v * RNORM3DF(v.x, v.y, v.z);
+}
+INLINE DEVICE float dot(const float3& a, const float3& b) {
+  return FADD(FADD(FMUL(a.x, b.x), FMUL(a.y, b.y)), FMUL(a.z, b.z));
+}
+INLINE DEVICE float3 cross(const float3& a, const float3& b) {
+  // TODO: faster
+  return make_float3(
+      a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+}
+namespace pulsar {
+IHD CamGradInfo operator+(const CamGradInfo& a, const CamGradInfo& b) {
+  CamGradInfo res;
+  res.cam_pos = a.cam_pos + b.cam_pos;
+  res.pixel_0_0_center = a.pixel_0_0_center + b.pixel_0_0_center;
+  res.pixel_dir_x = a.pixel_dir_x + b.pixel_dir_x;
+  res.pixel_dir_y = a.pixel_dir_y + b.pixel_dir_y;
+  return res;
+}
+IHD CamGradInfo operator*(const CamGradInfo& a, const float& b) {
+  CamGradInfo res;
+  res.cam_pos = a.cam_pos * b;
+  res.pixel_0_0_center = a.pixel_0_0_center * b;
+  res.pixel_dir_x = a.pixel_dir_x * b;
+  res.pixel_dir_y = a.pixel_dir_y * b;
+  return res;
+}
+IHD IntWrapper operator+(const IntWrapper& a, const IntWrapper& b) {
+  IntWrapper res;
+  res.val = a.val + b.val;
+  return res;
+}
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.backward.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.backward.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_RENDERER_BACKWARD_DEVICE_H_
+#define PULSAR_NATIVE_RENDERER_BACKWARD_DEVICE_H_
+#include "./camera.device.h"
+#include "./math.h"
+#include "./renderer.h"
+namespace pulsar {
+namespace Renderer {
+template <bool DEV>
+void backward(
+    Renderer* self,
+    const float* grad_im,
+    const float* image,
+    const float* forw_info,
+    const float* vert_pos,
+    const float* vert_col,
+    const float* vert_rad,
+    const CamInfo& cam,
+    const float& gamma,
+    float percent_allowed_difference,
+    const uint& max_n_hits,
+    const float* vert_opy_d,
+    const size_t& num_balls,
+    const uint& mode,
+    const bool& dif_pos,
+    const bool& dif_col,
+    const bool& dif_rad,
+    const bool& dif_cam,
+    const bool& dif_opy,
+    cudaStream_t stream) {
+  ARGCHECK(gamma > 0.f && gamma <= 1.f, 6, "gamma must be in [0., 1.]");
+  ARGCHECK(
+      percent_allowed_difference >= 0.f && percent_allowed_difference <= 1.f,
+      7,
+      "percent_allowed_difference must be in [0., 1.]");
+  ARGCHECK(max_n_hits >= 1u, 8, "max_n_hits must be >= 1");
+  ARGCHECK(
+      num_balls > 0 && num_balls <= self->max_num_balls,
+      9,
+      "num_balls must be >0 and less than max num balls!");
+  ARGCHECK(
+      cam.film_width == self->cam.film_width &&
+          cam.film_height == self->cam.film_height,
+      5,
+      "cam film size must agree");
+  ARGCHECK(mode <= 1, 10, "mode must be <= 1!");
+  if (percent_allowed_difference < EPS) {
+    LOG(WARNING) << "percent_allowed_difference < " << FEPS << "! Clamping to "
+                 << FEPS << ".";
+    percent_allowed_difference = FEPS;
+  }
+  if (percent_allowed_difference > 1.f - FEPS) {
+    LOG(WARNING) << "percent_allowed_difference > " << (1.f - FEPS)
+                 << "! Clamping to " << (1.f - FEPS) << ".";
+    percent_allowed_difference = 1.f - FEPS;
+  }
+  LOG_IF(INFO, PULSAR_LOG_RENDER) << "Rendering backward pass...";
+  // Update camera.
+  self->cam.eye = cam.eye;
+  self->cam.pixel_0_0_center = cam.pixel_0_0_center - cam.eye;
+  self->cam.pixel_dir_x = cam.pixel_dir_x;
+  self->cam.pixel_dir_y = cam.pixel_dir_y;
+  self->cam.sensor_dir_z = cam.sensor_dir_z;
+  self->cam.half_pixel_size = cam.half_pixel_size;
+  self->cam.focal_length = cam.focal_length;
+  self->cam.aperture_width = cam.aperture_width;
+  self->cam.aperture_height = cam.aperture_height;
+  self->cam.min_dist = cam.min_dist;
+  self->cam.max_dist = cam.max_dist;
+  self->cam.norm_fac = cam.norm_fac;
+  self->cam.principal_point_offset_x = cam.principal_point_offset_x;
+  self->cam.principal_point_offset_y = cam.principal_point_offset_y;
+  self->cam.film_border_left = cam.film_border_left;
+  self->cam.film_border_top = cam.film_border_top;
+#ifdef PULSAR_TIMINGS_ENABLED
+  START_TIME(calc_signature);
+#endif
+  LAUNCH_MAX_PARALLEL_1D(
+      calc_signature<DEV>,
+      num_balls,
+      stream,
+      *self,
+      reinterpret_cast<const float3*>(vert_pos),
+      vert_col,
+      vert_rad,
+      num_balls);
+  CHECKLAUNCH();
+#ifdef PULSAR_TIMINGS_ENABLED
+  STOP_TIME(calc_signature);
+  START_TIME(calc_gradients);
+#endif
+  MEMSET(self->grad_pos_d, 0, float3, num_balls, stream);
+  MEMSET(self->grad_col_d, 0, float, num_balls * self->cam.n_channels, stream);
+  MEMSET(self->grad_rad_d, 0, float, num_balls, stream);
+  MEMSET(self->grad_cam_d, 0, float, 12, stream);
+  MEMSET(self->grad_cam_buf_d, 0, CamGradInfo, num_balls, stream);
+  MEMSET(self->grad_opy_d, 0, float, num_balls, stream);
+  MEMSET(self->ids_sorted_d, 0, int, num_balls, stream);
+  LAUNCH_PARALLEL_2D(
+      calc_gradients<DEV>,
+      self->cam.film_width,
+      self->cam.film_height,
+      GRAD_BLOCK_SIZE,
+      GRAD_BLOCK_SIZE,
+      stream,
+      self->cam,
+      grad_im,
+      gamma,
+      reinterpret_cast<const float3*>(vert_pos),
+      vert_col,
+      vert_rad,
+      vert_opy_d,
+      num_balls,
+      image,
+      forw_info,
+      self->di_d,
+      self->ii_d,
+      dif_pos,
+      dif_col,
+      dif_rad,
+      dif_cam,
+      dif_opy,
+      self->grad_rad_d,
+      self->grad_col_d,
+      self->grad_pos_d,
+      self->grad_cam_buf_d,
+      self->grad_opy_d,
+      self->ids_sorted_d,
+      self->n_track);
+  CHECKLAUNCH();
+#ifdef PULSAR_TIMINGS_ENABLED
+  STOP_TIME(calc_gradients);
+  START_TIME(normalize);
+#endif
+  LAUNCH_MAX_PARALLEL_1D(
+      norm_sphere_gradients<DEV>, num_balls, stream, *self, num_balls);
+  CHECKLAUNCH();
+  if (dif_cam) {
+    SUM_WS(
+        self->grad_cam_buf_d,
+        reinterpret_cast<CamGradInfo*>(self->grad_cam_d),
+        static_cast<int>(num_balls),
+        self->workspace_d,
+        self->workspace_size,
+        stream);
+    CHECKLAUNCH();
+    SUM_WS(
+        (IntWrapper*)(self->ids_sorted_d),
+        (IntWrapper*)(self->n_grad_contributions_d),
+        static_cast<int>(num_balls),
+        self->workspace_d,
+        self->workspace_size,
+        stream);
+    CHECKLAUNCH();
+    LAUNCH_MAX_PARALLEL_1D(
+        norm_cam_gradients<DEV>, static_cast<int64_t>(1), stream, *self);
+    CHECKLAUNCH();
+  }
+#ifdef PULSAR_TIMINGS_ENABLED
+  STOP_TIME(normalize);
+  float time_ms;
+  // This blocks the result and prevents batch-processing from parallelizing.
+  GET_TIME(calc_signature, &time_ms);
+  std::cout << "Time for signature calculation: " << time_ms << " ms"
+            << std::endl;
+  GET_TIME(calc_gradients, &time_ms);
+  std::cout << "Time for gradient calculation: " << time_ms << " ms"
+            << std::endl;
+  GET_TIME(normalize, &time_ms);
+  std::cout << "Time for aggregation and normalization: " << time_ms << " ms"
+            << std::endl;
+#endif
+  LOG_IF(INFO, PULSAR_LOG_RENDER) << "Backward pass complete.";
+}
+} // namespace Renderer
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.backward.instantiate.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.backward.instantiate.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "./renderer.backward.device.h"
+namespace pulsar {
+namespace Renderer {
+template void backward<ISONDEVICE>(
+    Renderer* self,
+    const float* grad_im,
+    const float* image,
+    const float* forw_info,
+    const float* vert_pos,
+    const float* vert_col,
+    const float* vert_rad,
+    const CamInfo& cam,
+    const float& gamma,
+    float percent_allowed_difference,
+    const uint& max_n_hits,
+    const float* vert_opy,
+    const size_t& num_balls,
+    const uint& mode,
+    const bool& dif_pos,
+    const bool& dif_col,
+    const bool& dif_rad,
+    const bool& dif_cam,
+    const bool& dif_opy,
+    cudaStream_t stream);
+} // namespace Renderer
+} // namespace pulsar
--- a/pytorch3d/csrc/pulsar/include/renderer.backward_dbg.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.backward_dbg.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_RENDERER_BACKWARD_DBG_DEVICE_H_
+#define PULSAR_NATIVE_RENDERER_BACKWARD_DBG_DEVICE_H_
+#include "./camera.device.h"
+#include "./math.h"
+#include "./renderer.h"
+namespace pulsar {
+namespace Renderer {
+template <bool DEV>
+void backward_dbg(
+    Renderer* self,
+    const float* grad_im,
+    const float* image,
+    const float* forw_info,
+    const float* vert_pos,
+    const float* vert_col,
+    const float* vert_rad,
+    const CamInfo& cam,
+    const float& gamma,
+    float percent_allowed_difference,
+    const uint& max_n_hits,
+    const float* vert_opy_d,
+    const size_t& num_balls,
+    const uint& mode,
+    const bool& dif_pos,
+    const bool& dif_col,
+    const bool& dif_rad,
+    const bool& dif_cam,
+    const bool& dif_opy,
+    const uint& pos_x,
+    const uint& pos_y,
+    cudaStream_t stream) {
+  ARGCHECK(gamma > 0.f && gamma <= 1.f, 6, "gamma must be in [0., 1.]");
+  ARGCHECK(
+      percent_allowed_difference >= 0.f && percent_allowed_difference <= 1.f,
+      7,
+      "percent_allowed_difference must be in [0., 1.]");
+  ARGCHECK(max_n_hits >= 1u, 8, "max_n_hits must be >= 1");
+  ARGCHECK(
+      num_balls > 0 && num_balls <= self->max_num_balls,
+      9,
+      "num_balls must be >0 and less than max num balls!");
+  ARGCHECK(
+      cam.film_width == self->cam.film_width &&
+          cam.film_height == self->cam.film_height,
+      5,
+      "cam film size must agree");
+  ARGCHECK(mode <= 1, 10, "mode must be <= 1!");
+  if (percent_allowed_difference < EPS) {
+    LOG(WARNING) << "percent_allowed_difference < " << FEPS << "! Clamping to "
+                 << FEPS << ".";
+    percent_allowed_difference = FEPS;
+  }
+  ARGCHECK(
+      pos_x < cam.film_width && pos_y < cam.film_height,
+      15,
+      "pos_x must be < width and pos_y < height.");
+  if (percent_allowed_difference > 1.f - FEPS) {
+    LOG(WARNING) << "percent_allowed_difference > " << (1.f - FEPS)
+                 << "! Clamping to " << (1.f - FEPS) << ".";
+    percent_allowed_difference = 1.f - FEPS;
+  }
+  LOG_IF(INFO, PULSAR_LOG_RENDER)
+      << "Rendering debug backward pass for x: " << pos_x << ", y: " << pos_y;
+  // Update camera.
+  self->cam.eye = cam.eye;
+  self->cam.pixel_0_0_center = cam.pixel_0_0_center - cam.eye;
+  self->cam.pixel_dir_x = cam.pixel_dir_x;
+  self->cam.pixel_dir_y = cam.pixel_dir_y;
+  self->cam.sensor_dir_z = cam.sensor_dir_z;
+  self->cam.half_pixel_size = cam.half_pixel_size;
+  self->cam.focal_length = cam.focal_length;
+  self->cam.aperture_width = cam.aperture_width;
+  self->cam.aperture_height = cam.aperture_height;
+  self->cam.min_dist = cam.min_dist;
+  self->cam.max_dist = cam.max_dist;
+  self->cam.norm_fac = cam.norm_fac;
+  self->cam.principal_point_offset_x = cam.principal_point_offset_x;
+  self->cam.principal_point_offset_y = cam.principal_point_offset_y;
+  self->cam.film_border_left = cam.film_border_left;
+  self->cam.film_border_top = cam.film_border_top;
+  LAUNCH_MAX_PARALLEL_1D(
+      calc_signature<DEV>,
+      num_balls,
+      stream,
+      *self,
+      reinterpret_cast<const float3*>(vert_pos),
+      vert_col,
+      vert_rad,
+      num_balls);
+  CHECKLAUNCH();
+  MEMSET(self->grad_pos_d, 0, float3, num_balls, stream);
+  MEMSET(self->grad_col_d, 0, float, num_balls * self->cam.n_channels, stream);
+  MEMSET(self->grad_rad_d, 0, float, num_balls, stream);
+  MEMSET(self->grad_cam_d, 0, float, 12, stream);
+  MEMSET(self->grad_cam_buf_d, 0, CamGradInfo, num_balls, stream);
+  MEMSET(self->grad_opy_d, 0, float, num_balls, stream);
+  MEMSET(self->ids_sorted_d, 0, int, num_balls, stream);
+  LAUNCH_MAX_PARALLEL_2D(
+      calc_gradients<DEV>,
+      (int64_t)1,
+      (int64_t)1,
+      stream,
+      self->cam,
+      grad_im,
+      gamma,
+      reinterpret_cast<const float3*>(vert_pos),
+      vert_col,
+      vert_rad,
+      vert_opy_d,
+      num_balls,
+      image,
+      forw_info,
+      self->di_d,
+      self->ii_d,
+      dif_pos,
+      dif_col,
+      dif_rad,
+      dif_cam,
+      dif_opy,
+      self->grad_rad_d,
+      self->grad_col_d,
+      self->grad_pos_d,
+      self->grad_cam_buf_d,
+      self->grad_opy_d,
+      self->ids_sorted_d,
+      self->n_track,
+      pos_x,
+      pos_y);
+  CHECKLAUNCH();
+  // We're not doing sphere gradient normalization here.
+  SUM_WS(
+      self->grad_cam_buf_d,
+      reinterpret_cast<CamGradInfo*>(self->grad_cam_d),
+      static_cast<int>(1),
+      self->workspace_d,
+      self->workspace_size,
+      stream);
+  CHECKLAUNCH();
+  // We're not doing camera gradient normalization here.
+  LOG_IF(INFO, PULSAR_LOG_RENDER) << "Debug backward pass complete.";
+}
+} // namespace Renderer
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.backward_dbg.instantiate.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.backward_dbg.instantiate.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "./renderer.backward_dbg.device.h"
+namespace pulsar {
+namespace Renderer {
+template void backward_dbg<ISONDEVICE>(
+    Renderer* self,
+    const float* grad_im,
+    const float* image,
+    const float* forw_info,
+    const float* vert_pos,
+    const float* vert_col,
+    const float* vert_rad,
+    const CamInfo& cam,
+    const float& gamma,
+    float percent_allowed_difference,
+    const uint& max_n_hits,
+    const float* vert_opy,
+    const size_t& num_balls,
+    const uint& mode,
+    const bool& dif_pos,
+    const bool& dif_col,
+    const bool& dif_rad,
+    const bool& dif_cam,
+    const bool& dif_opy,
+    const uint& pos_x,
+    const uint& pos_y,
+    cudaStream_t stream);
+} // namespace Renderer
+} // namespace pulsar
--- a/pytorch3d/csrc/pulsar/include/renderer.calc_gradients.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.calc_gradients.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_CALC_GRADIENTS_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_CALC_GRADIENTS_H_
+#include "../global.h"
+#include "./commands.h"
+#include "./renderer.h"
+#include "./renderer.draw.device.h"
+namespace pulsar {
+namespace Renderer {
+template <bool DEV>
+GLOBAL void calc_gradients(
+    const CamInfo cam, /** Camera in world coordinates. */
+    float const* const RESTRICT grad_im, /** The gradient image. */
+    const float
+        gamma, /** The transparency parameter used in the forward pass. */
+    float3 const* const RESTRICT vert_poss, /** Vertex position vector. */
+    float const* const RESTRICT vert_cols, /** Vertex color vector. */
+    float const* const RESTRICT vert_rads, /** Vertex radius vector. */
+    float const* const RESTRICT opacity, /** Vertex opacity. */
+    const uint num_balls, /** Number of balls. */
+    float const* const RESTRICT result_d, /** Result image. */
+    float const* const RESTRICT forw_info_d, /** Forward pass info. */
+    DrawInfo const* const RESTRICT di_d, /** Draw information. */
+    IntersectInfo const* const RESTRICT ii_d, /** Intersect information. */
+    // Mode switches.
+    const bool calc_grad_pos,
+    const bool calc_grad_col,
+    const bool calc_grad_rad,
+    const bool calc_grad_cam,
+    const bool calc_grad_opy,
+    // Out variables.
+    float* const RESTRICT grad_rad_d, /** Radius gradients. */
+    float* const RESTRICT grad_col_d, /** Color gradients. */
+    float3* const RESTRICT grad_pos_d, /** Position gradients. */
+    CamGradInfo* const RESTRICT grad_cam_buf_d, /** Camera gradient buffer. */
+    float* const RESTRICT grad_opy_d, /** Opacity gradient buffer. */
+    int* const RESTRICT
+        grad_contributed_d, /** Gradient contribution counter. */
+    // Infrastructure.
+    const int n_track,
+    const uint offs_x,
+    const uint offs_y /** Debug offsets. */
+) {
+  uint limit_x = cam.film_width, limit_y = cam.film_height;
+  if (offs_x != 0) {
+    // We're in debug mode.
+    limit_x = 1;
+    limit_y = 1;
+  }
+  GET_PARALLEL_IDS_2D(coord_x_base, coord_y_base, limit_x, limit_y);
+  // coord_x_base and coord_y_base are in the film coordinate system.
+  // We now need to translate to the aperture coordinate system. If
+  // the principal point was shifted left/up nothing has to be
+  // subtracted - only shift needs to be added in case it has been
+  // shifted down/right.
+  const uint film_coord_x = coord_x_base + offs_x;
+  const uint ap_coord_x = film_coord_x +
+      2 * static_cast<uint>(std::max(0, cam.principal_point_offset_x));
+  const uint film_coord_y = coord_y_base + offs_y;
+  const uint ap_coord_y = film_coord_y +
+      2 * static_cast<uint>(std::max(0, cam.principal_point_offset_y));
+  const float3 ray_dir = /** Ray cast through the pixel, normalized. */
+      cam.pixel_0_0_center + ap_coord_x * cam.pixel_dir_x +
+      ap_coord_y * cam.pixel_dir_y;
+  const float norm_ray_dir = length(ray_dir);
+  // ray_dir_norm *must* be calculated here in the same way as in the draw
+  // function to have the same values withno other numerical instabilities
+  // (for example, ray_dir * FRCP(norm_ray_dir) does not work)!
+  float3 ray_dir_norm; /** Ray cast through the pixel, normalized. */
+  float2 projected_ray; /** Ray intersection with the sensor. */
+  if (cam.orthogonal_projection) {
+    ray_dir_norm = cam.sensor_dir_z;
+    projected_ray.x = static_cast<float>(ap_coord_x);
+    projected_ray.y = static_cast<float>(ap_coord_y);
+  } else {
+    ray_dir_norm = normalize(
+        cam.pixel_0_0_center + ap_coord_x * cam.pixel_dir_x +
+        ap_coord_y * cam.pixel_dir_y);
+    // This is a reasonable assumption for normal focal lengths and image sizes.
+    PASSERT(FABS(ray_dir_norm.z) > FEPS);
+    projected_ray.x = ray_dir_norm.x / ray_dir_norm.z * cam.focal_length;
+    projected_ray.y = ray_dir_norm.y / ray_dir_norm.z * cam.focal_length;
+  }
+  float* result = const_cast<float*>(
+      result_d + film_coord_y * cam.film_width * cam.n_channels +
+      film_coord_x * cam.n_channels);
+  const float* grad_im_l = grad_im +
+      film_coord_y * cam.film_width * cam.n_channels +
+      film_coord_x * cam.n_channels;
+  // For writing...
+  float3 grad_pos;
+  float grad_rad, grad_opy;
+  CamGradInfo grad_cam_local = CamGradInfo();
+  // Set up shared infrastructure.
+  const int fwi_loc = film_coord_y * cam.film_width * (3 + 2 * n_track) +
+      film_coord_x * (3 + 2 * n_track);
+  float sm_m = forw_info_d[fwi_loc];
+  float sm_d = forw_info_d[fwi_loc + 1];
+  PULSAR_LOG_DEV_APIX(
+      PULSAR_LOG_GRAD,
+      "grad|sm_m: %f, sm_d: %f, result: "
+      "%f, %f, %f; grad_im: %f, %f, %f.\n",
+      sm_m,
+      sm_d,
+      result[0],
+      result[1],
+      result[2],
+      grad_im_l[0],
+      grad_im_l[1],
+      grad_im_l[2]);
+  // Start processing.
+  for (int grad_idx = 0; grad_idx < n_track; ++grad_idx) {
+    int sphere_idx;
+    FASI(forw_info_d[fwi_loc + 3 + 2 * grad_idx], sphere_idx);
+    PASSERT(
+        sphere_idx == -1 ||
+        sphere_idx >= 0 && static_cast<uint>(sphere_idx) < num_balls);
+    if (sphere_idx >= 0) {
+      // TODO: make more efficient.
+      grad_pos = make_float3(0.f, 0.f, 0.f);
+      grad_rad = 0.f;
+      grad_cam_local = CamGradInfo();
+      const DrawInfo di = di_d[sphere_idx];
+      grad_opy = 0.f;
+      draw(
+          di,
+          opacity == NULL ? 1.f : opacity[sphere_idx],
+          cam,
+          gamma,
+          ray_dir_norm,
+          projected_ray,
+          // Mode switches.
+          false, // draw only
+          calc_grad_pos,
+          calc_grad_col,
+          calc_grad_rad,
+          calc_grad_cam,
+          calc_grad_opy,
+          // Position info.
+          ap_coord_x,
+          ap_coord_y,
+          sphere_idx,
+          // Optional in.
+          &ii_d[sphere_idx],
+          &ray_dir,
+          &norm_ray_dir,
+          grad_im_l,
+          NULL,
+          // In/out
+          &sm_d,
+          &sm_m,
+          result,
+          // Optional out.
+          NULL,
+          NULL,
+          &grad_pos,
+          grad_col_d + sphere_idx * cam.n_channels,
+          &grad_rad,
+          &grad_cam_local,
+          &grad_opy);
+      ATOMICADD(&(grad_rad_d[sphere_idx]), grad_rad);
+      // Color has been added directly.
+      ATOMICADD_F3(&(grad_pos_d[sphere_idx]), grad_pos);
+      ATOMICADD_F3(
+          &(grad_cam_buf_d[sphere_idx].cam_pos), grad_cam_local.cam_pos);
+      if (!cam.orthogonal_projection) {
+        ATOMICADD_F3(
+            &(grad_cam_buf_d[sphere_idx].pixel_0_0_center),
+            grad_cam_local.pixel_0_0_center);
+      }
+      ATOMICADD_F3(
+          &(grad_cam_buf_d[sphere_idx].pixel_dir_x),
+          grad_cam_local.pixel_dir_x);
+      ATOMICADD_F3(
+          &(grad_cam_buf_d[sphere_idx].pixel_dir_y),
+          grad_cam_local.pixel_dir_y);
+      ATOMICADD(&(grad_opy_d[sphere_idx]), grad_opy);
+      ATOMICADD(&(grad_contributed_d[sphere_idx]), 1);
+    }
+  }
+  END_PARALLEL_2D_NORET();
+};
+} // namespace Renderer
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.calc_gradients.instantiate.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.calc_gradients.instantiate.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "./renderer.calc_gradients.device.h"
+namespace pulsar {
+namespace Renderer {
+template GLOBAL void calc_gradients<ISONDEVICE>(
+    const CamInfo cam, /** Camera in world coordinates. */
+    float const* const RESTRICT grad_im, /** The gradient image. */
+    const float
+        gamma, /** The transparency parameter used in the forward pass. */
+    float3 const* const RESTRICT vert_poss, /** Vertex position vector. */
+    float const* const RESTRICT vert_cols, /** Vertex color vector. */
+    float const* const RESTRICT vert_rads, /** Vertex radius vector. */
+    float const* const RESTRICT opacity, /** Vertex opacity. */
+    const uint num_balls, /** Number of balls. */
+    float const* const RESTRICT result_d, /** Result image. */
+    float const* const RESTRICT forw_info_d, /** Forward pass info. */
+    DrawInfo const* const RESTRICT di_d, /** Draw information. */
+    IntersectInfo const* const RESTRICT ii_d, /** Intersect information. */
+    // Mode switches.
+    const bool calc_grad_pos,
+    const bool calc_grad_col,
+    const bool calc_grad_rad,
+    const bool calc_grad_cam,
+    const bool calc_grad_opy,
+    // Out variables.
+    float* const RESTRICT grad_rad_d, /** Radius gradients. */
+    float* const RESTRICT grad_col_d, /** Color gradients. */
+    float3* const RESTRICT grad_pos_d, /** Position gradients. */
+    CamGradInfo* const RESTRICT grad_cam_buf_d, /** Camera gradient buffer. */
+    float* const RESTRICT grad_opy_d, /** Opacity gradient buffer. */
+    int* const RESTRICT
+        grad_contributed_d, /** Gradient contribution counter. */
+    // Infrastructure.
+    const int n_track,
+    const uint offs_x,
+    const uint offs_y);
+} // namespace Renderer
+} // namespace pulsar
--- a/pytorch3d/csrc/pulsar/include/renderer.calc_signature.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.calc_signature.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_CALC_SIGNATURE_DEVICE_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_CALC_SIGNATURE_DEVICE_H_
+#include "../global.h"
+#include "./camera.device.h"
+#include "./commands.h"
+#include "./math.h"
+#include "./renderer.get_screen_area.device.h"
+#include "./renderer.h"
+namespace pulsar {
+namespace Renderer {
+template <bool DEV>
+GLOBAL void calc_signature(
+    Renderer renderer,
+    float3 const* const RESTRICT vert_poss,
+    float const* const RESTRICT vert_cols,
+    float const* const RESTRICT vert_rads,
+    const uint num_balls) {
+  /* We're not using RESTRICT here for the pointers within `renderer`. Just one
+     value is being read from each of the pointers, so the effect would be
+     negligible or non-existent. */
+  GET_PARALLEL_IDX_1D(idx, num_balls);
+  // Create aliases.
+  // For reading...
+  const float3& vert_pos = vert_poss[idx]; /** Vertex position. */
+  const float* vert_col =
+      vert_cols + idx * renderer.cam.n_channels; /** Vertex color. */
+  const float& vert_rad = vert_rads[idx]; /** Vertex radius. */
+  const CamInfo& cam = renderer.cam; /** Camera in world coordinates. */
+  // For writing...
+  /** Ball ID (either original index of the ball or -1 if not visible). */
+  int& id_out = renderer.ids_d[idx];
+  /** Intersection helper structure for the ball. */
+  IntersectInfo& intersect_helper_out = renderer.ii_d[idx];
+  /** Draw helper structure for this ball. */
+  DrawInfo& draw_helper_out = renderer.di_d[idx];
+  /** Minimum possible intersection depth for this ball. */
+  float& closest_possible_intersect_out = renderer.min_depth_d[idx];
+  PULSAR_LOG_DEV(
+      PULSAR_LOG_CALC_SIGNATURE,
+      "signature %d|vert_pos: %.9f, %.9f, %.9f, vert_col (first three): "
+      "%.9f, %.9f, %.9f.\n",
+      idx,
+      vert_pos.x,
+      vert_pos.y,
+      vert_pos.z,
+      vert_col[0],
+      vert_col[1],
+      vert_col[2]);
+  // Set flags to invalid for a potential early return.
+  id_out = -1; // Invalid ID.
+  closest_possible_intersect_out =
+      MAX_FLOAT; // These spheres are sorted to the very end.
+  intersect_helper_out.max.x = MAX_USHORT; // No intersection possible.
+  intersect_helper_out.min.x = MAX_USHORT;
+  intersect_helper_out.max.y = MAX_USHORT;
+  intersect_helper_out.min.y = MAX_USHORT;
+  // Start processing.
+  /** Ball center in the camera coordinate system. */
+  const float3 ball_center_cam = vert_pos - cam.eye;
+  /** Distance to the ball center in the camera coordinate system. */
+  const float t_center = length(ball_center_cam);
+  /** Closest possible intersection with this ball from the camera. */
+  float closest_possible_intersect;
+  if (cam.orthogonal_projection) {
+    const float3 ball_center_cam_rot = rotate(
+        ball_center_cam,
+        cam.pixel_dir_x / length(cam.pixel_dir_x),
+        cam.pixel_dir_y / length(cam.pixel_dir_y),
+        cam.sensor_dir_z);
+    closest_possible_intersect = ball_center_cam_rot.z - vert_rad;
+  } else {
+    closest_possible_intersect = t_center - vert_rad;
+  }
+  PULSAR_LOG_DEV(
+      PULSAR_LOG_CALC_SIGNATURE,
+      "signature %d|t_center: %f. vert_rad: %f. "
+      "closest_possible_intersect: %f.\n",
+      idx,
+      t_center,
+      vert_rad,
+      closest_possible_intersect);
+  /**
+   * Corner points of the enclosing projected rectangle of the ball.
+   * They are first calculated in the camera coordinate system, then
+   * converted to the pixel coordinate system.
+   */
+  float x_1, x_2, y_1, y_2;
+  bool hits_screen_plane;
+  float3 ray_center_norm = ball_center_cam / t_center;
+  PASSERT(vert_rad >= 0.f);
+  if (closest_possible_intersect < cam.min_dist ||
+      closest_possible_intersect > cam.max_dist) {
+    PULSAR_LOG_DEV(
+        PULSAR_LOG_CALC_SIGNATURE,
+        "signature %d|ignoring sphere out of min/max bounds: %.9f, "
+        "min: %.9f, max: %.9f.\n",
+        idx,
+        closest_possible_intersect,
+        cam.min_dist,
+        cam.max_dist);
+    RETURN_PARALLEL();
+  }
+  // Find the relevant region on the screen plane.
+  hits_screen_plane = get_screen_area(
+      ball_center_cam,
+      ray_center_norm,
+      vert_rad,
+      cam,
+      idx,
+      &x_1,
+      &x_2,
+      &y_1,
+      &y_2);
+  if (!hits_screen_plane)
+    RETURN_PARALLEL();
+  PULSAR_LOG_DEV(
+      PULSAR_LOG_CALC_SIGNATURE,
+      "signature %d|in pixels: x_1: %f, x_2: %f, y_1: %f, y_2: %f.\n",
+      idx,
+      x_1,
+      x_2,
+      y_1,
+      y_2);
+  // Check whether the pixel coordinates are on screen.
+  if (FMAX(x_1, x_2) <= static_cast<float>(cam.film_border_left) ||
+      FMIN(x_1, x_2) >=
+          static_cast<float>(cam.film_border_left + cam.film_width) - 0.5f ||
+      FMAX(y_1, y_2) <= static_cast<float>(cam.film_border_top) ||
+      FMIN(y_1, y_2) >
+          static_cast<float>(cam.film_border_top + cam.film_height) - 0.5f)
+    RETURN_PARALLEL();
+  // Write results.
+  id_out = idx;
+  intersect_helper_out.min.x = static_cast<ushort>(
+      FMAX(FMIN(x_1, x_2), static_cast<float>(cam.film_border_left)));
+  intersect_helper_out.min.y = static_cast<ushort>(
+      FMAX(FMIN(y_1, y_2), static_cast<float>(cam.film_border_top)));
+  // In the following calculations, the max that needs to be stored is
+  // exclusive.
+  // That means that the calculated value needs to be `ceil`ed and incremented
+  // to find the correct value.
+  intersect_helper_out.max.x = static_cast<ushort>(FMIN(
+      FCEIL(FMAX(x_1, x_2)) + 1,
+      static_cast<float>(cam.film_border_left + cam.film_width)));
+  intersect_helper_out.max.y = static_cast<ushort>(FMIN(
+      FCEIL(FMAX(y_1, y_2)) + 1,
+      static_cast<float>(cam.film_border_top + cam.film_height)));
+  PULSAR_LOG_DEV(
+      PULSAR_LOG_CALC_SIGNATURE,
+      "signature %d|limits after refining: x_1: %u, x_2: %u, "
+      "y_1: %u, y_2: %u.\n",
+      idx,
+      intersect_helper_out.min.x,
+      intersect_helper_out.max.x,
+      intersect_helper_out.min.y,
+      intersect_helper_out.max.y);
+  if (intersect_helper_out.min.x == MAX_USHORT) {
+    id_out = -1;
+    RETURN_PARALLEL();
+  }
+  PULSAR_LOG_DEV(
+      PULSAR_LOG_CALC_SIGNATURE,
+      "signature %d|writing info. closest_possible_intersect: %.9f. "
+      "ray_center_norm: %.9f, %.9f, %.9f. t_center: %.9f. radius: %.9f.\n",
+      idx,
+      closest_possible_intersect,
+      ray_center_norm.x,
+      ray_center_norm.y,
+      ray_center_norm.z,
+      t_center,
+      vert_rad);
+  closest_possible_intersect_out = closest_possible_intersect;
+  draw_helper_out.ray_center_norm = ray_center_norm;
+  draw_helper_out.t_center = t_center;
+  draw_helper_out.radius = vert_rad;
+  if (cam.n_channels <= 3) {
+    draw_helper_out.first_color = vert_col[0];
+    for (uint c_id = 1; c_id < cam.n_channels; ++c_id) {
+      draw_helper_out.color_union.color[c_id - 1] = vert_col[c_id];
+    }
+  } else {
+    draw_helper_out.color_union.ptr = const_cast<float*>(vert_col);
+  }
+  END_PARALLEL();
+};
+} // namespace Renderer
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.calc_signature.instantiate.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.calc_signature.instantiate.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_CALC_SIGNATURE_INSTANTIATE_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_CALC_SIGNATURE_INSTANTIATE_H_
+#include "./renderer.calc_signature.device.h"
+namespace pulsar {
+namespace Renderer {
+template GLOBAL void calc_signature<ISONDEVICE>(
+    Renderer renderer,
+    float3 const* const RESTRICT vert_poss,
+    float const* const RESTRICT vert_cols,
+    float const* const RESTRICT vert_rads,
+    const uint num_balls);
+}
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.construct.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.construct.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_CONSTRUCT_DEVICE_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_CONSTRUCT_DEVICE_H_
+#include "../global.h"
+#include "./camera.device.h"
+#include "./commands.h"
+#include "./math.h"
+#include "./renderer.h"
+namespace pulsar {
+namespace Renderer {
+template <bool DEV>
+HOST void construct(
+    Renderer* self,
+    const size_t& max_num_balls,
+    const int& width,
+    const int& height,
+    const bool& orthogonal_projection,
+    const bool& right_handed_system,
+    const float& background_normalization_depth,
+    const uint& n_channels,
+    const uint& n_track) {
+  ARGCHECK(
+      (max_num_balls > 0 && max_num_balls < MAX_INT),
+      2,
+      ("the maximum number of balls must be >0 and <" +
+       std::to_string(MAX_INT) + ". Is " + std::to_string(max_num_balls) + ".")
+          .c_str());
+  ARGCHECK(width > 1, 3, "the image width must be > 1");
+  ARGCHECK(height > 1, 4, "the image height must be > 1");
+  ARGCHECK(
+      background_normalization_depth > 0.f &&
+          background_normalization_depth < 1.f,
+      6,
+      "background_normalization_depth must be in ]0., 1.[.");
+  ARGCHECK(n_channels > 0, 7, "n_channels must be >0!");
+  ARGCHECK(
+      n_track > 0 && n_track <= MAX_GRAD_SPHERES,
+      8,
+      ("n_track must be >0 and <" + std::to_string(MAX_GRAD_SPHERES) + ". Is " +
+       std::to_string(n_track) + ".")
+          .c_str());
+  self->cam.film_width = width;
+  self->cam.film_height = height;
+  self->max_num_balls = max_num_balls;
+  MALLOC(self->result_d, float, width* height* n_channels);
+  self->cam.orthogonal_projection = orthogonal_projection;
+  self->cam.right_handed = right_handed_system;
+  self->cam.background_normalization_depth = background_normalization_depth;
+  self->cam.n_channels = n_channels;
+  MALLOC(self->min_depth_d, float, max_num_balls);
+  MALLOC(self->min_depth_sorted_d, float, max_num_balls);
+  MALLOC(self->ii_d, IntersectInfo, max_num_balls);
+  MALLOC(self->ii_sorted_d, IntersectInfo, max_num_balls);
+  MALLOC(self->ids_d, int, max_num_balls);
+  MALLOC(self->ids_sorted_d, int, max_num_balls);
+  size_t sort_id_size = 0;
+  GET_SORT_WS_SIZE(&sort_id_size, float, int, max_num_balls);
+  CHECKLAUNCH();
+  size_t sort_ii_size = 0;
+  GET_SORT_WS_SIZE(&sort_ii_size, float, IntersectInfo, max_num_balls);
+  CHECKLAUNCH();
+  size_t sort_di_size = 0;
+  GET_SORT_WS_SIZE(&sort_di_size, float, DrawInfo, max_num_balls);
+  CHECKLAUNCH();
+  size_t select_ii_size = 0;
+  GET_SELECT_WS_SIZE(&select_ii_size, char, IntersectInfo, max_num_balls);
+  size_t select_di_size = 0;
+  GET_SELECT_WS_SIZE(&select_di_size, char, DrawInfo, max_num_balls);
+  size_t sum_size = 0;
+  GET_SUM_WS_SIZE(&sum_size, CamGradInfo, max_num_balls);
+  size_t sum_cont_size = 0;
+  GET_SUM_WS_SIZE(&sum_cont_size, int, max_num_balls);
+  size_t reduce_size = 0;
+  GET_REDUCE_WS_SIZE(
+      &reduce_size, IntersectInfo, IntersectInfoMinMax(), max_num_balls);
+  self->workspace_size = IMAX(
+      IMAX(IMAX(sort_id_size, sort_ii_size), sort_di_size),
+      IMAX(
+          IMAX(select_di_size, select_ii_size),
+          IMAX(IMAX(sum_size, sum_cont_size), reduce_size)));
+  MALLOC(self->workspace_d, char, self->workspace_size);
+  MALLOC(self->di_d, DrawInfo, max_num_balls);
+  MALLOC(self->di_sorted_d, DrawInfo, max_num_balls);
+  MALLOC(self->region_flags_d, char, max_num_balls);
+  MALLOC(self->num_selected_d, size_t, 1);
+  MALLOC(self->forw_info_d, float, width* height*(3 + 2 * n_track));
+  MALLOC(self->min_max_pixels_d, IntersectInfo, 1);
+  MALLOC(self->grad_pos_d, float3, max_num_balls);
+  MALLOC(self->grad_col_d, float, max_num_balls* n_channels);
+  MALLOC(self->grad_rad_d, float, max_num_balls);
+  MALLOC(self->grad_cam_d, float, 12);
+  MALLOC(self->grad_cam_buf_d, CamGradInfo, max_num_balls);
+  MALLOC(self->grad_opy_d, float, max_num_balls);
+  MALLOC(self->n_grad_contributions_d, int, 1);
+  self->n_track = static_cast<int>(n_track);
+}
+} // namespace Renderer
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.construct.instantiate.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.construct.instantiate.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_CONSTRUCT_INSTANTIATE_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_CONSTRUCT_INSTANTIATE_H_
+#include "./renderer.construct.device.h"
+namespace pulsar {
+namespace Renderer {
+template void construct<ISONDEVICE>(
+    Renderer* self,
+    const size_t& max_num_balls,
+    const int& width,
+    const int& height,
+    const bool& orthogonal_projection,
+    const bool& right_handed_system,
+    const float& background_normalization_depth,
+    const uint& n_channels,
+    const uint& n_track);
+}
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.create_selector.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.create_selector.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_CREATE_SELECTOR_DEVICE_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_CREATE_SELECTOR_DEVICE_H_
+#include "../global.h"
+#include "./commands.h"
+#include "./renderer.h"
+namespace pulsar {
+namespace Renderer {
+template <bool DEV>
+GLOBAL void create_selector(
+    IntersectInfo const* const RESTRICT ii_sorted_d,
+    const uint num_balls,
+    const int min_x,
+    const int max_x,
+    const int min_y,
+    const int max_y,
+    /* Out variables. */
+    char* RESTRICT region_flags_d) {
+  GET_PARALLEL_IDX_1D(idx, num_balls);
+  bool hit = (static_cast<int>(ii_sorted_d[idx].min.x) <= max_x) &&
+      (static_cast<int>(ii_sorted_d[idx].max.x) > min_x) &&
+      (static_cast<int>(ii_sorted_d[idx].min.y) <= max_y) &&
+      (static_cast<int>(ii_sorted_d[idx].max.y) > min_y);
+  region_flags_d[idx] = hit;
+  END_PARALLEL_NORET();
+}
+} // namespace Renderer
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.create_selector.instantiate.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.create_selector.instantiate.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_CREATE_SELECTOR_INSTANTIATE_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_CREATE_SELECTOR_INSTANTIATE_H_
+#include "./renderer.create_selector.device.h"
+namespace pulsar {
+namespace Renderer {
+template GLOBAL void create_selector<ISONDEVICE>(
+    IntersectInfo const* const RESTRICT ii_sorted_d,
+    const uint num_balls,
+    const int min_x,
+    const int max_x,
+    const int min_y,
+    const int max_y,
+    /* Out variables. */
+    char* RESTRICT region_flags_d);
+}
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.destruct.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.destruct.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_DESTRUCT_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_DESTRUCT_H_
+#include "../global.h"
+#include "./commands.h"
+#include "./renderer.h"
+namespace pulsar {
+namespace Renderer {
+template <bool DEV>
+HOST void destruct(Renderer* self) {
+  if (self->result_d != NULL)
+    FREE(self->result_d);
+  self->result_d = NULL;
+  if (self->min_depth_d != NULL)
+    FREE(self->min_depth_d);
+  self->min_depth_d = NULL;
+  if (self->min_depth_sorted_d != NULL)
+    FREE(self->min_depth_sorted_d);
+  self->min_depth_sorted_d = NULL;
+  if (self->ii_d != NULL)
+    FREE(self->ii_d);
+  self->ii_d = NULL;
+  if (self->ii_sorted_d != NULL)
+    FREE(self->ii_sorted_d);
+  self->ii_sorted_d = NULL;
+  if (self->ids_d != NULL)
+    FREE(self->ids_d);
+  self->ids_d = NULL;
+  if (self->ids_sorted_d != NULL)
+    FREE(self->ids_sorted_d);
+  self->ids_sorted_d = NULL;
+  if (self->workspace_d != NULL)
+    FREE(self->workspace_d);
+  self->workspace_d = NULL;
+  if (self->di_d != NULL)
+    FREE(self->di_d);
+  self->di_d = NULL;
+  if (self->di_sorted_d != NULL)
+    FREE(self->di_sorted_d);
+  self->di_sorted_d = NULL;
+  if (self->region_flags_d != NULL)
+    FREE(self->region_flags_d);
+  self->region_flags_d = NULL;
+  if (self->num_selected_d != NULL)
+    FREE(self->num_selected_d);
+  self->num_selected_d = NULL;
+  if (self->forw_info_d != NULL)
+    FREE(self->forw_info_d);
+  self->forw_info_d = NULL;
+  if (self->min_max_pixels_d != NULL)
+    FREE(self->min_max_pixels_d);
+  self->min_max_pixels_d = NULL;
+  if (self->grad_pos_d != NULL)
+    FREE(self->grad_pos_d);
+  self->grad_pos_d = NULL;
+  if (self->grad_col_d != NULL)
+    FREE(self->grad_col_d);
+  self->grad_col_d = NULL;
+  if (self->grad_rad_d != NULL)
+    FREE(self->grad_rad_d);
+  self->grad_rad_d = NULL;
+  if (self->grad_cam_d != NULL)
+    FREE(self->grad_cam_d);
+  self->grad_cam_d = NULL;
+  if (self->grad_cam_buf_d != NULL)
+    FREE(self->grad_cam_buf_d);
+  self->grad_cam_buf_d = NULL;
+  if (self->grad_opy_d != NULL)
+    FREE(self->grad_opy_d);
+  self->grad_opy_d = NULL;
+  if (self->n_grad_contributions_d != NULL)
+    FREE(self->n_grad_contributions_d);
+  self->n_grad_contributions_d = NULL;
+}
+} // namespace Renderer
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.destruct.instantiate.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.destruct.instantiate.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_DESTRUCT_INSTANTIATE_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_DESTRUCT_INSTANTIATE_H_
+#include "./renderer.destruct.device.h"
+namespace pulsar {
+namespace Renderer {
+template void destruct<ISONDEVICE>(Renderer* self);
+}
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.draw.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.draw.device.h
--- a/pytorch3d/csrc/pulsar/include/renderer.fill_bg.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.fill_bg.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_FILL_BG_DEVICE_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_FILL_BG_DEVICE_H_
+#include "../global.h"
+#include "./camera.h"
+#include "./commands.h"
+#include "./renderer.h"
+namespace pulsar {
+namespace Renderer {
+template <bool DEV>
+GLOBAL void fill_bg(
+    Renderer renderer,
+    const CamInfo cam,
+    float const* const bg_col_d,
+    const float gamma,
+    const uint mode) {
+  GET_PARALLEL_IDS_2D(coord_x, coord_y, cam.film_width, cam.film_height);
+  int write_loc = coord_y * cam.film_width * (3 + 2 * renderer.n_track) +
+      coord_x * (3 + 2 * renderer.n_track);
+  if (renderer.forw_info_d[write_loc + 1] // sm_d
+      == 0.f) {
+    // This location has not been processed yet.
+    // Write first the forw_info:
+    // sm_m
+    renderer.forw_info_d[write_loc] =
+        cam.background_normalization_depth / gamma;
+    // sm_d
+    renderer.forw_info_d[write_loc + 1] = 1.f;
+    // max_closest_possible_intersection_hit
+    renderer.forw_info_d[write_loc + 2] = -1.f;
+    // sphere IDs and intersection depths.
+    for (int i = 0; i < renderer.n_track; ++i) {
+      int sphere_id = -1;
+      IASF(sphere_id, renderer.forw_info_d[write_loc + 3 + i * 2]);
+      renderer.forw_info_d[write_loc + 3 + i * 2 + 1] = -1.f;
+    }
+    if (mode == 0) {
+      // Image background.
+      for (int i = 0; i < cam.n_channels; ++i) {
+        renderer.result_d
+            [coord_y * cam.film_width * cam.n_channels +
+             coord_x * cam.n_channels + i] = bg_col_d[i];
+      }
+    }
+  }
+  END_PARALLEL_2D_NORET();
+};
+} // namespace Renderer
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/renderer.fill_bg.instantiate.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.fill_bg.instantiate.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "./renderer.fill_bg.device.h"
+namespace pulsar {
+namespace Renderer {
+template GLOBAL void fill_bg<ISONDEVICE>(
+    Renderer renderer,
+    const CamInfo norm,
+    float const* const bg_col_d,
+    const float gamma,
+    const uint mode);
+} // namespace Renderer
+} // namespace pulsar
--- a/pytorch3d/csrc/pulsar/include/renderer.forward.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.forward.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_RENDERER_FORWARD_DEVICE_H_
+#define PULSAR_NATIVE_INCLUDE_RENDERER_FORWARD_DEVICE_H_
+#include "../global.h"
+#include "./camera.device.h"
+#include "./commands.h"
+#include "./math.h"
+#include "./renderer.h"
+namespace pulsar {
+namespace Renderer {
+template <bool DEV>
+void forward(
+    Renderer* self,
+    const float* vert_pos,
+    const float* vert_col,
+    const float* vert_rad,
+    const CamInfo& cam,
+    const float& gamma,
+    float percent_allowed_difference,
+    const uint& max_n_hits,
+    const float* bg_col_d,
+    const float* opacity_d,
+    const size_t& num_balls,
+    const uint& mode,
+    cudaStream_t stream) {
+  ARGCHECK(gamma > 0.f && gamma <= 1.f, 6, "gamma must be in [0., 1.]");
+  ARGCHECK(
+      percent_allowed_difference >= 0.f && percent_allowed_difference <= 1.f,
+      7,
+      "percent_allowed_difference must be in [0., 1.]");
+  ARGCHECK(max_n_hits >= 1u, 8, "max_n_hits must be >= 1");
+  ARGCHECK(
+      num_balls > 0 && num_balls <= self->max_num_balls,
+      9,
+      ("num_balls must be >0 and <= max num balls! (" +
+       std::to_string(num_balls) + " vs. " +
+       std::to_string(self->max_num_balls) + ")")
+          .c_str());
+  ARGCHECK(
+      cam.film_width == self->cam.film_width &&
+          cam.film_height == self->cam.film_height,
+      5,
+      "cam result width and height must agree");
+  ARGCHECK(mode <= 1, 10, "mode must be <= 1!");
+  if (percent_allowed_difference > 1.f - FEPS) {
+    LOG(WARNING) << "percent_allowed_difference > " << (1.f - FEPS)
+                 << "! Clamping to " << (1.f - FEPS) << ".";
+    percent_allowed_difference = 1.f - FEPS;
+  }
+  LOG_IF(INFO, PULSAR_LOG_RENDER) << "Rendering forward pass...";
+  // Update camera and transform into a new virtual camera system with
+  // centered principal point and subsection rendering.
+  self->cam.eye = cam.eye;
+  self->cam.pixel_0_0_center = cam.pixel_0_0_center - cam.eye;
+  self->cam.pixel_dir_x = cam.pixel_dir_x;
+  self->cam.pixel_dir_y = cam.pixel_dir_y;
+  self->cam.sensor_dir_z = cam.sensor_dir_z;
+  self->cam.half_pixel_size = cam.half_pixel_size;
+  self->cam.focal_length = cam.focal_length;
+  self->cam.aperture_width = cam.aperture_width;
+  self->cam.aperture_height = cam.aperture_height;
+  self->cam.min_dist = cam.min_dist;
+  self->cam.max_dist = cam.max_dist;
+  self->cam.norm_fac = cam.norm_fac;
+  self->cam.principal_point_offset_x = cam.principal_point_offset_x;
+  self->cam.principal_point_offset_y = cam.principal_point_offset_y;
+  self->cam.film_border_left = cam.film_border_left;
+  self->cam.film_border_top = cam.film_border_top;
+#ifdef PULSAR_TIMINGS_ENABLED
+  START_TIME(calc_signature);
+#endif
+  LAUNCH_MAX_PARALLEL_1D(
+      calc_signature<DEV>,
+      num_balls,
+      stream,
+      *self,
+      reinterpret_cast<const float3*>(vert_pos),
+      vert_col,
+      vert_rad,
+      num_balls);
+  CHECKLAUNCH();
+#ifdef PULSAR_TIMINGS_ENABLED
+  STOP_TIME(calc_signature);
+  START_TIME(sort);
+#endif
+  SORT_ASCENDING_WS(
+      self->min_depth_d,
+      self->min_depth_sorted_d,
+      self->ids_d,
+      self->ids_sorted_d,
+      num_balls,
+      self->workspace_d,
+      self->workspace_size,
+      stream);
+  SORT_ASCENDING_WS(
+      self->min_depth_d,
+      self->min_depth_sorted_d,
+      self->ii_d,
+      self->ii_sorted_d,
+      num_balls,
+      self->workspace_d,
+      self->workspace_size,
+      stream);
+  SORT_ASCENDING_WS(
+      self->min_depth_d,
+      self->min_depth_sorted_d,
+      self->di_d,
+      self->di_sorted_d,
+      num_balls,
+      self->workspace_d,
+      self->workspace_size,
+      stream);
+  CHECKLAUNCH();
+#ifdef PULSAR_TIMINGS_ENABLED
+  STOP_TIME(sort);
+  START_TIME(minmax);
+#endif
+  IntersectInfo pixel_minmax;
+  pixel_minmax.min.x = MAX_USHORT;
+  pixel_minmax.min.y = MAX_USHORT;
+  pixel_minmax.max.x = 0;
+  pixel_minmax.max.y = 0;
+  REDUCE_WS(
+      self->ii_sorted_d,
+      self->min_max_pixels_d,
+      num_balls,
+      IntersectInfoMinMax(),
+      pixel_minmax,
+      self->workspace_d,
+      self->workspace_size,
+      stream);
+  COPY_DEV_HOST(&pixel_minmax, self->min_max_pixels_d, IntersectInfo, 1);
+  LOG_IF(INFO, PULSAR_LOG_RENDER)
+      << "Region with pixels to render: " << pixel_minmax.min.x << ":"
+      << pixel_minmax.max.x << " (x), " << pixel_minmax.min.y << ":"
+      << pixel_minmax.max.y << " (y).";
+#ifdef PULSAR_TIMINGS_ENABLED
+  STOP_TIME(minmax);
+  START_TIME(render);
+#endif
+  MEMSET(
+      self->result_d,
+      0,
+      float,
+      self->cam.film_width * self->cam.film_height * self->cam.n_channels,
+      stream);
+  MEMSET(
+      self->forw_info_d,
+      0,
+      float,
+      self->cam.film_width * self->cam.film_height * (3 + 2 * self->n_track),
+      stream);
+  if (pixel_minmax.max.y > pixel_minmax.min.y &&
+      pixel_minmax.max.x > pixel_minmax.min.x) {
+    PASSERT(
+        pixel_minmax.min.x >= static_cast<ushort>(self->cam.film_border_left) &&
+        pixel_minmax.min.x <
+            static_cast<ushort>(
+                self->cam.film_border_left + self->cam.film_width) &&
+        pixel_minmax.max.x <=
+            static_cast<ushort>(
+                self->cam.film_border_left + self->cam.film_width) &&
+        pixel_minmax.min.y >= static_cast<ushort>(self->cam.film_border_top) &&
+        pixel_minmax.min.y <
+            static_cast<ushort>(
+                self->cam.film_border_top + self->cam.film_height) &&
+        pixel_minmax.max.y <=
+            static_cast<ushort>(
+                self->cam.film_border_top + self->cam.film_height));
+    // Cut the image in 3x3 regions.
+    int y_step = RENDER_BLOCK_SIZE *
+        iDivCeil(pixel_minmax.max.y - pixel_minmax.min.y,
+                 3u * RENDER_BLOCK_SIZE);
+    int x_step = RENDER_BLOCK_SIZE *
+        iDivCeil(pixel_minmax.max.x - pixel_minmax.min.x,
+                 3u * RENDER_BLOCK_SIZE);
+    LOG_IF(INFO, PULSAR_LOG_RENDER) << "Using image slices of size " << x_step
+                                    << ", " << y_step << " (W, H).";
+    for (int y_min = pixel_minmax.min.y; y_min < pixel_minmax.max.y;
+         y_min += y_step) {
+      for (int x_min = pixel_minmax.min.x; x_min < pixel_minmax.max.x;
+           x_min += x_step) {
+        // Create region selection.
+        LAUNCH_MAX_PARALLEL_1D(
+            create_selector<DEV>,
+            num_balls,
+            stream,
+            self->ii_sorted_d,
+            num_balls,
+            x_min,
+            x_min + x_step,
+            y_min,
+            y_min + y_step,
+            self->region_flags_d);
+        CHECKLAUNCH();
+        SELECT_FLAGS_WS(
+            self->region_flags_d,
+            self->ii_sorted_d,
+            self->ii_d,
+            self->num_selected_d,
+            num_balls,
+            self->workspace_d,
+            self->workspace_size,
+            stream);
+        CHECKLAUNCH();
+        SELECT_FLAGS_WS(
+            self->region_flags_d,
+            self->di_sorted_d,
+            self->di_d,
+            self->num_selected_d,
+            num_balls,
+            self->workspace_d,
+            self->workspace_size,
+            stream);
+        CHECKLAUNCH();
+        SELECT_FLAGS_WS(
+            self->region_flags_d,
+            self->ids_sorted_d,
+            self->ids_d,
+            self->num_selected_d,
+            num_balls,
+            self->workspace_d,
+            self->workspace_size,
+            stream);
+        CHECKLAUNCH();
+        LAUNCH_PARALLEL_2D(
+            render<DEV>,
+            x_step,
+            y_step,
+            RENDER_BLOCK_SIZE,
+            RENDER_BLOCK_SIZE,
+            stream,
+            self->num_selected_d,
+            self->ii_d,
+            self->di_d,
+            self->min_depth_d,
+            self->ids_d,
+            opacity_d,
+            self->cam,
+            gamma,
+            percent_allowed_difference,
+            max_n_hits,
+            bg_col_d,
+            mode,
+            x_min,
+            y_min,
+            x_step,
+            y_step,
+            self->result_d,
+            self->forw_info_d,
+            self->n_track);
+        CHECKLAUNCH();
+      }
+    }
+  }
+  if (mode == 0) {
+    LAUNCH_MAX_PARALLEL_2D(
+        fill_bg<DEV>,
+        static_cast<int64_t>(self->cam.film_width),
+        static_cast<int64_t>(self->cam.film_height),
+        stream,
+        *self,
+        self->cam,
+        bg_col_d,
+        gamma,
+        mode);
+    CHECKLAUNCH();
+  }
+#ifdef PULSAR_TIMINGS_ENABLED
+  STOP_TIME(render);
+  float time_ms;
+  // This blocks the result and prevents batch-processing from parallelizing.
+  GET_TIME(calc_signature, &time_ms);
+  std::cout << "Time for signature calculation: " << time_ms << " ms"
+            << std::endl;
+  GET_TIME(sort, &time_ms);
+  std::cout << "Time for sorting: " << time_ms << " ms" << std::endl;
+  GET_TIME(minmax, &time_ms);
+  std::cout << "Time for minmax pixel calculation: " << time_ms << " ms"
+            << std::endl;
+  GET_TIME(render, &time_ms);
+  std::cout << "Time for rendering: " << time_ms << " ms" << std::endl;
+#endif
+  LOG_IF(INFO, PULSAR_LOG_RENDER) << "Forward pass complete.";
+}
+} // namespace Renderer
+} // namespace pulsar
+#endif