pulsar integration.

Summary: This diff integrates the pulsar renderer source code into PyTorch3D as an alternative backend for the PyTorch3D point renderer. This diff is the first of a series of three diffs to complete that migration and focuses on the packaging and integration of the source code. For more information about the pulsar backend, see the release notes and the paper (https://arxiv.org/abs/2004.07484). For information on how to use the backend, see the point cloud rendering notebook and the examples in the folder `docs/examples`. Tasks addressed in the following diffs: * Add the PyTorch3D interface, * Add notebook examples and documentation (or adapt the existing ones to feature both interfaces). Reviewed By: nikhilaravi Differential Revision: D23947736 fbshipit-source-id: a5e77b53e6750334db22aefa89b4c079cda1b443

pulsar integration.
Summary: This diff integrates the pulsar renderer source code into PyTorch3D as an alternative backend for the PyTorch3D point renderer. This diff is the first of a series of three diffs to complete that migration and focuses on the packaging and integration of the source code. For more information about the pulsar backend, see the release notes and the paper (https://arxiv.org/abs/2004.07484). For information on how to use the backend, see the point cloud rendering notebook and the examples in the folder `docs/examples`. Tasks addressed in the following diffs: * Add the PyTorch3D interface, * Add notebook examples and documentation (or adapt the existing ones to feature both interfaces). Reviewed By: nikhilaravi Differential Revision: D23947736 fbshipit-source-id: a5e77b53e6750334db22aefa89b4c079cda1b443
b19fe1de · Christoph Lassner · Facebook GitHub Bot · d5650323 · b19fe1de · b19fe1de
Commit b19fe1de authored Nov 03, 2020 by Christoph Lassner Committed by Facebook GitHub Bot Nov 03, 2020
20 changed files
--- a/pytorch3d/csrc/pulsar/global.h
+++ b/pytorch3d/csrc/pulsar/global.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_GLOBAL_H
+#define PULSAR_GLOBAL_H
+#include "./constants.h"
+#ifndef WIN32
+#include <csignal>
+#endif
+#if defined(_WIN64) || defined(_WIN32)
+#define uint unsigned int // Supplied as a macro on Windows builds; presumably because the `uint` typedef is not provided there (confirm).
+#define ushort unsigned short // Same as above, for `ushort`.
+#endif
+#include "./logging.h" // <- include before torch/extension.h
+#define MAX_GRAD_SPHERES 128 // Capacity of the fixed-size arrays in ClosestSphereTracker.
+#ifdef __CUDACC__
+#define INLINE __forceinline__ // Qualifier macros for the build where __CUDACC__ is defined.
+#define HOST __host__
+#define DEVICE __device__
+#define GLOBAL __global__
+#define RESTRICT __restrict__
+#define DEBUGBREAK() // Expands to nothing when __CUDACC__ is defined.
+#pragma diag_suppress = attribute_not_allowed
+#pragma diag_suppress = 1866
+#pragma diag_suppress = 2941
+#pragma diag_suppress = 2951
+#pragma diag_suppress = 2967
+#else // __CUDACC__
+#define INLINE inline // Qualifier macros for the build where __CUDACC__ is not defined.
+#define HOST // The CUDA qualifiers expand to nothing in this branch.
+#define DEVICE
+#define GLOBAL
+#define RESTRICT
+#define DEBUGBREAK() std::raise(SIGINT) // Raises SIGINT. NOTE(review): <csignal> is only included above when WIN32 is undefined; confirm std::raise/SIGINT are declared on Windows builds.
+// Don't care about pytorch warnings; they shouldn't clutter our warnings.
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Weverything"
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#pragma clang diagnostic pop
+namespace py = pybind11;
+// Builds a `float3` from three scalars. This definition is only compiled in
+// the branch where `__CUDACC__` is not defined.
+inline float3 make_float3(const float& x, const float& y, const float& z) {
+  float3 packed;
+  packed.z = z;
+  packed.y = y;
+  packed.x = x;
+  return packed;
+}
+// Exact, component-wise equality of two `float3` values (no tolerance).
+inline bool operator==(const float3& a, const float3& b) {
+  if (!(a.x == b.x)) {
+    return false;
+  }
+  if (!(a.y == b.y)) {
+    return false;
+  }
+  return a.z == b.z;
+}
+#endif // __CUDACC__
+#define IHD INLINE HOST DEVICE // Shorthand for functions that are inline and usable on both host and device.
+// An assertion command that can be used on host and device. It is only active when PULSAR_ASSERTIONS is defined; otherwise PASSERT expands to nothing. Both active variants print file, line and the failed expression; the host variant additionally raises SIGINT. NOTE(review): the macro expands to a bare `if` block, so do not place it directly in front of an `else`.
+#ifdef PULSAR_ASSERTIONS
+#ifdef __CUDACC__
+#define PASSERT(VAL)                                     \
+  if (!(VAL)) {                                          \
+    printf(                                              \
+        "Pulsar assertion failed in %s, line %d: %s.\n", \
+        __FILE__,                                        \
+        __LINE__,                                        \
+        #VAL);                                           \
+  }
+#else
+#define PASSERT(VAL)                                     \
+  if (!(VAL)) {                                          \
+    printf(                                              \
+        "Pulsar assertion failed in %s, line %d: %s.\n", \
+        __FILE__,                                        \
+        __LINE__,                                        \
+        #VAL);                                           \
+    std::raise(SIGINT);                                  \
+  }
+#endif
+#else
+#define PASSERT(VAL)
+#endif
+#endif
--- a/pytorch3d/csrc/pulsar/host/README.md
+++ b/pytorch3d/csrc/pulsar/host/README.md
+# Device-specific host compilation units
+This folder contains `.cpp` files that create the host (CPU) compilation units
+for the device-specific functions. See `../include/README.md` for
+more information.
--- a/pytorch3d/csrc/pulsar/host/commands.h
+++ b/pytorch3d/csrc/pulsar/host/commands.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_COMMANDS_H_
+#define PULSAR_NATIVE_COMMANDS_H_
+#ifdef _MSC_VER
+#include <intrin.h>
+#define __builtin_popcount (int)__popcnt // Under MSVC, route __builtin_popcount (used by POPC) to the __popcnt intrinsic from <intrin.h>.
+#endif
+// Definitions for CPU commands.
+// #include <execution>
+// #include <numeric>
+// Host stand-ins for the thread-group types used by the device code. On the
+// host every "group" consists of exactly one thread, so ranks are 0, sizes are
+// 1 and synchronisation is a no-op.
+namespace cg {
+struct coalesced_group {
+  // Rank of the calling thread inside the group: always 0.
+  INLINE uint thread_rank() const {
+    return 0u;
+  }
+  // Number of threads in the group: always 1.
+  INLINE uint size() const {
+    return 1u;
+  }
+  // Returns 1 if `val` is non-zero for the single thread, otherwise 0.
+  INLINE uint ballot(uint val) const {
+    return val > 0 ? 1u : 0u;
+  }
+};
+struct thread_block {
+  // Rank of the calling thread inside the block: always 0.
+  INLINE uint thread_rank() const {
+    return 0u;
+  }
+  // Number of threads in the block: always 1.
+  INLINE uint size() const {
+    return 1u;
+  }
+  // Nothing to synchronise with on the host.
+  INLINE void sync() const {}
+};
+// Returns a default-constructed single-thread group.
+INLINE coalesced_group coalesced_threads() {
+  return coalesced_group{};
+}
+// Returns a default-constructed single-thread block.
+INLINE thread_block this_thread_block() {
+  return thread_block{};
+}
+} // namespace cg
+#define SHFL_SYNC(a, b, c) (b) // Host: evaluates to the second argument; the first and third are discarded.
+// Host versions of the warp-level reductions. With a single thread per group
+// each of them returns `base` unchanged; `group` and `mask` are ignored.
+template <typename T>
+T WARP_CUMSUM(
+    const cg::coalesced_group& group,
+    const uint& mask,
+    const T& base) {
+  (void)group;
+  (void)mask;
+  return base;
+}
+// See the note on WARP_CUMSUM: identity on the host.
+template <typename T>
+DEVICE T
+WARP_MAX(const cg::coalesced_group& group, const uint& mask, const T& base) {
+  (void)group;
+  (void)mask;
+  return base;
+}
+// See the note on WARP_CUMSUM: identity on the host.
+template <typename T>
+DEVICE T
+WARP_SUM(const cg::coalesced_group& group, const uint& mask, const T& base) {
+  (void)group;
+  (void)mask;
+  return base;
+}
+// `float3` flavour of WARP_SUM: identity on the host.
+INLINE DEVICE float3 WARP_SUM_FLOAT3(
+    const cg::coalesced_group& group,
+    const uint& mask,
+    const float3& base) {
+  (void)group;
+  (void)mask;
+  return base;
+}
+// Synchronisation / warp primitives. On the host most of them expand to
+// nothing because there is only a single thread.
+#define ACTIVEMASK() (1u << 31)
+#define ALIGN(VAL)
+#define SYNC()
+#define THREADFENCE_B()
+// `val` is parenthesised so that an expression argument such as `a & b` is
+// compared against zero as a whole (`!=` binds tighter than `&`).
+#define BALLOT(mask, val) ((val) != 0)
+#define SHARED
+// Floating point.
+#define FMAX(a, b) std::fmax((a), (b))
+#define FMIN(a, b) std::fmin((a), (b))
+// Non-atomic host version: stores max(*address, val) in *address and returns
+// the value that was stored.
+// NOTE(review): ATOMICADD in this file returns the *previous* value instead;
+// confirm which convention callers of atomicMax expect.
+INLINE float atomicMax(float* address, float val) {
+  const float updated = std::max(*address, val);
+  *address = updated;
+  return updated;
+}
+// Non-atomic host version: stores min(*address, val) in *address and returns
+// the value that was stored.
+// NOTE(review): ATOMICADD in this file returns the *previous* value instead;
+// confirm which convention callers of atomicMin expect.
+INLINE float atomicMin(float* address, float val) {
+  const float updated = std::min(*address, val);
+  *address = updated;
+  return updated;
+}
+// Host implementations of the floating point command macros.
+#define FMUL(a, b) ((a) * (b))
+#define FDIV(a, b) ((a) / (b))
+#define FSUB(a, b) ((a) - (b))
+// NOTE(review): the two branches use different comparisons (`<=` vs `<`);
+// confirm that this asymmetry is intended.
+#define FABSLEQAS(a, b, c) \
+  ((a) <= (b) ? FSUB((b), (a)) <= (c) : FSUB((a), (b)) < (c))
+#define FADD(a, b) ((a) + (b))
+#define FSQRT(a) sqrtf(a)
+#define FEXP(a) fasterexp(a)
+#define FLN(a) fasterlog(a)
+#define FPOW(a, b) powf((a), (b))
+#define FROUND(x) roundf(x)
+#define FCEIL(a) ceilf(a)
+#define FFLOOR(a) floorf(a)
+#define FSATURATE(x) std::max(0.f, std::min(1.f, (x)))
+// Uses std::abs so that the floating point overload is selected; an
+// unqualified `abs` may resolve to `int abs(int)` and silently truncate.
+#define FABS(a) std::abs((a))
+#define FMA(x, y, z) ((x) * (y) + (z))
+#define I2F(a) static_cast<float>(a)
+#define FRCP(x) (1.f / (x))
+// Bit-preserving copies between an int and a float object via memcpy.
+#define IASF(x, loc) memcpy(&(loc), &(x), sizeof(x))
+#define FASI(x, loc) memcpy(&(loc), &(x), sizeof(x))
+#define DMAX(a, b) std::max((a), (b))
+#define DMIN(a, b) std::min((a), (b))
+#define DSATURATE(a) DMIN(1., DMAX(0., (a)))
+#define DSQRT(a) sqrt(a)
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// uint. NOTE(review): `_clz` is not declared anywhere in this view; confirm that a host definition exists before CLZ is used on the CPU path.
+#define CLZ(VAL) _clz(VAL)
+// Non-atomic host version of an atomic add: adds `val` to `*address` and
+// returns the value `*address` held before the addition.
+template <typename T>
+INLINE T ATOMICADD(T* address, T val) {
+  const T previous = *address;
+  *address += val;
+  return previous;
+}
+// Component-wise ATOMICADD for a type with `x`, `y` and `z` members.
+template <typename T>
+INLINE void ATOMICADD_F3(T* address, T val) {
+  T& target = *address;
+  ATOMICADD(&target.x, val.x);
+  ATOMICADD(&target.y, val.y);
+  ATOMICADD(&target.z, val.z);
+}
+#define ATOMICADD_B(a, b) ATOMICADD((a), (b)) // On the host this simply forwards to ATOMICADD.
+#define POPC(a) __builtin_popcount(a) // Population count (number of set bits).
+// int. IMIN/IMAX forward to std::min/std::max, so both arguments must have the same type.
+#define IMIN(a, b) std::min((a), (b))
+#define IMAX(a, b) std::max((a), (b))
+#define IABS(a) abs(a)
+// Checks.
+#define CHECKOK THCheck
+#define ARGCHECK THArgCheck
+// Math.
+// Euclidean norm of (x, y, z) and its reciprocal. Every argument is
+// parenthesised so that expression arguments (e.g. `a + b`) are squared as a
+// whole instead of being split by operator precedence.
+#define NORM3DF(x, y, z) sqrtf((x) * (x) + (y) * (y) + (z) * (z))
+#define RNORM3DF(x, y, z) (1.f / sqrtf((x) * (x) + (y) * (y) + (z) * (z)))
+// High level. On the host PREFETCH expands to nothing and every GET_*_WS_SIZE macro stores 0 through RES_PTR, ignoring its other arguments. Note that these macros already end in `;`.
+#define PREFETCH(PTR)
+#define GET_SORT_WS_SIZE(RES_PTR, KEY_TYPE, VAL_TYPE, NUM_OBJECTS) \
+  *(RES_PTR) = 0;
+#define GET_REDUCE_WS_SIZE(RES_PTR, TYPE, REDUCE_OP, NUM_OBJECTS) \
+  *(RES_PTR) = 0;
+#define GET_SELECT_WS_SIZE(                              \
+    RES_PTR, TYPE_SELECTOR, TYPE_SELECTION, NUM_OBJECTS) \
+  *(RES_PTR) = 0;
+#define GET_SUM_WS_SIZE(RES_PTR, TYPE_SUM, NUM_OBJECTS) *(RES_PTR) = 0;
+#define GET_MM_WS_SIZE(RES_PTR, TYPE, NUM_OBJECTS) *(RES_PTR) = 0;
+// Host key/value sort in descending key order.
+//
+// Declares `std::vector<size_t> TMPN1` in the *calling* scope (TMPN1 must be a
+// fresh identifier), fills it with the permutation that sorts SORT_PTR in
+// descending order, and then writes the permuted keys to SORTED_PTR and the
+// permuted values to VAL_SORTED_PTR. Uses std::sort, so the relative order of
+// equal keys is unspecified. The index loops use size_t and are bounded by the
+// permutation size to avoid signed/unsigned mixing.
+#define SORT_DESCENDING(                                                     \
+    TMPN1, SORT_PTR, SORTED_PTR, VAL_PTR, VAL_SORTED_PTR, NUM_OBJECTS)       \
+  std::vector<size_t> TMPN1((NUM_OBJECTS));                                  \
+  std::iota(TMPN1.begin(), TMPN1.end(), 0);                                  \
+  const auto TMPN1##_val_ptr = (SORT_PTR);                                   \
+  std::sort(                                                                 \
+      TMPN1.begin(), TMPN1.end(), [&TMPN1##_val_ptr](size_t i1, size_t i2) { \
+        return TMPN1##_val_ptr[i1] > TMPN1##_val_ptr[i2];                    \
+      });                                                                    \
+  for (size_t i = 0; i < TMPN1.size(); ++i) {                                \
+    (SORTED_PTR)[i] = (SORT_PTR)[TMPN1[i]];                                  \
+  }                                                                          \
+  for (size_t i = 0; i < TMPN1.size(); ++i) {                                \
+    (VAL_SORTED_PTR)[i] = (VAL_PTR)[TMPN1[i]];                               \
+  }
+// Host key/value sort in ascending key order.
+//
+// Computes the permutation that sorts SORT_PTR ascending and writes the
+// permuted keys to SORTED_PTR and the permuted values to VAL_SORTED_PTR.
+// STREAM is ignored on the host. All temporaries live in the macro's own
+// braced scope and carry a `pulsar_sort_asc_` prefix so that they cannot shadow
+// identifiers used in the macro arguments. Uses std::sort, so the relative
+// order of equal keys is unspecified.
+#define SORT_ASCENDING(                                                 \
+    SORT_PTR, SORTED_PTR, VAL_PTR, VAL_SORTED_PTR, NUM_OBJECTS, STREAM) \
+  {                                                                     \
+    std::vector<size_t> pulsar_sort_asc_idx((NUM_OBJECTS));             \
+    std::iota(pulsar_sort_asc_idx.begin(), pulsar_sort_asc_idx.end(), 0); \
+    const auto pulsar_sort_asc_keys = (SORT_PTR);                       \
+    std::sort(                                                          \
+        pulsar_sort_asc_idx.begin(),                                    \
+        pulsar_sort_asc_idx.end(),                                      \
+        [&pulsar_sort_asc_keys](size_t i1, size_t i2) -> bool {         \
+          return pulsar_sort_asc_keys[i1] < pulsar_sort_asc_keys[i2];   \
+        });                                                             \
+    for (size_t pulsar_sort_asc_i = 0;                                  \
+         pulsar_sort_asc_i < pulsar_sort_asc_idx.size();                \
+         ++pulsar_sort_asc_i) {                                         \
+      (SORTED_PTR)[pulsar_sort_asc_i] =                                 \
+          (SORT_PTR)[pulsar_sort_asc_idx[pulsar_sort_asc_i]];           \
+    }                                                                   \
+    for (size_t pulsar_sort_asc_i = 0;                                  \
+         pulsar_sort_asc_i < pulsar_sort_asc_idx.size();                \
+         ++pulsar_sort_asc_i) {                                         \
+      (VAL_SORTED_PTR)[pulsar_sort_asc_i] =                             \
+          (VAL_PTR)[pulsar_sort_asc_idx[pulsar_sort_asc_i]];            \
+    }                                                                   \
+  }
+// Workspace-taking variants of the sort macros. The host implementation needs
+// no scratch memory, so the workspace arguments are accepted and dropped and
+// the call is forwarded to SORT_DESCENDING / SORT_ASCENDING.
+#define SORT_DESCENDING_WS( \
+    TMPN1,                  \
+    KEYS_IN,                \
+    KEYS_OUT,               \
+    VALS_IN,                \
+    VALS_OUT,               \
+    COUNT,                  \
+    WORKSPACE_PTR,          \
+    WORKSPACE_SIZE)         \
+  SORT_DESCENDING(TMPN1, KEYS_IN, KEYS_OUT, VALS_IN, VALS_OUT, COUNT)
+#define SORT_ASCENDING_WS( \
+    KEYS_IN,               \
+    KEYS_OUT,              \
+    VALS_IN,               \
+    VALS_OUT,              \
+    COUNT,                 \
+    WORKSPACE_PTR,         \
+    WORKSPACE_SIZE,        \
+    STREAM)                \
+  SORT_ASCENDING(KEYS_IN, KEYS_OUT, VALS_IN, VALS_OUT, COUNT, STREAM)
+// Sequential host reduction: stores REDUCE_INIT in *RESULT_PTR and then folds
+// the NUM_ITEMS elements of REDUCE_PTR into it, left to right, with
+// REDUCE_OP(accumulated, element).
+#define REDUCE(REDUCE_PTR, RESULT_PTR, NUM_ITEMS, REDUCE_OP, REDUCE_INIT) \
+  {                                                                       \
+    *(RESULT_PTR) = (REDUCE_INIT);                                        \
+    for (int i = 0; i < (NUM_ITEMS); ++i) {                               \
+      *(RESULT_PTR) = REDUCE_OP(*(RESULT_PTR), (REDUCE_PTR)[i]);          \
+    }                                                                     \
+  }
+// Workspace-taking variant; the workspace and stream arguments are unused on
+// the host and the call is forwarded to REDUCE.
+#define REDUCE_WS(  \
+    IN_PTR,         \
+    OUT_PTR,        \
+    COUNT,          \
+    OP,             \
+    INIT,           \
+    WORKSPACE_PTR,  \
+    WORKSPACE_SIZE, \
+    STREAM)         \
+  REDUCE(IN_PTR, OUT_PTR, COUNT, OP, INIT)
+// Host stream compaction: copies every ITEM_PTR[i] whose FLAGS_PTR[i] is
+// truthy to consecutive positions of OUT_PTR (input order preserved) and stores
+// the number of copied items in *NUM_SELECTED_PTR. The workspace and stream
+// arguments are unused on the host. All pointer arguments are parenthesised so
+// that expression arguments (e.g. `base + offset`) are indexed/dereferenced as
+// a whole.
+#define SELECT_FLAGS_WS(                        \
+    FLAGS_PTR,                                  \
+    ITEM_PTR,                                   \
+    OUT_PTR,                                    \
+    NUM_SELECTED_PTR,                           \
+    NUM_ITEMS,                                  \
+    WORKSPACE_PTR,                              \
+    WORKSPACE_BYTES,                            \
+    STREAM)                                     \
+  {                                             \
+    *(NUM_SELECTED_PTR) = 0;                    \
+    ptrdiff_t write_pos = 0;                    \
+    for (int i = 0; i < (NUM_ITEMS); ++i) {     \
+      if ((FLAGS_PTR)[i]) {                     \
+        (OUT_PTR)[write_pos++] = (ITEM_PTR)[i]; \
+        *(NUM_SELECTED_PTR) += 1;               \
+      }                                         \
+    }                                           \
+  }
+// Host sum reduction: writes the sum of the NUM_OBJECTS elements of SUM_PTR to
+// *OUT_PTR, starting from a value-initialised T. The workspace and stream
+// arguments are unused on the host. The loop index is size_t to match
+// NUM_OBJECTS (the former `int` index overflowed for counts above INT_MAX).
+template <typename T>
+void SUM_WS(
+    T* SUM_PTR,
+    T* OUT_PTR,
+    size_t NUM_OBJECTS,
+    char* WORKSPACE_PTR,
+    size_t WORKSPACE_BYTES,
+    cudaStream_t STREAM) {
+  *(OUT_PTR) = T();
+  for (size_t i = 0; i < NUM_OBJECTS; ++i) {
+    *(OUT_PTR) = *(OUT_PTR) + (SUM_PTR)[i];
+  }
+}
+// Host min reduction: writes the smallest of the NUM_OBJECTS elements of
+// MIN_PTR to *OUT_PTR. The workspace and stream arguments are unused on the
+// host.
+//
+// The running minimum is seeded with the first element. Seeding it with `T()`
+// (as before) clamped the result to at most zero, which is wrong for inputs
+// that are all positive. For an empty input the previous result, `T()`, is
+// kept.
+template <typename T>
+void MIN_WS(
+    T* MIN_PTR,
+    T* OUT_PTR,
+    size_t NUM_OBJECTS,
+    char* WORKSPACE_PTR,
+    size_t WORKSPACE_BYTES,
+    cudaStream_t STREAM) {
+  if (NUM_OBJECTS == 0) {
+    *OUT_PTR = T();
+    return;
+  }
+  T smallest = MIN_PTR[0];
+  for (size_t i = 1; i < NUM_OBJECTS; ++i) {
+    smallest = std::min<T>(smallest, MIN_PTR[i]);
+  }
+  *OUT_PTR = smallest;
+}
+// Host max reduction: writes the largest of the NUM_OBJECTS elements of
+// MAX_PTR to *OUT_PTR. The workspace and stream arguments are unused on the
+// host.
+//
+// The running maximum is seeded with the first element. Seeding it with `T()`
+// (as before) clamped the result to at least zero, which is wrong for inputs
+// that are all negative. For an empty input the previous result, `T()`, is
+// kept.
+template <typename T>
+void MAX_WS(
+    T* MAX_PTR,
+    T* OUT_PTR,
+    size_t NUM_OBJECTS,
+    char* WORKSPACE_PTR,
+    size_t WORKSPACE_BYTES,
+    cudaStream_t STREAM) {
+  if (NUM_OBJECTS == 0) {
+    *OUT_PTR = T();
+    return;
+  }
+  T largest = MAX_PTR[0];
+  for (size_t i = 1; i < NUM_OBJECTS; ++i) {
+    largest = std::max<T>(largest, MAX_PTR[i]);
+  }
+  *OUT_PTR = largest;
+}
+//
+//
+//
+//
+// Memory copies. On the host all three directions are plain std::memcpy calls
+// of SIZE elements of TYPE; the first pointer is the destination. SIZE is
+// parenthesised in every variant so that an expression such as `n + 1` is
+// multiplied by sizeof(TYPE) as a whole.
+#define COPY_HOST_DEV(PTR_D, PTR_H, TYPE, SIZE) \
+  std::memcpy((PTR_D), (PTR_H), sizeof(TYPE) * (SIZE))
+//
+#define COPY_DEV_HOST(PTR_H, PTR_D, TYPE, SIZE) \
+  std::memcpy((PTR_H), (PTR_D), sizeof(TYPE) * (SIZE))
+//
+#define COPY_DEV_DEV(PTR_T, PTR_S, TYPE, SIZE) \
+  std::memcpy((PTR_T), (PTR_S), sizeof(TYPE) * (SIZE))
+// Allocation helpers: on the host MALLOC/FREE forward to MALLOC_HOST/FREE_HOST, and MEMSET is a plain memset over SIZE elements of TYPE (STREAM is ignored).
+#define MALLOC(VAR, TYPE, SIZE) MALLOC_HOST(VAR, TYPE, SIZE)
+#define FREE(PTR) FREE_HOST(PTR)
+#define MEMSET(VAR, VAL, TYPE, SIZE, STREAM) \
+  memset((VAR), (VAL), sizeof(TYPE) * (SIZE))
+// Kernel launches: on the host each LAUNCH_* macro is a direct call FUNC(args...); the size, thread-count and stream arguments are ignored. The macros already end in `;`.
+#define LAUNCH_MAX_PARALLEL_1D(FUNC, N, STREAM, ...) FUNC(__VA_ARGS__);
+#define LAUNCH_PARALLEL_1D(FUNC, N, TN, STREAM, ...) FUNC(__VA_ARGS__);
+#define LAUNCH_MAX_PARALLEL_2D(FUNC, NX, NY, STREAM, ...) FUNC(__VA_ARGS__);
+#define LAUNCH_PARALLEL_2D(FUNC, NX, NY, TX, TY, STREAM, ...) FUNC(__VA_ARGS__);
+//
+//
+//
+//
+// Host "parallel" loops: GET_PARALLEL_IDX_1D and GET_PARALLEL_IDS_2D open plain for-loops over the index range and leave the braces open; the body must be closed with the matching END_PARALLEL* macro.
+#define GET_PARALLEL_IDX_1D(VARNAME, N) \
+  for (uint VARNAME = 0; VARNAME < (N); ++VARNAME) {
+#define GET_PARALLEL_IDS_2D(VAR_X, VAR_Y, WIDTH, HEIGHT)          \
+  int2 blockDim;                                                  \
+  blockDim.x = 1;                                                 \
+  blockDim.y = 1;                                                 \
+  uint __parallel_2d_width = WIDTH;                               \
+  uint __parallel_2d_height = HEIGHT;                             \
+  for (uint VAR_Y = 0; VAR_Y < __parallel_2d_height; ++(VAR_Y)) { \
+    for (uint VAR_X = 0; VAR_X < __parallel_2d_width; ++(VAR_X)) {
+// END_PARALLEL and END_PARALLEL_2D place the label `end_parallel` at the end of the (innermost) loop body before closing the braces.
+// RETURN_PARALLEL jumps to that label, i.e. it skips the remainder of the current iteration and the loop continues.
+// The *_NORET variants only close the braces and define no label, so they must be used when the body contains no RETURN_PARALLEL.
+#define END_PARALLEL() \
+  end_parallel:;       \
+  }
+#define END_PARALLEL_NORET() }
+#define END_PARALLEL_2D() \
+  end_parallel:;          \
+  }                       \
+  }
+#define END_PARALLEL_2D_NORET() \
+  }                             \
+  }
+#define RETURN_PARALLEL() goto end_parallel;
+// Host build: launch checks and device synchronisation expand to nothing and
+// ISONDEVICE is false.
+#define CHECKLAUNCH()
+#define ISONDEVICE false
+#define SYNCDEVICE()
+// Timing with std::chrono::steady_clock. START_TIME(TN) and STOP_TIME(TN)
+// declare the variables __time_start_TN / __time_stop_TN in the calling scope
+// (and already end in `;`).
+#define START_TIME(TN) \
+  auto __time_start_##TN = std::chrono::steady_clock::now();
+#define STOP_TIME(TN) auto __time_stop_##TN = std::chrono::steady_clock::now();
+// Stores the elapsed time between START_TIME(TN) and STOP_TIME(TN), in whole
+// milliseconds, through TOPTR. TOPTR is parenthesised so that pointer
+// expressions are dereferenced as a whole.
+#define GET_TIME(TN, TOPTR)                                         \
+  *(TOPTR) = std::chrono::duration_cast<std::chrono::milliseconds>( \
+                 __time_stop_##TN - __time_start_##TN)              \
+                 .count()
+// CUDA-event based timing: START_TIME_CU creates two events and records the
+// start event, STOP_TIME_CU records the stop event, and GET_TIME_CU waits for
+// the stop event and writes the elapsed time to TOPTR via
+// cudaEventElapsedTime.
+#define START_TIME_CU(TN)                          \
+  cudaEvent_t __time_start_##TN, __time_stop_##TN; \
+  cudaEventCreate(&__time_start_##TN);             \
+  cudaEventCreate(&__time_stop_##TN);              \
+  cudaEventRecord(__time_start_##TN);
+#define STOP_TIME_CU(TN) cudaEventRecord(__time_stop_##TN);
+#define GET_TIME_CU(TN, TOPTR)            \
+  cudaEventSynchronize(__time_stop_##TN); \
+  cudaEventElapsedTime((TOPTR), __time_start_##TN, __time_stop_##TN);
+#endif
--- a/pytorch3d/csrc/pulsar/host/renderer.backward.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.backward.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.backward.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.backward_dbg.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.backward_dbg.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.backward_dbg.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.calc_gradients.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.calc_gradients.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.calc_gradients.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.calc_signature.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.calc_signature.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.calc_signature.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.construct.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.construct.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.construct.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.create_selector.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.create_selector.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.create_selector.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.destruct.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.destruct.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.destruct.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.fill_bg.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.fill_bg.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.fill_bg.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.forward.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.forward.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.forward.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.norm_cam_gradients.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.norm_cam_gradients.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.norm_cam_gradients.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.norm_sphere_gradients.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.norm_sphere_gradients.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.norm_sphere_gradients.instantiate.h"
--- a/pytorch3d/csrc/pulsar/host/renderer.render.cpu.cpp
+++ b/pytorch3d/csrc/pulsar/host/renderer.render.cpu.cpp
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#include "../include/renderer.render.instantiate.h"
--- a/pytorch3d/csrc/pulsar/include/README.md
+++ b/pytorch3d/csrc/pulsar/include/README.md
+# The `include` folder
+This folder contains header files with implementations of several useful
+algorithms. These implementations usually live in files called `x.device.h`
+and use macros that route every device-specific command to the right
+implementation (see `commands.h`).
+If you need a device-specific implementation, include `x.device.h`.
+This gives you the high-speed, device-specific implementation that lets
+you work with all the details of the data structure. All function calls are
+inlined. If you need to work with the high-level interface and be able to
+dynamically pick a device, include only `x.h`. The functions there are
+templated with a boolean `DEV` flag and are instantiated in device-specific
+compilation units. You will not be able to use any other functions, but you can
+use `func<true>(params)` to work on a CUDA device, or `func<false>(params)`
+to work on the host.
--- a/pytorch3d/csrc/pulsar/include/camera.device.h
+++ b/pytorch3d/csrc/pulsar/include/camera.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_CAMERA_DEVICE_H_
+#define PULSAR_NATIVE_INCLUDE_CAMERA_DEVICE_H_
+#include "../global.h"
+#include "./camera.h"
+#include "./commands.h"
+namespace pulsar {
+// Default constructor: sets every member of the gradient record to the zero
+// vector.
+IHD CamGradInfo::CamGradInfo() {
+  const float3 zero = make_float3(0.f, 0.f, 0.f);
+  cam_pos = zero;
+  pixel_0_0_center = zero;
+  pixel_dir_x = zero;
+  pixel_dir_y = zero;
+}
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/camera.h
+++ b/pytorch3d/csrc/pulsar/include/camera.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_CAMERA_H_
+#define PULSAR_NATIVE_INCLUDE_CAMERA_H_
+#include "../global.h"
+namespace pulsar {
+/**
+ * Everything that's needed to raycast with our camera model.
+ *
+ * Plain aggregate without constructors. Note that the `operator==` defined
+ * for this type compares only a subset of the fields (film size, background
+ * normalization depth, channel count and the two projection flags).
+ */
+struct CamInfo {
+  float3 eye; /** Position in world coordinates. */
+  float3 pixel_0_0_center; /** LUC center of pixel position in world
+                              coordinates (LUC presumably means "left upper
+                              corner", i.e. pixel (0, 0); confirm). */
+  float3 pixel_dir_x; /** Direction for increasing x for one pixel to the next,
+                       * in world coordinates. */
+  float3 pixel_dir_y; /** Direction for increasing y for one pixel to the next,
+                       * in world coordinates. */
+  float3 sensor_dir_z; /** Normalized direction vector from eye through the
+                        * sensor in z direction (optical axis). */
+  float half_pixel_size; /** Half size of a pixel, in world coordinates. This
+                          * must be consistent with pixel_dir_x and pixel_dir_y!
+                          */
+  float focal_length; /** The focal length, if applicable. */
+  uint aperture_width; /** Full image width in px, possibly not fully used
+                        * in case of a shifted principal point. */
+  uint aperture_height; /** Full image height in px, possibly not fully used
+                         * in case of a shifted principal point. */
+  uint film_width; /** Resulting image width. */
+  uint film_height; /** Resulting image height. */
+  /** The top left coordinates (inclusive) of the film in the full aperture. */
+  uint film_border_left, film_border_top;
+  int32_t principal_point_offset_x; /** Horizontal principal point offset. */
+  int32_t principal_point_offset_y; /** Vertical principal point offset. */
+  float min_dist; /** Minimum distance for a ball to be rendered. */
+  float max_dist; /** Maximum distance for a ball to be rendered. */
+  float norm_fac; /** 1 / (max_dist - min_dist), pre-computed. */
+  /** The depth where to place the background, in normalized coordinates where
+   * 0. is the backmost depth and 1. the frontmost. */
+  float background_normalization_depth;
+  /** The number of image content channels to use. Usually three. */
+  uint n_channels;
+  /** Whether to use an orthogonal instead of a perspective projection. */
+  bool orthogonal_projection;
+  /** Whether to use a right-handed system (inverts the z axis). */
+  bool right_handed;
+};
+// Equality for CamInfo. Only the film size, the background normalization
+// depth, the channel count and the two projection flags are compared; all
+// other fields (positions, directions, distances, ...) are deliberately left
+// out of the comparison by this implementation.
+// NOTE(review): confirm that ignoring the remaining fields is intended.
+inline bool operator==(const CamInfo& a, const CamInfo& b) {
+  return a.film_width == b.film_width && a.film_height == b.film_height &&
+      a.background_normalization_depth == b.background_normalization_depth &&
+      a.n_channels == b.n_channels &&
+      a.orthogonal_projection == b.orthogonal_projection &&
+      a.right_handed == b.right_handed;
+}
+struct CamGradInfo { // Four float3 values named after camera quantities; presumably the gradients w.r.t. them (confirm).
+  HOST DEVICE CamGradInfo(); // Defined in camera.device.h; sets all four members to zero.
+  float3 cam_pos;
+  float3 pixel_0_0_center;
+  float3 pixel_dir_x;
+  float3 pixel_dir_y;
+};
+// TODO: remove once https://github.com/NVlabs/cub/issues/172 is resolved. Until then this struct wraps a single int as a workaround for that issue.
+struct IntWrapper {
+  int val; // The wrapped value.
+};
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/closest_sphere_tracker.device.h
+++ b/pytorch3d/csrc/pulsar/include/closest_sphere_tracker.device.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_INCLUDE_CLOSEST_SPHERE_TRACKER_DEVICE_H_
+#define PULSAR_NATIVE_INCLUDE_CLOSEST_SPHERE_TRACKER_DEVICE_H_
+#include "../global.h"
+namespace pulsar {
+namespace Renderer {
+/**
+ * A facility to track the closest spheres to the camera.
+ *
+ * The tracker keeps up to `n_track` (sphere id, intersection depth) pairs,
+ * sorted by ascending intersection depth, in fixed-size arrays. Their maximum
+ * number is defined by MAX_GRAD_SPHERES (this is defined in
+ * `pulsar/global.h`). This is done to keep the performance as high as
+ * possible because this struct needs to do updates continuously on the GPU.
+ */
+struct ClosestSphereTracker {
+ public:
+  /**
+   * Create a tracker for the `n_track` closest spheres.
+   *
+   * `n_track` must be < MAX_GRAD_SPHERES (only checked if assertions are
+   * enabled).
+   */
+  IHD ClosestSphereTracker(const int& n_track) : n_hits(0), n_track(n_track) {
+    PASSERT(n_track < MAX_GRAD_SPHERES);
+    // Initialize the sphere IDs to -1 and the depths to MAX_FLOAT, so that
+    // unused slots are recognizable and sort behind every real hit.
+    for (int i = 0; i < n_track; ++i) {
+      this->most_important_sphere_ids[i] = -1;
+      this->closest_sphere_intersection_depths[i] = MAX_FLOAT;
+    }
+  }
+  /**
+   * Register a hit of sphere `sphere_idx` at depth `intersection_depth`.
+   *
+   * The hit is inserted into the depth-sorted list (insertion sort step); if
+   * the list is full and the hit is farther away than all tracked ones, it is
+   * only counted. `coord_x` and `coord_y` are not used by the tracking logic
+   * itself (presumably they are consumed by the per-pixel logging macros).
+   */
+  IHD void track(
+      const uint& sphere_idx,
+      const float& intersection_depth,
+      const uint& coord_x,
+      const uint& coord_y) {
+    PULSAR_LOG_DEV_PIX(
+        PULSAR_LOG_TRACKER_PIX,
+        "tracker|tracking sphere %u (depth: %f).\n",
+        sphere_idx,
+        intersection_depth);
+    // Walk from the last filled slot towards the front; i == -1 means the new
+    // hit is closer than every tracked sphere.
+    for (int i = IMIN(this->n_hits, n_track) - 1; i >= -1; --i) {
+      if (i < 0 ||
+          this->closest_sphere_intersection_depths[i] < intersection_depth) {
+        // Write position is i+1.
+        PULSAR_LOG_DEV_PIX(
+            PULSAR_LOG_TRACKER_PIX,
+            "tracker|determined writing position: %d.\n",
+            i + 1);
+        if (i + 1 < n_track) {
+          // Shift every other sphere back (the last one drops out).
+          for (int j = n_track - 1; j > i + 1; --j) {
+            this->closest_sphere_intersection_depths[j] =
+                this->closest_sphere_intersection_depths[j - 1];
+            this->most_important_sphere_ids[j] =
+                this->most_important_sphere_ids[j - 1];
+          }
+          this->closest_sphere_intersection_depths[i + 1] = intersection_depth;
+          this->most_important_sphere_ids[i + 1] = sphere_idx;
+        }
+        break;
+      }
+    }
+#if PULSAR_LOG_TRACKER_PIX
+    PULSAR_LOG_DEV_PIX(
+        PULSAR_LOG_TRACKER_PIX,
+        "tracker|sphere list after adding sphere %u:\n",
+        sphere_idx);
+    for (int i = 0; i < n_track; ++i) {
+      PULSAR_LOG_DEV_PIX(
+          PULSAR_LOG_TRACKER_PIX,
+          "tracker|sphere %d: %d (depth: %f).\n",
+          i,
+          this->most_important_sphere_ids[i],
+          this->closest_sphere_intersection_depths[i]);
+    }
+#endif // PULSAR_LOG_TRACKER_PIX
+    this->n_hits += 1;
+  }
+  /**
+   * Get the number of hits registered (this counts every call to `track`, so
+   * it can exceed `n_track`).
+   */
+  IHD int get_n_hits() const {
+    return this->n_hits;
+  }
+  /**
+   * Get the idx closest sphere ID.
+   *
+   * For example, get_closest_sphere_id(0) gives the id of the tracked sphere
+   * with the smallest intersection depth.
+   *
+   * This method is implemented for highly optimized scenarios and will *not*
+   * perform an index check at runtime if assertions are disabled. idx must be
+   * >=0 and < IMIN(n_hits, n_track) for a valid result; if it is >= n_hits
+   * (but < n_track) it will return -1.
+   */
+  IHD int get_closest_sphere_id(const int& idx) const {
+    PASSERT(idx >= 0 && idx < n_track);
+    return this->most_important_sphere_ids[idx];
+  }
+  /**
+   * Get the idx closest sphere depth.
+   *
+   * For example, get_closest_sphere_depth(0) gives the smallest tracked
+   * intersection depth.
+   *
+   * This method is implemented for highly optimized scenarios and will *not*
+   * perform an index check at runtime if assertions are disabled. idx must be
+   * >=0 and < IMIN(n_hits, n_track) for a valid result; if it is >= n_hits
+   * (but < n_track) it will return MAX_FLOAT.
+   */
+  IHD float get_closest_sphere_depth(const int& idx) const {
+    PASSERT(idx >= 0 && idx < n_track);
+    return this->closest_sphere_intersection_depths[idx];
+  }
+ private:
+  /** The number of registered hits so far. */
+  int n_hits;
+  /** The number of intersections to track. Must be <MAX_GRAD_SPHERES. */
+  int n_track;
+  /** The sphere ids of the n_track spheres with the smallest intersection
+   * depths, in ascending depth order (-1 for unused slots). */
+  int most_important_sphere_ids[MAX_GRAD_SPHERES];
+  /** The intersection depths belonging to the ids above (MAX_FLOAT for unused
+   * slots). */
+  float closest_sphere_intersection_depths[MAX_GRAD_SPHERES];
+};
+} // namespace Renderer
+} // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/include/commands.h
+++ b/pytorch3d/csrc/pulsar/include/commands.h
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifndef PULSAR_NATIVE_COMMANDS_ROUTING_H_
+#define PULSAR_NATIVE_COMMANDS_ROUTING_H_
+#include "../global.h"
+// Commands available everywhere. MALLOC_HOST assigns to VAR an uninitialised, malloc'ed buffer of SIZE elements of TYPE (the result is not checked for NULL); FREE_HOST releases such a buffer with free().
+#define MALLOC_HOST(VAR, TYPE, SIZE) \
+  VAR = static_cast<TYPE*>(malloc(sizeof(TYPE) * (SIZE)))
+#define FREE_HOST(PTR) free(PTR)
+/* Include command definitions depending on CPU or GPU use. */
+#ifdef __CUDACC__
+// TODO: find out which compiler we're using here and use the suppression.
+// #pragma push
+// #pragma diag_suppress = 68
+#include <ATen/cuda/CUDAContext.h>
+#include <THC/THC.h>
+// #pragma pop
+#include "../cuda/commands.h"
+#else
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Weverything"
+#include <TH/TH.h>
+#pragma clang diagnostic pop
+#include "../host/commands.h"
+#endif
+#endif