Commit 298ffd02 authored by Ruilong Li

init
# Visual Studio Code configs.
.vscode/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
# lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.DS_Store
# Direnv config.
.envrc
# line_profiler
*.lprof
# vscode
.vscode
import math
from typing import Callable, Optional, Tuple
import torch
from .cuda import VolumeRenderer, ray_aabb_intersect, ray_marching
def volumetric_rendering(
    query_fn: Callable,
    rays_o: torch.Tensor,
    rays_d: torch.Tensor,
    scene_aabb: torch.Tensor,
    scene_occ_binary: torch.Tensor,
    scene_resolution: Tuple[int, int, int],
    render_bkgd: Optional[torch.Tensor] = None,
    render_n_samples: int = 1024,
    **kwargs,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """A *fast* version of differentiable volumetric rendering.

    Returns the accumulated color, depth, opacity (weight), and a mask of the
    rays that received at least one valid sample.
    """
    device = rays_o.device
    if render_bkgd is None:
        render_bkgd = torch.ones(3, device=device)
    scene_resolution = torch.tensor(scene_resolution, dtype=torch.int, device=device)

    rays_o = rays_o.contiguous()
    rays_d = rays_d.contiguous()
    scene_aabb = scene_aabb.contiguous()
    scene_occ_binary = scene_occ_binary.contiguous()
    render_bkgd = render_bkgd.contiguous()

    n_rays = rays_o.shape[0]
    render_total_samples = n_rays * render_n_samples
    render_step_size = (
        (scene_aabb[3:] - scene_aabb[:3]).max() * math.sqrt(3) / render_n_samples
    )

    with torch.no_grad():
        # TODO: avoid clamp here. kinda stupid
        t_min, t_max = ray_aabb_intersect(rays_o, rays_d, scene_aabb)
        t_min = torch.clamp(t_min, max=1e10)
        t_max = torch.clamp(t_max, max=1e10)
        (
            packed_info,
            frustum_origins,
            frustum_dirs,
            frustum_starts,
            frustum_ends,
        ) = ray_marching(
            # rays
            rays_o,
            rays_d,
            t_min,
            t_max,
            # density grid
            scene_aabb,
            scene_resolution,
            scene_occ_binary,
            # sampling
            render_total_samples,
            render_n_samples,
            render_step_size,
        )
        # squeeze valid samples
        total_samples = max(packed_info[:, -1].sum(), 1)
        frustum_origins = frustum_origins[:total_samples]
        frustum_dirs = frustum_dirs[:total_samples]
        frustum_starts = frustum_starts[:total_samples]
        frustum_ends = frustum_ends[:total_samples]

    # mid-point of each frustum sample along the ray
    frustum_positions = (
        frustum_origins + frustum_dirs * (frustum_starts + frustum_ends) / 2.0
    )
    query_results = query_fn(frustum_positions, frustum_dirs, **kwargs)
    rgbs, densities = query_results[0], query_results[1]
    (
        accumulated_weight,
        accumulated_depth,
        accumulated_color,
        alive_ray_mask,
    ) = VolumeRenderer.apply(
        packed_info,
        frustum_starts,
        frustum_ends,
        densities.contiguous(),
        rgbs.contiguous(),
    )
    accumulated_depth = torch.clip(accumulated_depth, t_min[:, None], t_max[:, None])
    # composite onto the background color
    accumulated_color = accumulated_color + render_bkgd * (1.0 - accumulated_weight)
    return accumulated_color, accumulated_depth, accumulated_weight, alive_ray_mask
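A minimal usage sketch (not part of this file): `toy_query_fn` is a hypothetical stand-in for a radiance field, and the `nerfacc` import path is an assumption based on the package name in `setup.py` below; only the tensor shapes and argument order follow the code above.

```python
import torch

from nerfacc import volumetric_rendering  # assumed import path, see setup.py


def toy_query_fn(positions, directions):
    # Hypothetical radiance field: constant density, position-derived color.
    densities = torch.full((positions.shape[0], 1), 0.5, device=positions.device)
    rgbs = torch.sigmoid(positions)  # [n_samples, 3]
    return rgbs, densities  # query_fn must return (rgbs, densities) in this order


device = "cuda"
n_rays = 1024
rays_o = torch.zeros(n_rays, 3, device=device)
rays_d = torch.nn.functional.normalize(torch.randn(n_rays, 3, device=device), dim=-1)
scene_aabb = torch.tensor([-1.0, -1.0, -1.0, 1.0, 1.0, 1.0], device=device)
resolution = (128, 128, 128)
# Fully-occupied grid: every voxel is marked occupied, so no space is skipped.
scene_occ_binary = torch.ones(
    resolution[0] * resolution[1] * resolution[2], dtype=torch.bool, device=device
)

color, depth, opacity, alive = volumetric_rendering(
    toy_query_fn, rays_o, rays_d, scene_aabb, scene_occ_binary, resolution
)
print(color.shape, depth.shape, opacity.shape, alive.shape)
```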
import torch
from torch.cuda.amp import custom_bwd, custom_fwd
from ._backend import _C
ray_aabb_intersect = _C.ray_aabb_intersect
ray_marching = _C.ray_marching
volumetric_rendering_forward = _C.volumetric_rendering_forward
volumetric_rendering_backward = _C.volumetric_rendering_backward
class VolumeRenderer(torch.autograd.Function):
    """CUDA Volumetric Renderer."""

    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, packed_info, starts, ends, sigmas, rgbs):
        (
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            mask,
        ) = volumetric_rendering_forward(packed_info, starts, ends, sigmas, rgbs)
        ctx.save_for_backward(
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            packed_info,
            starts,
            ends,
            sigmas,
            rgbs,
        )
        return accumulated_weight, accumulated_depth, accumulated_color, mask

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_weight, grad_depth, grad_color, _grad_mask):
        (
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            packed_info,
            starts,
            ends,
            sigmas,
            rgbs,
        ) = ctx.saved_tensors
        grad_sigmas, grad_rgbs = volumetric_rendering_backward(
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            grad_weight,
            grad_depth,
            grad_color,
            packed_info,
            starts,
            ends,
            sigmas,
            rgbs,
        )
        # corresponds to the input argument list of forward()
        return None, None, None, grad_sigmas, grad_rgbs
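For sanity-checking the custom op, here is a slow pure-PyTorch sketch of the same packed accumulation that `volumetric_rendering_forward` computes: `alpha_j = 1 - exp(-sigma_j * (end_j - start_j))`, transmittance `T_j = prod_{k<j}(1 - alpha_k)`, and per-sample weights `w_j = alpha_j * T_j`. This is a reference only, not the library API, and it omits the early-termination threshold (`T < 1e-4`) that the CUDA kernel applies.

```python
import torch


def reference_render(packed_info, starts, ends, sigmas, rgbs, n_rays):
    """Slow PyTorch reference of the packed volume-rendering accumulation."""
    weight = torch.zeros(n_rays, 1, device=sigmas.device, dtype=sigmas.dtype)
    depth = torch.zeros(n_rays, 1, device=sigmas.device, dtype=sigmas.dtype)
    color = torch.zeros(n_rays, 3, device=sigmas.device, dtype=sigmas.dtype)
    for ray_id, base, num in packed_info.tolist():
        if num == 0:  # zero padding at the end of packed_info
            continue
        s = slice(base, base + num)
        delta = (ends[s] - starts[s]).squeeze(-1)                # [num]
        t_mid = 0.5 * (starts[s] + ends[s]).squeeze(-1)          # [num]
        alpha = 1.0 - torch.exp(-sigmas[s].squeeze(-1) * delta)  # [num]
        # T_j = prod_{k<j} (1 - alpha_k), with T_0 = 1
        trans = torch.cumprod(
            torch.cat([alpha.new_ones(1), 1.0 - alpha[:-1]]), dim=0
        )
        w = alpha * trans                                        # per-sample weights
        weight[ray_id] = w.sum()
        depth[ray_id] = (w * t_mid).sum()
        color[ray_id] = (w[:, None] * rgbs[s]).sum(dim=0)
    return weight, depth, color
```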
"""Setup cuda backend."""
import glob
import os
from subprocess import DEVNULL, call
from torch.utils.cpp_extension import load
PATH = os.path.dirname(os.path.abspath(__file__))
def cuda_toolkit_available():
    """Check if nvcc is available on the machine."""
    # https://github.com/idiap/fast-transformers/blob/master/setup.py
    try:
        call(["nvcc"], stdout=DEVNULL, stderr=DEVNULL)
        return True
    except FileNotFoundError:
        return False


if cuda_toolkit_available():
    sources = glob.glob(os.path.join(PATH, "csrc/*.cu"))
else:
    sources = glob.glob(os.path.join(PATH, "csrc/*.cpp"))

extra_cflags = ["-O3"]
extra_cuda_cflags = ["-O3"]

_C = load(
    name="nerfacc_cuda",
    sources=sources,
    extra_cflags=extra_cflags,
    extra_cuda_cflags=extra_cuda_cflags,
)

__all__ = ["_C"]
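One practical note on this JIT setup: `torch.utils.cpp_extension.load` caches compiled objects under the torch extensions directory, so edits to `csrc/*.cu` may not be picked up until that cache is refreshed. A hedged sketch (the path and import below are examples, not part of this repo):

```python
import os
import shutil

# Example only: redirect (or clear) the torch extension build cache before importing
# the backend, so that edits to csrc/*.cu are actually recompiled.
os.environ["TORCH_EXTENSIONS_DIR"] = "/tmp/nerfacc_build"  # hypothetical location
shutil.rmtree("/tmp/nerfacc_build", ignore_errors=True)

from nerfacc.cuda import _backend  # assumed module path, matching the imports above

print(_backend._C)
```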
#pragma once
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#else
#define CUDA_HOSTDEV
#endif
#include <torch/extension.h>
inline constexpr CUDA_HOSTDEV float __SQRT3() { return 1.73205080757f; }
template <typename scalar_t>
inline CUDA_HOSTDEV void __swap(scalar_t &a, scalar_t &b)
{
scalar_t c = a;
a = b;
b = c;
}
inline CUDA_HOSTDEV float __clamp(float f, float a, float b) { return fmaxf(a, fminf(f, b)); }
inline CUDA_HOSTDEV int __clamp(int f, int a, int b) { return std::max(a, std::min(f, b)); }
inline CUDA_HOSTDEV float __sign(float x) { return copysignf(1.0, x); }
inline CUDA_HOSTDEV uint32_t __expand_bits(uint32_t v)
{
v = (v * 0x00010001u) & 0xFF0000FFu;
v = (v * 0x00000101u) & 0x0F00F00Fu;
v = (v * 0x00000011u) & 0xC30C30C3u;
v = (v * 0x00000005u) & 0x49249249u;
return v;
}
inline CUDA_HOSTDEV uint32_t __morton3D(uint32_t x, uint32_t y, uint32_t z)
{
uint32_t xx = __expand_bits(x);
uint32_t yy = __expand_bits(y);
uint32_t zz = __expand_bits(z);
return xx | (yy << 1) | (zz << 2);
}
inline CUDA_HOSTDEV uint32_t __morton3D_invert(uint32_t x)
{
x = x & 0x49249249;
x = (x | (x >> 2)) & 0xc30c30c3;
x = (x | (x >> 4)) & 0x0f00f00f;
x = (x | (x >> 8)) & 0xff0000ff;
x = (x | (x >> 16)) & 0x0000ffff;
return x;
}
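The Morton helpers above interleave the bits of three 10-bit coordinates into a single index and invert a single axis back out. A small Python sketch of the same bit manipulation, useful as a readable cross-check of the C++ (not part of the extension):

```python
def expand_bits(v: int) -> int:
    """Spread the low 10 bits of v so there are two zero bits between each bit."""
    v = (v * 0x00010001) & 0xFF0000FF
    v = (v * 0x00000101) & 0x0F00F00F
    v = (v * 0x00000011) & 0xC30C30C3
    v = (v * 0x00000005) & 0x49249249
    return v


def morton3d(x: int, y: int, z: int) -> int:
    """Interleave x, y, z (each < 1024) into a 30-bit Morton index."""
    return expand_bits(x) | (expand_bits(y) << 1) | (expand_bits(z) << 2)


def morton3d_invert(x: int) -> int:
    """Recover one coordinate from a Morton index (pass index >> 1 / >> 2 for y / z)."""
    x = x & 0x49249249
    x = (x | (x >> 2)) & 0xC30C30C3
    x = (x | (x >> 4)) & 0x0F00F00F
    x = (x | (x >> 8)) & 0xFF0000FF
    x = (x | (x >> 16)) & 0x0000FFFF
    return x


code = morton3d(3, 5, 7)
assert morton3d_invert(code) == 3
assert morton3d_invert(code >> 1) == 5
assert morton3d_invert(code >> 2) == 7
```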
#pragma once
#include "helpers.h"
#include <c10/cuda/CUDAGuard.h>
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
#define CUDA_GET_THREAD_ID(tid, Q) \
const int tid = blockIdx.x * blockDim.x + threadIdx.x; \
if (tid >= Q) \
return
#define CUDA_N_BLOCKS_NEEDED(Q, CUDA_N_THREADS) ((Q - 1) / CUDA_N_THREADS + 1)
#define DEVICE_GUARD(_ten) \
const at::cuda::OptionalCUDAGuard device_guard(device_of(_ten));
#include "include/helpers_cuda.h"
template <typename scalar_t>
inline __host__ __device__ void _ray_aabb_intersect(
const scalar_t* rays_o,
const scalar_t* rays_d,
const scalar_t* aabb,
scalar_t* near,
scalar_t* far
) {
// aabb is [xmin, ymin, zmin, xmax, ymax, zmax]
scalar_t tmin = (aabb[0] - rays_o[0]) / rays_d[0];
scalar_t tmax = (aabb[3] - rays_o[0]) / rays_d[0];
if (tmin > tmax) __swap(tmin, tmax);
scalar_t tymin = (aabb[1] - rays_o[1]) / rays_d[1];
scalar_t tymax = (aabb[4] - rays_o[1]) / rays_d[1];
if (tymin > tymax) __swap(tymin, tymax);
if (tmin > tymax || tymin > tmax){
*near = std::numeric_limits<scalar_t>::max();
*far = std::numeric_limits<scalar_t>::max();
return;
}
if (tymin > tmin) tmin = tymin;
if (tymax < tmax) tmax = tymax;
scalar_t tzmin = (aabb[2] - rays_o[2]) / rays_d[2];
scalar_t tzmax = (aabb[5] - rays_o[2]) / rays_d[2];
if (tzmin > tzmax) __swap(tzmin, tzmax);
if (tmin > tzmax || tzmin > tmax){
*near = std::numeric_limits<scalar_t>::max();
*far = std::numeric_limits<scalar_t>::max();
return;
}
if (tzmin > tmin) tmin = tzmin;
if (tzmax < tmax) tmax = tzmax;
*near = tmin;
*far = tmax;
return;
}
template <typename scalar_t>
__global__ void kernel_ray_aabb_intersect(
const int N,
const scalar_t* rays_o,
const scalar_t* rays_d,
const scalar_t* aabb,
scalar_t* t_min,
scalar_t* t_max
){
// aabb is [xmin, ymin, zmin, xmax, ymax, zmax]
CUDA_GET_THREAD_ID(thread_id, N);
// locate
rays_o += thread_id * 3;
rays_d += thread_id * 3;
t_min += thread_id;
t_max += thread_id;
_ray_aabb_intersect<scalar_t>(rays_o, rays_d, aabb, t_min, t_max);
scalar_t zero = static_cast<scalar_t>(0.f);
*t_min = *t_min > zero ? *t_min : zero;
return;
}
/**
* @brief Ray AABB Test
*
* @param rays_o Ray origins. Tensor with shape [N, 3].
* @param rays_d Normalized ray directions. Tensor with shape [N, 3].
* @param aabb Scene AABB [xmin, ymin, zmin, xmax, ymax, zmax]. Tensor with shape [6].
* @return std::vector<torch::Tensor>
* Ray AABB intersection {t_min, t_max}, each with shape [N]. Note that t_min is
* clamped to a minimum of zero.
*/
std::vector<torch::Tensor> ray_aabb_intersect(
const torch::Tensor rays_o, const torch::Tensor rays_d, const torch::Tensor aabb
) {
DEVICE_GUARD(rays_o);
CHECK_INPUT(rays_o);
CHECK_INPUT(rays_d);
CHECK_INPUT(aabb);
TORCH_CHECK(rays_o.ndimension() == 2 && rays_o.size(1) == 3);
TORCH_CHECK(rays_d.ndimension() == 2 && rays_d.size(1) == 3);
TORCH_CHECK(aabb.ndimension() == 1 && aabb.size(0) == 6);
const int N = rays_o.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(N, threads);
torch::Tensor t_min = torch::empty({N}, rays_o.options());
torch::Tensor t_max = torch::empty({N}, rays_o.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
rays_o.scalar_type(), "ray_aabb_intersect",
([&] {
kernel_ray_aabb_intersect<scalar_t><<<blocks, threads>>>(
N,
rays_o.data_ptr<scalar_t>(),
rays_d.data_ptr<scalar_t>(),
aabb.data_ptr<scalar_t>(),
t_min.data_ptr<scalar_t>(),
t_max.data_ptr<scalar_t>()
);
})
);
return {t_min, t_max};
}
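A hedged, vectorized PyTorch reference of the same slab test, including the clamp of `t_min` to zero done in the kernel; useful for spot-checking the CUDA output on a handful of rays (a sketch, not part of the extension):

```python
import torch


def ray_aabb_intersect_ref(rays_o, rays_d, aabb):
    """Slab test. rays_o/rays_d: [N, 3]; aabb: [xmin, ymin, zmin, xmax, ymax, zmax]."""
    # Like the kernel, this assumes no exactly-zero direction components.
    inv_d = 1.0 / rays_d
    t0 = (aabb[:3] - rays_o) * inv_d                 # [N, 3]
    t1 = (aabb[3:] - rays_o) * inv_d                 # [N, 3]
    t_near = torch.minimum(t0, t1).amax(dim=-1)      # latest per-axis entry
    t_far = torch.maximum(t0, t1).amin(dim=-1)       # earliest per-axis exit
    miss = t_near > t_far
    big = torch.finfo(rays_o.dtype).max
    t_near = torch.where(miss, torch.full_like(t_near, big), t_near)
    t_far = torch.where(miss, torch.full_like(t_far, big), t_far)
    t_near = t_near.clamp(min=0.0)                   # the kernel clips t_min to zero
    return t_near, t_far
```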
#include "include/helpers_cuda.h"
std::vector<torch::Tensor> ray_aabb_intersect(
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor aabb
);
std::vector<torch::Tensor> ray_marching(
// rays
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor t_min,
const torch::Tensor t_max,
// density grid
const torch::Tensor aabb,
const torch::Tensor resolution,
const torch::Tensor occ_binary,
// sampling
const int max_total_samples,
const int max_per_ray_samples,
const float dt
);
std::vector<torch::Tensor> volumetric_rendering_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
torch::Tensor rgbs
);
std::vector<torch::Tensor> volumetric_rendering_backward(
torch::Tensor accumulated_weight,
torch::Tensor accumulated_depth,
torch::Tensor accumulated_color,
torch::Tensor grad_weight,
torch::Tensor grad_depth,
torch::Tensor grad_color,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
torch::Tensor rgbs
);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("ray_aabb_intersect", &ray_aabb_intersect);
m.def("ray_marching", &ray_marching);
m.def("volumetric_rendering_forward", &volumetric_rendering_forward);
m.def("volumetric_rendering_backward", &volumetric_rendering_backward);
}
#include "include/helpers_cuda.h"
inline __device__ int cascaded_grid_idx_at(
const float x, const float y, const float z,
const int* resolution, const float* aabb
) {
// TODO(ruilongli): if the x, y, z is outside the aabb, it will be clipped into aabb!!! We should just return false
int ix = (int)(((x - aabb[0]) / (aabb[3] - aabb[0])) * resolution[0]);
int iy = (int)(((y - aabb[1]) / (aabb[4] - aabb[1])) * resolution[1]);
int iz = (int)(((z - aabb[2]) / (aabb[5] - aabb[2])) * resolution[2]);
ix = __clamp(ix, 0, resolution[0]-1);
iy = __clamp(iy, 0, resolution[1]-1);
iz = __clamp(iz, 0, resolution[2]-1);
int idx = ix * resolution[1] * resolution[2] + iy * resolution[2] + iz;
return idx;
}
inline __device__ bool grid_occupied_at(
const float x, const float y, const float z,
const int* resolution, const float* aabb, const bool* occ_binary
) {
int idx = cascaded_grid_idx_at(x, y, z, resolution, aabb);
return occ_binary[idx];
}
inline __device__ float distance_to_next_voxel(
float x, float y, float z,
float dir_x, float dir_y, float dir_z,
float idir_x, float idir_y, float idir_z,
const int* resolution
) { // dda like step
// scale the position into per-axis voxel units
x = resolution[0] * x;
y = resolution[1] * y;
z = resolution[2] * z;
float tx = ((floorf(x + 0.5f + 0.5f * __sign(dir_x)) - x) * idir_x) / resolution[0];
float ty = ((floorf(y + 0.5f + 0.5f * __sign(dir_y)) - y) * idir_y) / resolution[1];
float tz = ((floorf(z + 0.5f + 0.5f * __sign(dir_z)) - z) * idir_z) / resolution[2];
float t = min(min(tx, ty), tz);
return fmaxf(t, 0.0f);
}
inline __device__ float advance_to_next_voxel(
float t,
float x, float y, float z,
float dir_x, float dir_y, float dir_z,
float idir_x, float idir_y, float idir_z,
const int* resolution, float dt_min) {
// Regular stepping (may be slower but matches non-empty space)
float t_target = t + distance_to_next_voxel(
x, y, z, dir_x, dir_y, dir_z, idir_x, idir_y, idir_z, resolution
);
do {
t += dt_min;
} while (t < t_target);
return t;
}
__global__ void kernel_raymarching(
// rays info
const uint32_t n_rays,
const float* rays_o, // shape (n_rays, 3)
const float* rays_d, // shape (n_rays, 3)
const float* t_min, // shape (n_rays,)
const float* t_max, // shape (n_rays,)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int* resolution, // [reso_x, reso_y, reso_z]
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// sampling
const int max_total_samples,
const int max_per_ray_samples,
const float dt,
// writable helpers
int* steps_counter,
int* rays_counter,
// frustum outputs
int* packed_info,
float* frustum_origins,
float* frustum_dirs,
float* frustum_starts,
float* frustum_ends
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
const float near = t_min[0], far = t_max[0];
uint32_t ray_idx, base, marching_samples;
uint32_t j;
float t0, t1, t_mid;
// first pass to compute an accurate number of steps
j = 0;
t0 = near; // TODO(ruilongli): perturb `near` as in ngp_pl?
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
while (t_mid < far && j < max_per_ray_samples) {
// current center
const float x = ox + t_mid * dx;
const float y = oy + t_mid * dy;
const float z = oz + t_mid * dz;
if (grid_occupied_at(x, y, z, resolution, aabb, occ_binary)) {
++j;
// march to next sample
t0 = t1;
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
}
else {
// march to next sample
t_mid = advance_to_next_voxel(
t_mid, x, y, z, dx, dy, dz, rdx, rdy, rdz, resolution, dt
);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
}
}
if (j == 0) return;
marching_samples = j;
base = atomicAdd(steps_counter, marching_samples);
if (base + marching_samples > max_total_samples) return;
ray_idx = atomicAdd(rays_counter, 1);
// locate
frustum_origins += base * 3;
frustum_dirs += base * 3;
frustum_starts += base;
frustum_ends += base;
// Second round
j = 0;
t0 = near;
t1 = t0 + dt;
t_mid = (t0 + t1) / 2.;
while (t_mid < far && j < marching_samples) {
// current center
const float x = ox + t_mid * dx;
const float y = oy + t_mid * dy;
const float z = oz + t_mid * dz;
if (grid_occupied_at(x, y, z, resolution, aabb, occ_binary)) {
frustum_origins[j * 3 + 0] = ox;
frustum_origins[j * 3 + 1] = oy;
frustum_origins[j * 3 + 2] = oz;
frustum_dirs[j * 3 + 0] = dx;
frustum_dirs[j * 3 + 1] = dy;
frustum_dirs[j * 3 + 2] = dz;
frustum_starts[j] = t0;
frustum_ends[j] = t1;
++j;
// march to next sample
t0 = t1;
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
}
else {
// march to next sample
t_mid = advance_to_next_voxel(
t_mid, x, y, z, dx, dy, dz, rdx, rdy, rdz, resolution, dt
);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
}
}
packed_info[ray_idx * 3 + 0] = i; // ray idx in {rays_o, rays_d}
packed_info[ray_idx * 3 + 1] = base; // point idx start.
packed_info[ray_idx * 3 + 2] = j; // point idx shift (actual marching samples).
return;
}
/**
 * @brief Sample points by ray marching.
 *
 * @param rays_o Ray origins. Shape of [n_rays, 3].
 * @param rays_d Normalized ray directions. Shape of [n_rays, 3].
 * @param t_min Near planes of rays. Shape of [n_rays].
 * @param t_max Far planes of rays. Shape of [n_rays].
 * @param aabb Scene AABB [xmin, ymin, zmin, xmax, ymax, zmax]. Shape of [6].
 * @param resolution Occupancy grid resolution [reso_x, reso_y, reso_z]. Shape of [3].
 * @param occ_binary Occupancy grid binary values. Shape of [reso_x, reso_y, reso_z].
 * @param max_total_samples Maximum total number of samples in this batch.
 * @param max_per_ray_samples Maximum number of samples per ray.
 * @param dt Marching step size.
 * @return std::vector<torch::Tensor>
 * - packed_info: Stores how to index the ray samples from the returned values.
 *   Shape of [n_rays, 3]. First value is the ray index. Second value is the sample
 *   start index in the results for this ray. Third value is the number of samples for
 *   this ray. Note that rays with zero samples are skipped, so `packed_info` has some
 *   zero padding at the end.
 * - origins: Ray origins for those samples. [max_total_samples, 3]
 * - dirs: Ray directions for those samples. [max_total_samples, 3]
 * - starts: Where the frustum-shape sample starts along a ray. [max_total_samples, 1]
 * - ends: Where the frustum-shape sample ends along a ray. [max_total_samples, 1]
 */
std::vector<torch::Tensor> ray_marching(
// rays
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor t_min,
const torch::Tensor t_max,
// density grid
const torch::Tensor aabb,
const torch::Tensor resolution,
const torch::Tensor occ_binary,
// sampling
const int max_total_samples,
const int max_per_ray_samples,
const float dt
) {
DEVICE_GUARD(rays_o);
CHECK_INPUT(rays_o);
CHECK_INPUT(rays_d);
CHECK_INPUT(t_min);
CHECK_INPUT(t_max);
CHECK_INPUT(aabb);
CHECK_INPUT(resolution);
CHECK_INPUT(occ_binary);
const int n_rays = rays_o.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// helper counter
torch::Tensor steps_counter = torch::zeros(
{1}, rays_o.options().dtype(torch::kInt32));
torch::Tensor rays_counter = torch::zeros(
{1}, rays_o.options().dtype(torch::kInt32));
// output frustum samples
torch::Tensor packed_info = torch::zeros(
{n_rays, 3}, rays_o.options().dtype(torch::kInt32)); // ray_id, sample_id, num_samples
torch::Tensor frustum_origins = torch::zeros({max_total_samples, 3}, rays_o.options());
torch::Tensor frustum_dirs = torch::zeros({max_total_samples, 3}, rays_o.options());
torch::Tensor frustum_starts = torch::zeros({max_total_samples, 1}, rays_o.options());
torch::Tensor frustum_ends = torch::zeros({max_total_samples, 1}, rays_o.options());
kernel_raymarching<<<blocks, threads>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution.data_ptr<int>(),
occ_binary.data_ptr<bool>(),
// sampling
max_total_samples,
max_per_ray_samples,
dt,
// writable helpers
steps_counter.data_ptr<int>(), // total samples.
rays_counter.data_ptr<int>(), // total rays.
packed_info.data_ptr<int>(),
frustum_origins.data_ptr<float>(),
frustum_dirs.data_ptr<float>(),
frustum_starts.data_ptr<float>(),
frustum_ends.data_ptr<float>()
);
return {packed_info, frustum_origins, frustum_dirs, frustum_starts, frustum_ends};
}
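Because `packed_info` is the only bridge between the flattened sample buffers and the rays they belong to, here is a small hedged sketch of how a consumer can group the output of `ray_marching` back into per-ray chunks (pure PyTorch, assuming only the shapes documented above):

```python
def unpack_samples(packed_info, frustum_starts, frustum_ends):
    """Group the flattened samples back into one (starts, ends) pair per ray."""
    per_ray = {}
    for ray_id, base, num in packed_info.tolist():
        if num == 0:  # zero padding at the end of packed_info
            continue
        per_ray[ray_id] = (
            frustum_starts[base:base + num, 0],
            frustum_ends[base:base + num, 0],
        )
    return per_ray
```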
#include "include/helpers_cuda.h"
template <typename scalar_t>
__global__ void volumetric_rendering_forward_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
const scalar_t* rgbs, // input rgb after activation
// should be all-zero initialized
scalar_t* accumulated_weight, // output
scalar_t* accumulated_depth, // output
scalar_t* accumulated_color, // output
bool* mask // output
) {
CUDA_GET_THREAD_ID(thread_id, n_rays);
// locate
const int i = packed_info[thread_id * 3 + 0]; // ray idx in {rays_o, rays_d}
const int base = packed_info[thread_id * 3 + 1]; // point idx start.
const int numsteps = packed_info[thread_id * 3 + 2]; // point idx shift.
if (numsteps == 0) return;
starts += base;
ends += base;
sigmas += base;
rgbs += base * 3;
accumulated_weight += i;
accumulated_depth += i;
accumulated_color += i * 3;
mask += i;
// accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
int j = 0;
for (; j < numsteps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t t = (ends[j] + starts[j]) * 0.5f;
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
const scalar_t weight = alpha * T;
accumulated_weight[0] += weight;
accumulated_depth[0] += weight * t;
accumulated_color[0] += weight * rgbs[j * 3 + 0];
accumulated_color[1] += weight * rgbs[j * 3 + 1];
accumulated_color[2] += weight * rgbs[j * 3 + 2];
T *= (1.f - alpha);
}
mask[0] = true;
}
template <typename scalar_t>
__global__ void volumetric_rendering_backward_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
const scalar_t* rgbs, // input rgb after activation
const scalar_t* accumulated_weight, // forward output
const scalar_t* accumulated_depth, // forward output
const scalar_t* accumulated_color, // forward output
const scalar_t* grad_weight, // input
const scalar_t* grad_depth, // input
const scalar_t* grad_color, // input
scalar_t* grad_sigmas, // output
scalar_t* grad_rgbs // output
) {
CUDA_GET_THREAD_ID(thread_id, n_rays);
// locate
const int i = packed_info[thread_id * 3 + 0]; // ray idx in {rays_o, rays_d}
const int base = packed_info[thread_id * 3 + 1]; // point idx start.
const int numsteps = packed_info[thread_id * 3 + 2]; // point idx shift.
if (numsteps == 0) return;
starts += base;
ends += base;
sigmas += base;
rgbs += base * 3;
grad_sigmas += base;
grad_rgbs += base * 3;
accumulated_weight += i;
accumulated_depth += i;
accumulated_color += i * 3;
grad_weight += i;
grad_depth += i;
grad_color += i * 3;
// backward of accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
int j = 0;
scalar_t r = 0, g = 0, b = 0, d = 0;
for (; j < numsteps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t t = (ends[j] + starts[j]) * 0.5f;
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
const scalar_t weight = alpha * T;
r += weight * rgbs[j * 3 + 0];
g += weight * rgbs[j * 3 + 1];
b += weight * rgbs[j * 3 + 2];
d += weight * t;
T *= (1.f - alpha);
grad_rgbs[j * 3 + 0] = grad_color[0] * weight;
grad_rgbs[j * 3 + 1] = grad_color[1] * weight;
grad_rgbs[j * 3 + 2] = grad_color[2] * weight;
grad_sigmas[j] = delta * (
grad_color[0] * (T * rgbs[j * 3 + 0] - (accumulated_color[0] - r)) +
grad_color[1] * (T * rgbs[j * 3 + 1] - (accumulated_color[1] - g)) +
grad_color[2] * (T * rgbs[j * 3 + 2] - (accumulated_color[2] - b)) +
grad_weight[0] * (1.f - accumulated_weight[0]) +
grad_depth[0] * (t * T - (accumulated_depth[0] - d))
);
}
}
/**
* @brief Volumetric Rendering: Accumulating samples in the forward pass.
* The inputs, except for `sigmas` and `rgbs`, are the outputs of our
* CUDA ray marching function in `raymarching.cu`.
*
* @param packed_info Stores how to index the ray samples from the returned values.
* Shape of [n_rays, 3]. First value is the ray index. Second value is the sample
* start index in the results for this ray. Third value is the number of samples for
* this ray. Note for rays that have zero samples, we simply skip them so the `packed_info`
* has some zero padding in the end.
* @param starts: Where the frustum-shape sample starts along a ray. [total_samples, 1]
* @param ends: Where the frustum-shape sample ends along a ray. [total_samples, 1]
* @param sigmas Densities at those samples. [total_samples, 1]
* @param rgbs RGBs at those samples. [total_samples, 3]
* @return std::vector<torch::Tensor>
* - accumulated_weight: Ray opacity. [n_rays, 1]
* - accumulated_depth: Ray depth. [n_rays, 1]
* - accumulated_color: Ray color. [n_rays, 3]
* - mask: Boolean value storing whether this ray has valid samples in `packed_info`. [n_rays]
*/
std::vector<torch::Tensor> volumetric_rendering_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
torch::Tensor rgbs
) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
CHECK_INPUT(rgbs);
TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 3);
TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 && sigmas.size(1) == 1);
TORCH_CHECK(rgbs.ndimension() == 2 && rgbs.size(1) == 3);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor accumulated_weight = torch::zeros({n_rays, 1}, sigmas.options());
torch::Tensor accumulated_depth = torch::zeros({n_rays, 1}, sigmas.options());
torch::Tensor accumulated_color = torch::zeros({n_rays, 3}, sigmas.options());
// The rays that are not skipped during sampling.
torch::Tensor mask = torch::zeros({n_rays}, sigmas.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_rendering_forward",
([&]
{ volumetric_rendering_forward_kernel<scalar_t><<<blocks, threads>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
rgbs.data_ptr<scalar_t>(),
accumulated_weight.data_ptr<scalar_t>(),
accumulated_depth.data_ptr<scalar_t>(),
accumulated_color.data_ptr<scalar_t>(),
mask.data_ptr<bool>()
);
}));
return {accumulated_weight, accumulated_depth, accumulated_color, mask};
}
/**
* @brief Volumetric Rendering: Accumulating samples in the backward pass.
*/
std::vector<torch::Tensor> volumetric_rendering_backward(
torch::Tensor accumulated_weight,
torch::Tensor accumulated_depth,
torch::Tensor accumulated_color,
torch::Tensor grad_weight,
torch::Tensor grad_depth,
torch::Tensor grad_color,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
torch::Tensor rgbs
) {
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_sigmas = torch::zeros(sigmas.sizes(), sigmas.options());
torch::Tensor grad_rgbs = torch::zeros(rgbs.sizes(), rgbs.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_rendering_backward",
([&]
{ volumetric_rendering_backward_kernel<scalar_t><<<blocks, threads>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
rgbs.data_ptr<scalar_t>(),
accumulated_weight.data_ptr<scalar_t>(),
accumulated_depth.data_ptr<scalar_t>(),
accumulated_color.data_ptr<scalar_t>(),
grad_weight.data_ptr<scalar_t>(),
grad_depth.data_ptr<scalar_t>(),
grad_color.data_ptr<scalar_t>(),
grad_sigmas.data_ptr<scalar_t>(),
grad_rgbs.data_ptr<scalar_t>()
);
}));
return {grad_sigmas, grad_rgbs};
}
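Since the backward kernel hand-derives the gradients from the running transmittance `T` and the suffix terms `accumulated_* - (partial sums)`, it is worth cross-checking it against autograd on a slow reference. A hedged sketch that reuses the `reference_render` helper sketched next to the autograd Function above; the `nerfacc.cuda` import path is an assumption, and tolerances are loose because the kernel stops accumulating once `T < 1e-4`:

```python
import torch

from nerfacc.cuda import VolumeRenderer  # assumed import path, matching the code above


def check_against_reference(packed_info, starts, ends, sigmas, rgbs, n_rays):
    sigmas = sigmas.clone().requires_grad_(True)
    rgbs = rgbs.clone().requires_grad_(True)

    # CUDA custom op: forward + hand-derived backward.
    w, d, c, _ = VolumeRenderer.apply(packed_info, starts, ends, sigmas, rgbs)
    (w.sum() + d.sum() + c.sum()).backward()
    grad_sigmas_cuda = sigmas.grad.clone()
    grad_rgbs_cuda = rgbs.grad.clone()
    sigmas.grad = None
    rgbs.grad = None

    # Slow autograd reference (reference_render is the PyTorch sketch shown earlier).
    w_ref, d_ref, c_ref = reference_render(packed_info, starts, ends, sigmas, rgbs, n_rays)
    (w_ref.sum() + d_ref.sum() + c_ref.sum()).backward()

    assert torch.allclose(c, c_ref, atol=1e-3)
    assert torch.allclose(grad_sigmas_cuda, sigmas.grad, atol=1e-3)
    assert torch.allclose(grad_rgbs_cuda, rgbs.grad, atol=1e-3)
```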
ninja
pybind11
--extra-index-url https://download.pytorch.org/whl/cu116
torch==1.12.1
-e .
from setuptools import find_packages, setup

setup(
    name="nerfacc",
    description="NeRF accelerated rendering",
    version="0.0.2",
    python_requires=">=3.9",
    packages=find_packages(exclude=("tests*",)),
)
conda create -n nerfacc python=3.9 -y
conda activate nerfacc
pip install -r requirements.txt