Unverified Commit 8dcfbad9 authored by Ruilong Li (李瑞龙), committed by GitHub

Reformat (#31)



* seems working

* contraction func in cuda

* Update type

* More type updates

* disable DDA for contraction

* update contraction performance in readme

* 360 data: Garden

* eval at max_steps

* add performance of 360 to readme

* fix contraction scaling

* tiny hot fix

* new volrend

* cleanup ray_marching.cu

* cleanup backend

* tests

* cleaning up Grid

* fix doc for grid base class

* check and fix for contraction

* test grid

* rendering and marching

* transmittance_compress verified

* rendering is indeed faster

* pipeline is working

* lego example

* cleanup

* cuda folder is cleaned up! finally!

* cuda formatting

* contraction verify

* upgrade grid

* test for ray marching

* pipeline

* ngp with contraction

* train_ngp runs but slow

* transmittance separated into two. Now NGP is as fast as before

* verified faster than before

* bug fix for contraction

* ngp contraction fix

* tiny cleanup

* contraction works! yay!

* contraction with tanh seems working

* minor update

* support alpha rendering

* absorb visibility to ray marching

* tiny import update

* get rid of contraction temperature

* doc for ContractionType

* doc for Grid

* doc for grid.py is done

* doc for ray marching

* rendering function

* fix doc for rendering

* doc for vol rend

* autosummary for utils

* fix autosummary line break

* utils docs

* api doc is done

* starting work on examples

* contraction for ngp is in python now

* further clean up examples

* mlp nerf is running

* dnerf is in

* update readme command

* merge

* disable pylint error for now

* reformatting and skip tests without cuda

* fix the type issue for contractiontype

* fix cuda attribute issue

* bump to 0.1.0
Co-authored-by: Matt Tancik <tancik@berkeley.edu>
parent a7611603
#include "include/helpers_cuda.h"
#include "include/helpers_math.h"
#include "include/helpers_contraction.h"
inline __device__ __host__ float calc_dt(
const float t, const float cone_angle,
const float dt_min, const float dt_max)
{
return clamp(t * cone_angle, dt_min, dt_max);
}
inline __device__ __host__ int grid_idx_at(
const float3 xyz_unit, const int3 grid_res)
{
// xyz should be always in [0, 1]^3.
int3 ixyz = make_int3(xyz_unit * make_float3(grid_res));
ixyz = clamp(ixyz, make_int3(0, 0, 0), grid_res - 1);
int3 grid_offset = make_int3(grid_res.y * grid_res.z, grid_res.z, 1);
int idx = dot(ixyz, grid_offset);
return idx;
}
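// Occupancy lookup: the query point is mapped into the unit cube by the selected
// contraction (plain ROI normalization for AABB, a scene-contraction function
// otherwise) and then tested against the binary grid. The AABB early-out below is
// only valid without contraction, where points outside the ROI are empty by definition.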
inline __device__ __host__ bool grid_occupied_at(
const float3 xyz,
const float3 roi_min, const float3 roi_max,
ContractionType type,
const int3 grid_res, const bool *grid_binary)
{
if (type == ContractionType::AABB &&
(xyz.x < roi_min.x || xyz.x > roi_max.x ||
xyz.y < roi_min.y || xyz.y > roi_max.y ||
xyz.z < roi_min.z || xyz.z > roi_max.z))
{
return false;
}
float3 xyz_unit = apply_contraction(
xyz, roi_min, roi_max, type);
int idx = grid_idx_at(xyz_unit, grid_res);
return grid_binary[idx];
}
// DDA-like step: distance (in t) from xyz to the next voxel boundary of the occupancy grid along dir.
inline __device__ __host__ float distance_to_next_voxel(
const float3 xyz, const float3 dir, const float3 inv_dir,
const float3 roi_min, const float3 roi_max, const int3 grid_res)
{
float3 _occ_res = make_float3(grid_res);
float3 _xyz = roi_to_unit(xyz, roi_min, roi_max) * _occ_res;
float3 txyz = ((floorf(_xyz + 0.5f + 0.5f * sign(dir)) - _xyz) * inv_dir) / _occ_res * (roi_max - roi_min);
float t = min(min(txyz.x, txyz.y), txyz.z);
return fmaxf(t, 0.0f);
}
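// Skip empty space: advance t in whole increments of dt_min until the next voxel
// boundary is passed, so skipped regions keep the same step quantization as
// occupied ones. The marching kernel below uses this only for the AABB case,
// since scene contraction warps the voxel boundaries and breaks the DDA step.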
inline __device__ __host__ float advance_to_next_voxel(
const float t, const float dt_min,
const float3 xyz, const float3 dir, const float3 inv_dir,
const float3 roi_min, const float3 roi_max, const int3 grid_res)
{
// Regular stepping (may be slower but matches non-empty space)
float t_target = t + distance_to_next_voxel(
xyz, dir, inv_dir, roi_min, roi_max, grid_res);
float _t = t;
do
{
_t += dt_min;
} while (_t < t_target);
return _t;
}
// -------------------------------------------------------------------------------
// Raymarching
// -------------------------------------------------------------------------------
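// This kernel is launched twice by the host function `ray_marching` below:
// pass 1 (packed_info == nullptr) only counts the samples kept per ray and writes
// num_steps; the host then builds packed_info = (cumsum - steps, steps) and
// allocates t_starts / t_ends; pass 2 repeats the identical traversal and writes
// the per-sample segment boundaries at the reserved offsets.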
__global__ void ray_marching_kernel(
// rays info
const uint32_t n_rays,
const float *rays_o, // shape (n_rays, 3)
const float *rays_d, // shape (n_rays, 3)
const float *t_min, // shape (n_rays,)
const float *t_max, // shape (n_rays,)
// occupancy grid & contraction
const float *roi,
const int3 grid_res,
const bool *grid_binary, // shape (reso_x, reso_y, reso_z)
const ContractionType type,
// sampling
const float step_size,
const float cone_angle,
const int *packed_info,
// first round outputs
int *num_steps,
// second round outputs
float *t_starts,
float *t_ends)
{
CUDA_GET_THREAD_ID(i, n_rays);
bool is_first_round = (packed_info == nullptr);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
if (is_first_round)
{
num_steps += i;
}
else
{
int base = packed_info[i * 2 + 0];
int steps = packed_info[i * 2 + 1];
t_starts += base;
t_ends += base;
}
const float3 origin = make_float3(rays_o[0], rays_o[1], rays_o[2]);
const float3 dir = make_float3(rays_d[0], rays_d[1], rays_d[2]);
const float3 inv_dir = 1.0f / dir;
const float near = t_min[0], far = t_max[0];
const float3 roi_min = make_float3(roi[0], roi[1], roi[2]);
const float3 roi_max = make_float3(roi[3], roi[4], roi[5]);
// TODO: compute dt_max from occ resolution.
float dt_min = step_size;
float dt_max = 1e10f;
int j = 0;
float t0 = near;
float dt = calc_dt(t0, cone_angle, dt_min, dt_max);
float t1 = t0 + dt;
float t_mid = (t0 + t1) * 0.5f;
while (t_mid < far)
{
// current center
const float3 xyz = origin + t_mid * dir;
if (grid_occupied_at(xyz, roi_min, roi_max, type, grid_res, grid_binary))
{
if (!is_first_round)
{
t_starts[j] = t0;
t_ends[j] = t1;
}
++j;
// march to next sample
t0 = t1;
t1 = t0 + calc_dt(t0, cone_angle, dt_min, dt_max);
t_mid = (t0 + t1) * 0.5f;
}
else
{
// march to next sample
switch (type)
{
case ContractionType::AABB:
// no contraction
t_mid = advance_to_next_voxel(
t_mid, dt_min, xyz, dir, inv_dir, roi_min, roi_max, grid_res);
dt = calc_dt(t_mid, cone_angle, dt_min, dt_max);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
break;
default:
// any type of scene contraction does not work with DDA.
t0 = t1;
t1 = t0 + calc_dt(t0, cone_angle, dt_min, dt_max);
t_mid = (t0 + t1) * 0.5f;
break;
}
}
}
if (is_first_round)
{
*num_steps = j;
}
return;
}
std::vector<torch::Tensor> ray_marching(
// rays
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor t_min,
const torch::Tensor t_max,
// occupancy grid & contraction
const torch::Tensor roi,
const torch::Tensor grid_binary,
const ContractionType type,
// sampling
const float step_size,
const float cone_angle)
{
DEVICE_GUARD(rays_o);
CHECK_INPUT(rays_o);
CHECK_INPUT(rays_d);
CHECK_INPUT(t_min);
CHECK_INPUT(t_max);
CHECK_INPUT(roi);
CHECK_INPUT(grid_binary);
TORCH_CHECK(rays_o.ndimension() == 2 & rays_o.size(1) == 3)
TORCH_CHECK(rays_d.ndimension() == 2 & rays_d.size(1) == 3)
TORCH_CHECK(t_min.ndimension() == 1)
TORCH_CHECK(t_max.ndimension() == 1)
TORCH_CHECK(roi.ndimension() == 1 & roi.size(0) == 6)
TORCH_CHECK(grid_binary.ndimension() == 3)
const int n_rays = rays_o.size(0);
const int3 grid_res = make_int3(
grid_binary.size(0), grid_binary.size(1), grid_binary.size(2));
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// helper counter
torch::Tensor num_steps = torch::zeros(
{n_rays}, rays_o.options().dtype(torch::kInt32));
// count number of samples per ray
ray_marching_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// occupancy grid & contraction
roi.data_ptr<float>(),
grid_res,
grid_binary.data_ptr<bool>(),
type,
// sampling
step_size,
cone_angle,
nullptr, /* packed_info */
// outputs
num_steps.data_ptr<int>(),
nullptr, /* t_starts */
nullptr /* t_ends */);
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
// output samples starts and ends
int total_steps = cum_steps[cum_steps.size(0) - 1].item<int>();
torch::Tensor t_starts = torch::zeros({total_steps, 1}, rays_o.options());
torch::Tensor t_ends = torch::zeros({total_steps, 1}, rays_o.options());
ray_marching_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// occupancy grid & contraction
roi.data_ptr<float>(),
grid_res,
grid_binary.data_ptr<bool>(),
type,
// sampling
step_size,
cone_angle,
packed_info.data_ptr<int>(),
// outputs
nullptr, /* num_steps */
t_starts.data_ptr<float>(),
t_ends.data_ptr<float>());
return {packed_info, t_starts, t_ends};
}
// -----------------------------------------------------------------------------
// Ray index for each sample
// -----------------------------------------------------------------------------
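// packed_info stores, for every ray, the offset of its first sample and its sample
// count; this kernel expands that into a flat per-sample ray index so per-sample
// buffers can be scattered back to their rays.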
__global__ void ray_indices_kernel(
// input
const int n_rays,
const int *packed_info,
// output
int *ray_indices)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
ray_indices += base;
for (int j = 0; j < steps; ++j)
{
ray_indices[j] = i;
}
}
torch::Tensor unpack_to_ray_indices(const torch::Tensor packed_info)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
const int n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
int n_samples = packed_info[n_rays - 1].sum(0).item<int>();
torch::Tensor ray_indices = torch::zeros(
{n_samples}, packed_info.options().dtype(torch::kInt32));
ray_indices_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
ray_indices.data_ptr<int>());
return ray_indices;
}
// ----------------------------------------------------------------------------
// Query the occupancy grid
// ----------------------------------------------------------------------------
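// Point-wise occupancy queries: each sample position is tested independently with
// grid_occupied_at (contraction into the unit cube, then a binary-grid lookup).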
__global__ void query_occ_kernel(
// rays info
const uint32_t n_samples,
const float *samples, // shape (n_samples, 3)
// occupancy grid & contraction
const float *roi,
const int3 grid_res,
const bool *grid_binary, // shape (reso_x, reso_y, reso_z)
const ContractionType type,
// outputs
bool *occs)
{
CUDA_GET_THREAD_ID(i, n_samples);
// locate
samples += i * 3;
occs += i;
const float3 roi_min = make_float3(roi[0], roi[1], roi[2]);
const float3 roi_max = make_float3(roi[3], roi[4], roi[5]);
const float3 xyz = make_float3(samples[0], samples[1], samples[2]);
*occs = grid_occupied_at(xyz, roi_min, roi_max, type, grid_res, grid_binary);
return;
}
torch::Tensor query_occ(
const torch::Tensor samples,
// occupancy grid & contraction
const torch::Tensor roi,
const torch::Tensor grid_binary,
const ContractionType type)
{
DEVICE_GUARD(samples);
CHECK_INPUT(samples);
const int n_samples = samples.size(0);
const int3 grid_res = make_int3(
grid_binary.size(0), grid_binary.size(1), grid_binary.size(2));
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_samples, threads);
torch::Tensor occs = torch::zeros(
{n_samples}, samples.options().dtype(torch::kBool));
query_occ_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_samples,
samples.data_ptr<float>(),
// grid
roi.data_ptr<float>(),
grid_res,
grid_binary.data_ptr<bool>(),
type,
// outputs
occs.data_ptr<bool>());
return occs;
}
#include "include/helpers_cuda.h"
template <typename scalar_t>
__global__ void rendering_forward_kernel(
const uint32_t n_rays,
const int *packed_info, // input ray & point indices.
const scalar_t *starts, // input start t
const scalar_t *ends, // input end t
const scalar_t *sigmas, // input density after activation
const scalar_t *alphas, // input alpha (opacity) values.
const scalar_t early_stop_eps, // transmittance threshold for early stop
// outputs: should be all-zero initialized
int *num_steps, // the number of valid steps for each ray
scalar_t *weights, // the rendering weight for each sample
bool *compact_selector // the samples for which we need to compute gradients
)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
if (alphas != nullptr)
{
// rendering with alpha
alphas += base;
}
else
{
// rendering with density
starts += base;
ends += base;
sigmas += base;
}
if (num_steps != nullptr)
{
num_steps += i;
}
if (weights != nullptr)
{
weights += base;
}
if (compact_selector != nullptr)
{
compact_selector += base;
}
// accumulated rendering
scalar_t T = 1.f;
int j = 0;
for (; j < steps; ++j)
{
if (T < early_stop_eps)
{
break;
}
scalar_t alpha;
if (alphas != nullptr)
{
// rendering with alpha
alpha = alphas[j];
}
else
{
// rendering with density
scalar_t delta = ends[j] - starts[j];
alpha = 1.f - __expf(-sigmas[j] * delta);
}
const scalar_t weight = alpha * T;
T *= (1.f - alpha);
if (weights != nullptr)
{
weights[j] = weight;
}
if (compact_selector != nullptr)
{
compact_selector[j] = true;
}
}
if (num_steps != nullptr)
{
*num_steps = j;
}
return;
}
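// Backward pass for w_j = alpha_j * T_j with T_j = prod_{k<j} (1 - alpha_k).
// At step j, `accum` holds sum_{k >= j} grad_w_k * w_k, so
//   (grad_w_j * T - accum) * delta_j
//     = delta_j * (grad_w_j * T_j * (1 - alpha_j) - sum_{k > j} grad_w_k * w_k),
// which is dL/dsigma_j for alpha_j = 1 - exp(-sigma_j * delta_j); dividing by
// (1 - alpha_j) instead of multiplying by delta_j gives dL/dalpha_j.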
template <typename scalar_t>
__global__ void rendering_backward_kernel(
const uint32_t n_rays,
const int *packed_info, // input ray & point indices.
const scalar_t *starts, // input start t
const scalar_t *ends, // input end t
const scalar_t *sigmas, // input density after activation
const scalar_t *alphas, // input alpha (opacity) values.
const scalar_t early_stop_eps, // transmittance threshold for early stop
const scalar_t *weights, // forward output
const scalar_t *grad_weights, // input gradients
// if alphas was given, we compute the gradients for alphas.
// otherwise, we compute the gradients for sigmas.
scalar_t *grad_sigmas, // output gradients
scalar_t *grad_alphas // output gradients
)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
if (alphas != nullptr)
{
// rendering with alpha
alphas += base;
grad_alphas += base;
}
else
{
// rendering with density
starts += base;
ends += base;
sigmas += base;
grad_sigmas += base;
}
weights += base;
grad_weights += base;
scalar_t accum = 0;
for (int j = 0; j < steps; ++j)
{
accum += grad_weights[j] * weights[j];
}
// backward of accumulated rendering
scalar_t T = 1.f;
for (int j = 0; j < steps; ++j)
{
if (T < early_stop_eps)
{
break;
}
scalar_t alpha;
if (alphas != nullptr)
{
// rendering with alpha
alpha = alphas[j];
grad_alphas[j] = (grad_weights[j] * T - accum) / fmaxf(1.f - alpha, 1e-10f);
}
else
{
// rendering with density
scalar_t delta = ends[j] - starts[j];
alpha = 1.f - __expf(-sigmas[j] * delta);
grad_sigmas[j] = (grad_weights[j] * T - accum) * delta;
}
accum -= grad_weights[j] * weights[j];
T *= (1.f - alpha);
}
}
std::vector<torch::Tensor> rendering_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
float early_stop_eps,
bool compression)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
if (compression)
{
// compress the samples to get rid of invisible ones.
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor compact_selector = torch::zeros(
{n_samples}, sigmas.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
// outputs
num_steps.data_ptr<int>(),
nullptr,
compact_selector.data_ptr<bool>()); }));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_packed_info, compact_selector};
}
else
{
// just do the forward rendering.
torch::Tensor weights = torch::zeros({n_samples}, sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
// outputs
nullptr,
weights.data_ptr<scalar_t>(),
nullptr); }));
return {weights};
}
}
torch::Tensor rendering_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
float early_stop_eps)
{
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_sigmas = torch::zeros(sigmas.sizes(), sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_backward",
([&]
{ rendering_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
// outputs
grad_sigmas.data_ptr<scalar_t>(),
nullptr // alphas gradients
); }));
return grad_sigmas;
}
// -- rendering with alphas -- //
std::vector<torch::Tensor> rendering_alphas_forward(
torch::Tensor packed_info,
torch::Tensor alphas,
float early_stop_eps,
bool compression)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(alphas);
TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
TORCH_CHECK(alphas.ndimension() == 2 & alphas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = alphas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
if (compression)
{
// compress the samples to get rid of invisible ones.
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor compact_selector = torch::zeros(
{n_samples}, alphas.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_alphas_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
// outputs
num_steps.data_ptr<int>(),
nullptr,
compact_selector.data_ptr<bool>()); }));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_selector, compact_packed_info};
}
else
{
// just do the forward rendering.
torch::Tensor weights = torch::zeros({n_samples}, alphas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
// outputs
nullptr,
weights.data_ptr<scalar_t>(),
nullptr); }));
return {weights};
}
}
torch::Tensor rendering_alphas_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor alphas,
float early_stop_eps)
{
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = alphas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_alphas = torch::zeros(alphas.sizes(), alphas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_alphas_backward",
([&]
{ rendering_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
// outputs
nullptr, // sigma gradients
grad_alphas.data_ptr<scalar_t>()); }));
return grad_alphas;
}
#include <pybind11/pybind11.h>
#include "include/helpers_cuda.h"
inline __device__ int cascaded_grid_idx_at(
const float x, const float y, const float z,
const int resx, const int resy, const int resz,
const float* aabb
) {
int ix = (int)(((x - aabb[0]) / (aabb[3] - aabb[0])) * resx);
int iy = (int)(((y - aabb[1]) / (aabb[4] - aabb[1])) * resy);
int iz = (int)(((z - aabb[2]) / (aabb[5] - aabb[2])) * resz);
ix = __clamp(ix, 0, resx-1);
iy = __clamp(iy, 0, resy-1);
iz = __clamp(iz, 0, resz-1);
int idx = ix * resy * resz + iy * resz + iz;
return idx;
}
inline __device__ bool grid_occupied_at(
const float x, const float y, const float z,
const int resx, const int resy, const int resz,
const float* aabb, const bool* occ_binary
) {
if (x <= aabb[0] || x >= aabb[3] || y <= aabb[1] || y >= aabb[4] || z <= aabb[2] || z >= aabb[5]) {
return false;
}
int idx = cascaded_grid_idx_at(x, y, z, resx, resy, resz, aabb);
return occ_binary[idx];
}
inline __device__ float distance_to_next_voxel(
float x, float y, float z,
float dir_x, float dir_y, float dir_z,
float idir_x, float idir_y, float idir_z,
const int resx, const int resy, const int resz,
const float* aabb
) { // dda like step
// TODO: this is ugly -- optimize this.
float _x = ((x - aabb[0]) / (aabb[3] - aabb[0])) * resx;
float _y = ((y - aabb[1]) / (aabb[4] - aabb[1])) * resy;
float _z = ((z - aabb[2]) / (aabb[5] - aabb[2])) * resz;
float tx = ((floorf(_x + 0.5f + 0.5f * __sign(dir_x)) - _x) * idir_x) / resx * (aabb[3] - aabb[0]);
float ty = ((floorf(_y + 0.5f + 0.5f * __sign(dir_y)) - _y) * idir_y) / resy * (aabb[4] - aabb[1]);
float tz = ((floorf(_z + 0.5f + 0.5f * __sign(dir_z)) - _z) * idir_z) / resz * (aabb[5] - aabb[2]);
float t = min(min(tx, ty), tz);
return fmaxf(t, 0.0f);
}
inline __device__ float advance_to_next_voxel(
float t,
float x, float y, float z,
float dir_x, float dir_y, float dir_z,
float idir_x, float idir_y, float idir_z,
const int resx, const int resy, const int resz, const float* aabb,
float dt_min) {
// Regular stepping (may be slower but matches non-empty space)
float t_target = t + distance_to_next_voxel(
x, y, z,
dir_x, dir_y, dir_z,
idir_x, idir_y, idir_z,
resx, resy, resz, aabb
);
do {
t += dt_min;
} while (t < t_target);
return t;
}
__global__ void marching_steps_kernel(
// rays info
const uint32_t n_rays,
const float* rays_o, // shape (n_rays, 3)
const float* rays_d, // shape (n_rays, 3)
const float* t_min, // shape (n_rays,)
const float* t_max, // shape (n_rays,)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int resx,
const int resy,
const int resz,
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// sampling
const float dt,
// outputs
int* num_steps
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
num_steps += i;
const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
const float near = t_min[0], far = t_max[0];
int j = 0;
float t0 = near; // TODO(ruilongli): perturb `near` as in ngp_pl?
float t1 = t0 + dt;
float t_mid = (t0 + t1) * 0.5f;
while (t_mid < far) {
// current center
const float x = ox + t_mid * dx;
const float y = oy + t_mid * dy;
const float z = oz + t_mid * dz;
if (grid_occupied_at(x, y, z, resx, resy, resz, aabb, occ_binary)) {
++j;
// march to next sample
t0 = t1;
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
}
else {
// march to next sample
t_mid = advance_to_next_voxel(
t_mid, x, y, z, dx, dy, dz, rdx, rdy, rdz, resx, resy, resz, aabb, dt
);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
}
}
if (j == 0) return;
num_steps[0] = j;
return;
}
__global__ void marching_forward_kernel(
// rays info
const uint32_t n_rays,
const float* rays_o, // shape (n_rays, 3)
const float* rays_d, // shape (n_rays, 3)
const float* t_min, // shape (n_rays,)
const float* t_max, // shape (n_rays,)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int resx,
const int resy,
const int resz,
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// sampling
const float dt,
const int* packed_info,
// frustum outputs
float* frustum_starts,
float* frustum_ends
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
int base = packed_info[i * 2 + 0];
int steps = packed_info[i * 2 + 1];
const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
const float near = t_min[0], far = t_max[0];
// locate
frustum_starts += base;
frustum_ends += base;
int j = 0;
float t0 = near;
float t1 = t0 + dt;
float t_mid = (t0 + t1) / 2.;
while (t_mid < far) {
// current center
const float x = ox + t_mid * dx;
const float y = oy + t_mid * dy;
const float z = oz + t_mid * dz;
if (grid_occupied_at(x, y, z, resx, resy, resz, aabb, occ_binary)) {
frustum_starts[j] = t0;
frustum_ends[j] = t1;
++j;
// march to next sample
t0 = t1;
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
}
else {
// march to next sample
t_mid = advance_to_next_voxel(
t_mid, x, y, z, dx, dy, dz, rdx, rdy, rdz, resx, resy, resz, aabb, dt
);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
}
}
if (j != steps) {
printf("WTF %d v.s. %d\n", j, steps);
}
return;
}
__global__ void ray_indices_kernel(
// input
const int n_rays,
const int* packed_info,
// output
int* ray_indices
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
ray_indices += base;
for (int j = 0; j < steps; ++j) {
ray_indices[j] = i;
}
}
__global__ void occ_query_kernel(
// rays info
const uint32_t n_samples,
const float* samples, // shape (n_samples, 3)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int resx,
const int resy,
const int resz,
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// outputs
bool* occs
) {
CUDA_GET_THREAD_ID(i, n_samples);
// locate
samples += i * 3;
occs += i;
occs[0] = grid_occupied_at(
samples[0], samples[1], samples[2],
resx, resy, resz, aabb, occ_binary
);
return;
}
std::vector<torch::Tensor> volumetric_marching(
// rays
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor t_min,
const torch::Tensor t_max,
// density grid
const torch::Tensor aabb,
const pybind11::list resolution,
const torch::Tensor occ_binary,
// sampling
const float dt
) {
DEVICE_GUARD(rays_o);
CHECK_INPUT(rays_o);
CHECK_INPUT(rays_d);
CHECK_INPUT(t_min);
CHECK_INPUT(t_max);
CHECK_INPUT(aabb);
CHECK_INPUT(occ_binary);
const int n_rays = rays_o.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// helper counter
torch::Tensor num_steps = torch::zeros(
{n_rays}, rays_o.options().dtype(torch::kInt32));
// count number of samples per ray
marching_steps_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution[0].cast<int>(),
resolution[1].cast<int>(),
resolution[2].cast<int>(),
occ_binary.data_ptr<bool>(),
// sampling
dt,
// outputs
num_steps.data_ptr<int>()
);
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
// std::cout << "num_steps" << num_steps.dtype() << std::endl;
// std::cout << "cum_steps" << cum_steps.dtype() << std::endl;
// std::cout << "packed_info" << packed_info.dtype() << std::endl;
// output frustum samples
int total_steps = cum_steps[cum_steps.size(0) - 1].item<int>();
torch::Tensor frustum_starts = torch::zeros({total_steps, 1}, rays_o.options());
torch::Tensor frustum_ends = torch::zeros({total_steps, 1}, rays_o.options());
marching_forward_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution[0].cast<int>(),
resolution[1].cast<int>(),
resolution[2].cast<int>(),
occ_binary.data_ptr<bool>(),
// sampling
dt,
packed_info.data_ptr<int>(),
// outputs
frustum_starts.data_ptr<float>(),
frustum_ends.data_ptr<float>()
);
return {packed_info, frustum_starts, frustum_ends};
}
torch::Tensor unpack_to_ray_indices(const torch::Tensor packed_info) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
const int n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
int n_samples = packed_info[n_rays - 1].sum(0).item<int>();
torch::Tensor ray_indices = torch::zeros(
{n_samples}, packed_info.options().dtype(torch::kInt32));
ray_indices_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
ray_indices.data_ptr<int>()
);
return ray_indices;
}
torch::Tensor query_occ(
const torch::Tensor samples,
// density grid
const torch::Tensor aabb,
const pybind11::list resolution,
const torch::Tensor occ_binary
) {
DEVICE_GUARD(samples);
CHECK_INPUT(samples);
const int n_samples = samples.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_samples, threads);
torch::Tensor occs = torch::zeros(
{n_samples}, samples.options().dtype(torch::kBool));
occ_query_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_samples,
samples.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution[0].cast<int>(),
resolution[1].cast<int>(),
resolution[2].cast<int>(),
occ_binary.data_ptr<bool>(),
// outputs
occs.data_ptr<bool>()
);
return occs;
}
#include "include/helpers_cuda.h"
template <typename scalar_t>
__global__ void volumetric_rendering_steps_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
// output: should be all zero (false) initialized
int* num_steps,
bool* selector
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
starts += base;
ends += base;
sigmas += base;
num_steps += i;
selector += base;
// accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
int j = 0;
for (; j < steps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
const scalar_t weight = alpha * T;
T *= (1.f - alpha);
selector[j] = true;
}
num_steps[0] = j;
return;
}
template <typename scalar_t>
__global__ void volumetric_rendering_weights_forward_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
// should be all-zero initialized
scalar_t* weights // output
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
starts += base;
ends += base;
sigmas += base;
weights += base;
// accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
for (int j = 0; j < steps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
const scalar_t weight = alpha * T;
weights[j] = weight;
T *= (1.f - alpha);
}
}
template <typename scalar_t>
__global__ void volumetric_rendering_weights_backward_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
const scalar_t* weights, // forward output
const scalar_t* grad_weights, // input
scalar_t* grad_sigmas // output
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
starts += base;
ends += base;
sigmas += base;
weights += base;
grad_weights += base;
grad_sigmas += base;
scalar_t accum = 0;
for (int j = 0; j < steps; ++j) {
accum += grad_weights[j] * weights[j];
}
// backward of accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
for (int j = 0; j < steps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
grad_sigmas[j] = delta * (grad_weights[j] * T - accum);
accum -= grad_weights[j] * weights[j];
T *= (1.f - alpha);
}
}
std::vector<torch::Tensor> volumetric_rendering_steps(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas
) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor selector = torch::zeros({n_samples}, packed_info.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_marching_steps",
([&]
{ volumetric_rendering_steps_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
num_steps.data_ptr<int>(),
selector.data_ptr<bool>()
);
}));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_packed_info, selector};
}
torch::Tensor volumetric_rendering_weights_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas
) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor weights = torch::zeros({n_samples}, sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_rendering_weights_forward",
([&]
{ volumetric_rendering_weights_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
weights.data_ptr<scalar_t>()
);
}));
return weights;
}
torch::Tensor volumetric_rendering_weights_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas
) {
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_sigmas = torch::zeros(sigmas.sizes(), sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_rendering_weights_backward",
([&]
{ volumetric_rendering_weights_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
grad_sigmas.data_ptr<scalar_t>()
);
}));
return grad_sigmas;
}
from typing import Callable, Optional, Tuple
import torch
from .grid import Grid
from .ray_marching import ray_marching, unpack_to_ray_indices
from .vol_rendering import accumulate_along_rays, render_weight_from_density
def rendering(
# radiance field
rgb_sigma_fn: Callable,
# ray marching results
packed_info: torch.Tensor,
t_starts: torch.Tensor,
t_ends: torch.Tensor,
# rendering options
early_stop_eps: float = 1e-4,
render_bkgd: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Render the rays through the radience field defined by `rgb_sigma_fn`.
This function is differentiable with respect to the outputs of `rgb_sigma_fn`, so it can be used for
gradient-based optimization.
Warning:
This function is not differentiable with respect to `t_starts` and `t_ends`.
Args:
rgb_sigma_fn: A function that takes in samples {t_starts (N, 1), t_ends (N, 1), \
ray indices (N,)} and returns the post-activation rgb (N, 3) and density \
values (N, 1).
packed_info: Packed ray marching info. See :func:`ray_marching` for details.
t_starts: Per-sample start distance. Tensor with shape (n_samples, 1).
t_ends: Per-sample end distance. Tensor with shape (n_samples, 1).
early_stop_eps: Early stop threshold during transmittance accumulation. Default: 1e-4.
render_bkgd: Optional. Background color. Tensor with shape (3,).
Returns:
Ray colors (n_rays, 3), opacities (n_rays, 1) and depths (n_rays, 1).
Examples:
.. code-block:: python
import torch
from nerfacc import OccupancyGrid, ray_marching, rendering
device = "cuda:0"
batch_size = 128
rays_o = torch.rand((batch_size, 3), device=device)
rays_d = torch.randn((batch_size, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
# Ray marching.
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
)
# Rendering.
def rgb_sigma_fn(t_starts, t_ends, ray_indices):
# This is a dummy function that returns random values.
rgbs = torch.rand((t_starts.shape[0], 3), device=device)
sigmas = torch.rand((t_starts.shape[0], 1), device=device)
return rgbs, sigmas
colors, opacities, depths = rendering(rgb_sigma_fn, packed_info, t_starts, t_ends)
# torch.Size([128, 3]) torch.Size([128, 1]) torch.Size([128, 1])
print(colors.shape, opacities.shape, depths.shape)
"""
n_rays = packed_info.shape[0]
ray_indices = unpack_to_ray_indices(packed_info)
# Query sigma and color with gradients
rgbs, sigmas = rgb_sigma_fn(t_starts, t_ends, ray_indices)
assert rgbs.shape[-1] == 3, "rgbs must have 3 channels, got {}".format(
rgbs.shape
)
assert (
sigmas.shape == t_starts.shape
), "sigmas must have shape of (N, 1)! Got {}".format(sigmas.shape)
# Rendering: compute weights and ray indices.
weights = render_weight_from_density(
packed_info, t_starts, t_ends, sigmas, early_stop_eps
)
# Rendering: accumulate rgbs, opacities, and depths along the rays.
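    # Summing the weights alone (values=None) yields per-ray opacity; weighting the
    # segment midpoints (t_starts + t_ends) / 2 yields the expected depth.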
colors = accumulate_along_rays(
weights, ray_indices, values=rgbs, n_rays=n_rays
)
opacities = accumulate_along_rays(
weights, ray_indices, values=None, n_rays=n_rays
)
depths = accumulate_along_rays(
weights,
ray_indices,
values=(t_starts + t_ends) / 2.0,
n_rays=n_rays,
)
# Background composition.
if render_bkgd is not None:
colors = colors + render_bkgd * (1.0 - opacities)
return colors, opacities, depths
def volumetric_rendering(
# radiance field
sigma_fn: Callable,
rgb_sigma_fn: Callable,
# rays
rays_o: torch.Tensor,
rays_d: torch.Tensor,
t_min: Optional[torch.Tensor] = None,
t_max: Optional[torch.Tensor] = None,
# bounding box of the scene
scene_aabb: Optional[torch.Tensor] = None,
# grid for skipping samples
grid: Optional[Grid] = None,
# rendering options
near_plane: Optional[float] = None,
far_plane: Optional[float] = None,
render_step_size: float = 1e-3,
stratified: bool = False,
cone_angle: float = 0.0,
early_stop_eps: float = 1e-4,
render_bkgd: Optional[torch.Tensor] = None,
return_extra_info: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, int, int]:
"""Differentiable volumetric rendering pipeline.
This function combines the following individual functions:
- ray_aabb_intersect: ray AABB intersection.
- ray_marching: ray marching with grid-based skipping.
- compute_weights: compute transmittance and compress samples.
- accumulate_along_rays: accumulate samples along rays to get final per-ray RGB etc.
Args:
sigma_fn: A function that takes in samples {t_starts (N, 1), t_ends (N, 1),
ray indices (N,)} and returns the post-activation density values (N, 1).
rgb_sigma_fn: A function that takes in samples {t_starts (N, 1), t_ends (N, 1),
ray indices (N,)} and returns the post-activation rgb (N, 3) and density
values (N, 1).
rays_o: Ray origins. Tensor with shape (n_rays, 3).
rays_d: Normalized ray directions. Tensor with shape (n_rays, 3).
t_min: Optional. Per-ray minimum distance. Tensor with shape (n_rays).
t_max: Optional. Per-ray maximum distance. Tensor with shape (n_rays).
scene_aabb: Optional. Scene bounding box for computing t_min and t_max.
A tensor with shape (6,) {xmin, ymin, zmin, xmax, ymax, zmax}.
scene_aabb will be ignored if both t_min and t_max are provided.
grid: Optional. Grid that indicates where to skip during marching.
See :class:`nerfacc.Grid` for details.
near_plane: Optional. Near plane distance. If provided, it will be used
to clip t_min.
far_plane: Optional. Far plane distance. If provided, it will be used
to clip t_max.
render_step_size: Step size for marching. Default: 1e-3.
stratified: Whether to use stratified sampling. Default: False.
cone_angle: Cone angle for linearly-increased step size. 0. means
constant step size. Default: 0.0.
early_stop_eps: Early stop threshold for marching. Default: 1e-4.
render_bkgd: Optional. Background color. If provided, it will be used
to fill the background. Default: None.
return_extra_info: Whether to return extra info. Default: False.
Returns:
Ray colors (n_rays, 3), opacities (n_rays, 1) and depths (n_rays, 1).
If return_extra_info is True, it will also return a dictionary of extra info,
including:
- "n_marching_samples": Total number of samples kept after marching.
- "n_rendering_samples": Total number of samples used for actual rendering.
"""
assert rays_o.shape == rays_d.shape and rays_o.dim() == 2, "Invalid rays."
n_rays = rays_o.shape[0]
rays_o = rays_o.contiguous()
rays_d = rays_d.contiguous()
extra_info = {}
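    # Sample generation below runs without gradients; gradients flow only through
    # the radiance-field queries inside `rendering` further down.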
with torch.no_grad():
# Ray marching with skipping.
packed_info, t_starts, t_ends = ray_marching(
rays_o,
rays_d,
t_min=t_min,
t_max=t_max,
scene_aabb=scene_aabb,
grid=grid,
sigma_fn=sigma_fn,
early_stop_eps=early_stop_eps,
near_plane=near_plane,
far_plane=far_plane,
render_step_size=render_step_size,
stratified=stratified,
cone_angle=cone_angle,
)
extra_info["n_rendering_samples"] = len(t_starts)
colors, opacities, depths = rendering(
rgb_sigma_fn,
packed_info=packed_info,
t_starts=t_starts,
t_ends=t_ends,
early_stop_eps=early_stop_eps,
render_bkgd=render_bkgd,
)
if return_extra_info:
return colors, opacities, depths, extra_info
else:
return colors, opacities, depths
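A minimal usage sketch of the pipeline above, for orientation only: it assumes `volumetric_rendering` is importable from the `nerfacc` package, uses dummy radiance-field callbacks instead of a real network, and omits the occupancy `grid` argument (assuming the Python `ray_marching` wrapper tolerates `grid=None`, so no space skipping happens). It is not an example shipped with this commit.

import torch

# Import path is assumed; adjust to wherever `volumetric_rendering` is exported.
from nerfacc import volumetric_rendering

device = "cuda:0"
rays_o = torch.rand((128, 3), device=device)
rays_d = torch.randn((128, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
scene_aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device=device)

def sigma_fn(t_starts, t_ends, ray_indices):
    # Dummy densities; a real model would query a network here.
    return torch.rand_like(t_starts)

def rgb_sigma_fn(t_starts, t_ends, ray_indices):
    # Dummy colors and densities with the expected (N, 3) / (N, 1) shapes.
    return torch.rand((t_starts.shape[0], 3), device=device), torch.rand_like(t_starts)

colors, opacities, depths = volumetric_rendering(
    sigma_fn,
    rgb_sigma_fn,
    rays_o,
    rays_d,
    scene_aabb=scene_aabb,
    render_step_size=1e-3,
    render_bkgd=torch.ones(3, device=device),
)
print(colors.shape, opacities.shape, depths.shape)  # (128, 3), (128, 1), (128, 1)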
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "nerfacc"
version = "0.0.9"
version = "0.1.0"
authors = [{name = "Ruilong", email = "ruilongli94@gmail.com"}]
license = { text="MIT" }
requires-python = ">=3.8"
@@ -35,6 +35,11 @@ dev = [
[tool.black]
line-length = 80
[tool.isort]
multi_line_output = 3
line_length = 80
include_trailing_comma = true
# pylint
[tool.pylint.messages_control]
max-line-length = 80
import torch
import tqdm
from nerfacc import volumetric_rendering_pipeline
device = "cuda:0"
def sigma_fn(frustum_starts, frustum_ends, ray_indices):
return torch.rand_like(frustum_ends[:, :1])
def rgb_sigma_fn(frustum_starts, frustum_ends, ray_indices):
return torch.rand(
(frustum_ends.shape[0], 3), device=device
), torch.rand_like(frustum_ends)
def test_rendering():
scene_aabb = torch.tensor([0, 0, 0, 1, 1, 1], device=device).float()
scene_resolution = [128, 128, 128]
scene_occ_binary = torch.ones((128 * 128 * 128), device=device).bool()
rays_o = torch.rand((10000, 3), device=device)
rays_d = torch.randn((10000, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
render_bkgd = torch.ones(3, device=device)
for step in tqdm.tqdm(range(1000)):
volumetric_rendering_pipeline(
sigma_fn,
rgb_sigma_fn,
rays_o,
rays_d,
scene_aabb,
scene_resolution,
scene_occ_binary,
render_bkgd,
render_step_size=1e-3,
near_plane=0.0,
stratified=False,
)
if __name__ == "__main__":
test_rendering()
import torch
import tqdm
from nerfacc import OccupancyField
device = "cuda:0"
def occ_eval_fn(positions: torch.Tensor) -> torch.Tensor:
return torch.rand_like(positions[:, :1])
def test_occ_field():
occ_field = OccupancyField(occ_eval_fn, aabb=[0, 0, 0, 1, 1, 1]).to(device)
for step in tqdm.tqdm(range(50000)):
occ_field.every_n_step(step, occ_thre=0.1)
if __name__ == "__main__":
test_occ_field()