Unverified Commit 8dcfbad9 authored by Ruilong Li (李瑞龙), committed by GitHub

Reformat (#31)



* seems working

* contraction func in cuda

* Update type

* More type updates

* disable DDA for contraction

* update contraction performance in readme

* 360 data: Garden

* eval at max_steps

* add performance of 360 to readme

* fix contraction scaling

* tiny hot fix

* new volrend

* cleanup ray_marching.cu

* cleanup backend

* tests

* cleaning up Grid

* fix doc for grid base class

* check and fix for contraction

* test grid

* rendering and marching

* transmittance_compress verified

* rendering is indeed faster

* pipeline is working

* lego example

* cleanup

* cuda folder is cleaned up! finally!

* cuda formatting

* contraction verify

* upgrade grid

* test for ray marching

* pipeline

* ngp with contraction

* train_ngp runs but slow

* transmittance separated into two. Now NGP is as fast as before

* verified faster than before

* bug fix for contraction

* ngp contraction fix

* tiny cleanup

* contraction works! yay!

* contraction with tanh seems working

* minor update

* support alpha rendering

* absorb visibility to ray marching

* tiny import update

* get rid of contraction temperature

* doc for ContractionType

* doc for Grid

* doc for grid.py is done

* doc for ray marching

* rendering function

* fix doc for rendering

* doc for vol rend

* autosummary for utils

* fix autosummary line break

* utils docs

* api doc is done

* starting work on examples

* contraction for ngp is in python now

* further clean up examples

* mlp nerf is running

* dnerf is in

* update readme command

* merge

* disable pylint error for now

* reformatting and skip tests without cuda

* fix the type issue for contractiontype

* fix cuda attribute issue

* bump to 0.1.0
Co-authored-by: Matt Tancik <tancik@berkeley.edu>
parent a7611603
#include "include/helpers_cuda.h"
#include "include/helpers_math.h"
#include "include/helpers_contraction.h"
inline __device__ __host__ float calc_dt(
const float t, const float cone_angle,
const float dt_min, const float dt_max)
{
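// Step size grows linearly with distance t (cone_angle = 0 gives a
// constant step), clamped to [dt_min, dt_max].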
return clamp(t * cone_angle, dt_min, dt_max);
}
inline __device__ __host__ int grid_idx_at(
const float3 xyz_unit, const int3 grid_res)
{
// xyz should always be in [0, 1]^3.
int3 ixyz = make_int3(xyz_unit * make_float3(grid_res));
ixyz = clamp(ixyz, make_int3(0, 0, 0), grid_res - 1);
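// Flatten (x, y, z) in row-major order: idx = x * res_y * res_z + y * res_z + z.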
int3 grid_offset = make_int3(grid_res.y * grid_res.z, grid_res.z, 1);
int idx = dot(ixyz, grid_offset);
return idx;
}
inline __device__ __host__ bool grid_occupied_at(
const float3 xyz,
const float3 roi_min, const float3 roi_max,
ContractionType type,
const int3 grid_res, const bool *grid_binary)
{
if (type == ContractionType::AABB &&
(xyz.x < roi_min.x || xyz.x > roi_max.x ||
xyz.y < roi_min.y || xyz.y > roi_max.y ||
xyz.z < roi_min.z || xyz.z > roi_max.z))
{
return false;
}
float3 xyz_unit = apply_contraction(
xyz, roi_min, roi_max, type);
int idx = grid_idx_at(xyz_unit, grid_res);
return grid_binary[idx];
}
// DDA-like step: distance along the ray to the next voxel boundary of the grid.
inline __device__ __host__ float distance_to_next_voxel(
const float3 xyz, const float3 dir, const float3 inv_dir,
const float3 roi_min, const float3 roi_max, const int3 grid_res)
{
float3 _occ_res = make_float3(grid_res);
float3 _xyz = roi_to_unit(xyz, roi_min, roi_max) * _occ_res;
float3 txyz = ((floorf(_xyz + 0.5f + 0.5f * sign(dir)) - _xyz) * inv_dir) / _occ_res * (roi_max - roi_min);
float t = min(min(txyz.x, txyz.y), txyz.z);
return fmaxf(t, 0.0f);
}
inline __device__ __host__ float advance_to_next_voxel(
const float t, const float dt_min,
const float3 xyz, const float3 dir, const float3 inv_dir,
const float3 roi_min, const float3 roi_max, const int3 grid_res)
{
// Regular stepping (may be slower but matches non-empty space)
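// Stepping in dt_min increments toward the voxel boundary (instead of
// jumping straight to it) keeps samples on the same marching schedule
// as in occupied space.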
float t_target = t + distance_to_next_voxel(
xyz, dir, inv_dir, roi_min, roi_max, grid_res);
float _t = t;
do
{
_t += dt_min;
} while (_t < t_target);
return _t;
}
// -------------------------------------------------------------------------------
// Raymarching
// -------------------------------------------------------------------------------
__global__ void ray_marching_kernel(
// rays info
const uint32_t n_rays,
const float *rays_o, // shape (n_rays, 3)
const float *rays_d, // shape (n_rays, 3)
const float *t_min, // shape (n_rays,)
const float *t_max, // shape (n_rays,)
// occupancy grid & contraction
const float *roi,
const int3 grid_res,
const bool *grid_binary, // shape (reso_x, reso_y, reso_z)
const ContractionType type,
// sampling
const float step_size,
const float cone_angle,
const int *packed_info,
// first round outputs
int *num_steps,
// second round outputs
float *t_starts,
float *t_ends)
{
CUDA_GET_THREAD_ID(i, n_rays);
bool is_first_round = (packed_info == nullptr);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
if (is_first_round)
{
num_steps += i;
}
else
{
int base = packed_info[i * 2 + 0];
int steps = packed_info[i * 2 + 1];
t_starts += base;
t_ends += base;
}
const float3 origin = make_float3(rays_o[0], rays_o[1], rays_o[2]);
const float3 dir = make_float3(rays_d[0], rays_d[1], rays_d[2]);
const float3 inv_dir = 1.0f / dir;
const float near = t_min[0], far = t_max[0];
const float3 roi_min = make_float3(roi[0], roi[1], roi[2]);
const float3 roi_max = make_float3(roi[3], roi[4], roi[5]);
// TODO: compute dt_max from occ resolution.
float dt_min = step_size;
float dt_max = 1e10f;
int j = 0;
float t0 = near;
float dt = calc_dt(t0, cone_angle, dt_min, dt_max);
float t1 = t0 + dt;
float t_mid = (t0 + t1) * 0.5f;
while (t_mid < far)
{
// current center
const float3 xyz = origin + t_mid * dir;
if (grid_occupied_at(xyz, roi_min, roi_max, type, grid_res, grid_binary))
{
if (!is_first_round)
{
t_starts[j] = t0;
t_ends[j] = t1;
}
++j;
// march to next sample
t0 = t1;
t1 = t0 + calc_dt(t0, cone_angle, dt_min, dt_max);
t_mid = (t0 + t1) * 0.5f;
}
else
{
// march to next sample
switch (type)
{
case ContractionType::AABB:
// no contraction
t_mid = advance_to_next_voxel(
t_mid, dt_min, xyz, dir, inv_dir, roi_min, roi_max, grid_res);
dt = calc_dt(t_mid, cone_angle, dt_min, dt_max);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
break;
default:
// any type of scene contraction does not work with DDA.
t0 = t1;
t1 = t0 + calc_dt(t0, cone_angle, dt_min, dt_max);
t_mid = (t0 + t1) * 0.5f;
break;
}
}
}
if (is_first_round)
{
*num_steps = j;
}
return;
}
std::vector<torch::Tensor> ray_marching(
// rays
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor t_min,
const torch::Tensor t_max,
// occupancy grid & contraction
const torch::Tensor roi,
const torch::Tensor grid_binary,
const ContractionType type,
// sampling
const float step_size,
const float cone_angle)
{
DEVICE_GUARD(rays_o);
CHECK_INPUT(rays_o);
CHECK_INPUT(rays_d);
CHECK_INPUT(t_min);
CHECK_INPUT(t_max);
CHECK_INPUT(roi);
CHECK_INPUT(grid_binary);
TORCH_CHECK(rays_o.ndimension() == 2 && rays_o.size(1) == 3);
TORCH_CHECK(rays_d.ndimension() == 2 && rays_d.size(1) == 3);
TORCH_CHECK(t_min.ndimension() == 1);
TORCH_CHECK(t_max.ndimension() == 1);
TORCH_CHECK(roi.ndimension() == 1 && roi.size(0) == 6);
TORCH_CHECK(grid_binary.ndimension() == 3);
const int n_rays = rays_o.size(0);
const int3 grid_res = make_int3(
grid_binary.size(0), grid_binary.size(1), grid_binary.size(2));
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// helper counter
torch::Tensor num_steps = torch::zeros(
{n_rays}, rays_o.options().dtype(torch::kInt32));
// count number of samples per ray
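// Pass 1 of 2: launched with packed_info == nullptr, so the kernel only
// counts the samples that survive the occupancy test (num_steps).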
ray_marching_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// occupancy grid & contraction
roi.data_ptr<float>(),
grid_res,
grid_binary.data_ptr<bool>(),
type,
// sampling
step_size,
cone_angle,
nullptr, /* packed_info */
// outputs
num_steps.data_ptr<int>(),
nullptr, /* t_starts */
nullptr /* t_ends */);
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
// output samples starts and ends
int total_steps = cum_steps[cum_steps.size(0) - 1].item<int>();
torch::Tensor t_starts = torch::zeros({total_steps, 1}, rays_o.options());
torch::Tensor t_ends = torch::zeros({total_steps, 1}, rays_o.options());
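// Pass 2 of 2: relaunch with per-ray offsets (packed_info) so each ray
// writes its sample intervals into the packed t_starts/t_ends buffers.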
ray_marching_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// occupancy grid & contraction
roi.data_ptr<float>(),
grid_res,
grid_binary.data_ptr<bool>(),
type,
// sampling
step_size,
cone_angle,
packed_info.data_ptr<int>(),
// outputs
nullptr, /* num_steps */
t_starts.data_ptr<float>(),
t_ends.data_ptr<float>());
return {packed_info, t_starts, t_ends};
}
// -----------------------------------------------------------------------------
// Ray index for each sample
// -----------------------------------------------------------------------------
__global__ void ray_indices_kernel(
// input
const int n_rays,
const int *packed_info,
// output
int *ray_indices)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
ray_indices += base;
for (int j = 0; j < steps; ++j)
{
ray_indices[j] = i;
}
}
torch::Tensor unpack_to_ray_indices(const torch::Tensor packed_info)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
const int n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
int n_samples = packed_info[n_rays - 1].sum(0).item<int>();
torch::Tensor ray_indices = torch::zeros(
{n_samples}, packed_info.options().dtype(torch::kInt32));
ray_indices_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
ray_indices.data_ptr<int>());
return ray_indices;
}
// ----------------------------------------------------------------------------
// Query the occupancy grid
// ----------------------------------------------------------------------------
__global__ void query_occ_kernel(
// rays info
const uint32_t n_samples,
const float *samples, // shape (n_samples, 3)
// occupancy grid & contraction
const float *roi,
const int3 grid_res,
const bool *grid_binary, // shape (reso_x, reso_y, reso_z)
const ContractionType type,
// outputs
bool *occs)
{
CUDA_GET_THREAD_ID(i, n_samples);
// locate
samples += i * 3;
occs += i;
const float3 roi_min = make_float3(roi[0], roi[1], roi[2]);
const float3 roi_max = make_float3(roi[3], roi[4], roi[5]);
const float3 xyz = make_float3(samples[0], samples[1], samples[2]);
*occs = grid_occupied_at(xyz, roi_min, roi_max, type, grid_res, grid_binary);
return;
}
torch::Tensor query_occ(
const torch::Tensor samples,
// occupancy grid & contraction
const torch::Tensor roi,
const torch::Tensor grid_binary,
const ContractionType type)
{
DEVICE_GUARD(samples);
CHECK_INPUT(samples);
const int n_samples = samples.size(0);
const int3 grid_res = make_int3(
grid_binary.size(0), grid_binary.size(1), grid_binary.size(2));
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_samples, threads);
torch::Tensor occs = torch::zeros(
{n_samples}, samples.options().dtype(torch::kBool));
query_occ_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_samples,
samples.data_ptr<float>(),
// grid
roi.data_ptr<float>(),
grid_res,
grid_binary.data_ptr<bool>(),
type,
// outputs
occs.data_ptr<bool>());
return occs;
}
#include "include/helpers_cuda.h"
template <typename scalar_t>
__global__ void rendering_forward_kernel(
const uint32_t n_rays,
const int *packed_info, // input ray & point indices.
const scalar_t *starts, // input start t
const scalar_t *ends, // input end t
const scalar_t *sigmas, // input density after activation
const scalar_t *alphas, // input alpha (opacity) values.
const scalar_t early_stop_eps, // transmittance threshold for early stop
// outputs: should be all-zero initialized
int *num_steps, // the number of valid steps for each ray
scalar_t *weights, // the rendering weight for each sample
bool *compact_selector // the samples for which we need to compute gradients
)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
if (alphas != nullptr)
{
// rendering with alpha
alphas += base;
}
else
{
// rendering with density
starts += base;
ends += base;
sigmas += base;
}
if (num_steps != nullptr)
{
num_steps += i;
}
if (weights != nullptr)
{
weights += base;
}
if (compact_selector != nullptr)
{
compact_selector += base;
}
// accumulated rendering
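// Front-to-back compositing: per-sample opacity alpha_j comes either
// directly from `alphas`, or from density via alpha_j = 1 - exp(-sigma_j * delta_j).
// The weight is weight_j = alpha_j * T_j with transmittance
// T_j = prod_{k < j} (1 - alpha_k); once T falls below early_stop_eps the
// remaining samples are negligible and the loop exits early.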
scalar_t T = 1.f;
int j = 0;
for (; j < steps; ++j)
{
if (T < early_stop_eps)
{
break;
}
scalar_t alpha;
if (alphas != nullptr)
{
// rendering with alpha
alpha = alphas[j];
}
else
{
// rendering with density
scalar_t delta = ends[j] - starts[j];
alpha = 1.f - __expf(-sigmas[j] * delta);
}
const scalar_t weight = alpha * T;
T *= (1.f - alpha);
if (weights != nullptr)
{
weights[j] = weight;
}
if (compact_selector != nullptr)
{
compact_selector[j] = true;
}
}
if (num_steps != nullptr)
{
*num_steps = j;
}
return;
}
template <typename scalar_t>
__global__ void rendering_backward_kernel(
const uint32_t n_rays,
const int *packed_info, // input ray & point indices.
const scalar_t *starts, // input start t
const scalar_t *ends, // input end t
const scalar_t *sigmas, // input density after activation
const scalar_t *alphas, // input alpha (opacity) values.
const scalar_t early_stop_eps, // transmittance threshold for early stop
const scalar_t *weights, // forward output
const scalar_t *grad_weights, // input gradients
// if alphas was given, we compute the gradients for alphas.
// otherwise, we compute the gradients for sigmas.
scalar_t *grad_sigmas, // output gradients
scalar_t *grad_alphas // output gradients
)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
if (alphas != nullptr)
{
// rendering with alpha
alphas += base;
grad_alphas += base;
}
else
{
// rendering with density
starts += base;
ends += base;
sigmas += base;
grad_sigmas += base;
}
weights += base;
grad_weights += base;
scalar_t accum = 0;
for (int j = 0; j < steps; ++j)
{
accum += grad_weights[j] * weights[j];
}
// backward of accumulated rendering
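// With w_j = alpha_j * T_j, the density gradient reduces to
//   dL/d sigma_j = delta_j * (grad_w_j * T_j - sum_{k >= j} grad_w_k * w_k),
// where `accum` holds the suffix sum sum_{k >= j} grad_w_k * w_k and is
// shrunk by one term per iteration.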
scalar_t T = 1.f;
for (int j = 0; j < steps; ++j)
{
if (T < early_stop_eps)
{
break;
}
scalar_t alpha;
if (alphas != nullptr)
{
// rendering with alpha
alpha = alphas[j];
grad_alphas[j] = (grad_weights[j] * T - accum) / fmaxf(1.f - alpha, 1e-10f);
}
else
{
// rendering with density
scalar_t delta = ends[j] - starts[j];
alpha = 1.f - __expf(-sigmas[j] * delta);
grad_sigmas[j] = (grad_weights[j] * T - accum) * delta;
}
accum -= grad_weights[j] * weights[j];
T *= (1.f - alpha);
}
}
std::vector<torch::Tensor> rendering_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
float early_stop_eps,
bool compression)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 && sigmas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
if (compression)
{
// compress the samples to get rid of invisible ones.
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor compact_selector = torch::zeros(
{n_samples}, sigmas.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
// outputs
num_steps.data_ptr<int>(),
nullptr,
compact_selector.data_ptr<bool>()); }));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_packed_info, compact_selector};
}
else
{
// just do the forward rendering.
torch::Tensor weights = torch::zeros({n_samples}, sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
// outputs
nullptr,
weights.data_ptr<scalar_t>(),
nullptr); }));
return {weights};
}
}
torch::Tensor rendering_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
float early_stop_eps)
{
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_sigmas = torch::zeros(sigmas.sizes(), sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_backward",
([&]
{ rendering_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
// outputs
grad_sigmas.data_ptr<scalar_t>(),
nullptr // alphas gradients
); }));
return grad_sigmas;
}
// -- rendering with alphas -- //
std::vector<torch::Tensor> rendering_alphas_forward(
torch::Tensor packed_info,
torch::Tensor alphas,
float early_stop_eps,
bool compression)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(alphas);
TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 2);
TORCH_CHECK(alphas.ndimension() == 2 && alphas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = alphas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
if (compression)
{
// compress the samples to get rid of invisible ones.
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor compact_selector = torch::zeros(
{n_samples}, alphas.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_alphas_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
// outputs
num_steps.data_ptr<int>(),
nullptr,
compact_selector.data_ptr<bool>()); }));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_selector, compact_packed_info};
}
else
{
// just do the forward rendering.
torch::Tensor weights = torch::zeros({n_samples}, alphas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
// outputs
nullptr,
weights.data_ptr<scalar_t>(),
nullptr); }));
return {weights};
}
}
torch::Tensor rendering_alphas_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor alphas,
float early_stop_eps)
{
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = alphas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_alphas = torch::zeros(alphas.sizes(), alphas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_alphas_backward",
([&]
{ rendering_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
// outputs
nullptr, // sigma gradients
grad_alphas.data_ptr<scalar_t>()); }));
return grad_alphas;
}
#include <pybind11/pybind11.h>
#include "include/helpers_cuda.h"
inline __device__ int cascaded_grid_idx_at(
const float x, const float y, const float z,
const int resx, const int resy, const int resz,
const float* aabb
) {
int ix = (int)(((x - aabb[0]) / (aabb[3] - aabb[0])) * resx);
int iy = (int)(((y - aabb[1]) / (aabb[4] - aabb[1])) * resy);
int iz = (int)(((z - aabb[2]) / (aabb[5] - aabb[2])) * resz);
ix = __clamp(ix, 0, resx-1);
iy = __clamp(iy, 0, resy-1);
iz = __clamp(iz, 0, resz-1);
int idx = ix * resy * resz + iy * resz + iz;
return idx;
}
inline __device__ bool grid_occupied_at(
const float x, const float y, const float z,
const int resx, const int resy, const int resz,
const float* aabb, const bool* occ_binary
) {
if (x <= aabb[0] || x >= aabb[3] || y <= aabb[1] || y >= aabb[4] || z <= aabb[2] || z >= aabb[5]) {
return false;
}
int idx = cascaded_grid_idx_at(x, y, z, resx, resy, resz, aabb);
return occ_binary[idx];
}
inline __device__ float distance_to_next_voxel(
float x, float y, float z,
float dir_x, float dir_y, float dir_z,
float idir_x, float idir_y, float idir_z,
const int resx, const int resy, const int resz,
const float* aabb
) { // dda like step
// TODO: this is ugly -- optimize this.
float _x = ((x - aabb[0]) / (aabb[3] - aabb[0])) * resx;
float _y = ((y - aabb[1]) / (aabb[4] - aabb[1])) * resy;
float _z = ((z - aabb[2]) / (aabb[5] - aabb[2])) * resz;
float tx = ((floorf(_x + 0.5f + 0.5f * __sign(dir_x)) - _x) * idir_x) / resx * (aabb[3] - aabb[0]);
float ty = ((floorf(_y + 0.5f + 0.5f * __sign(dir_y)) - _y) * idir_y) / resy * (aabb[4] - aabb[1]);
float tz = ((floorf(_z + 0.5f + 0.5f * __sign(dir_z)) - _z) * idir_z) / resz * (aabb[5] - aabb[2]);
float t = min(min(tx, ty), tz);
return fmaxf(t, 0.0f);
}
inline __device__ float advance_to_next_voxel(
float t,
float x, float y, float z,
float dir_x, float dir_y, float dir_z,
float idir_x, float idir_y, float idir_z,
const int resx, const int resy, const int resz, const float* aabb,
float dt_min) {
// Regular stepping (may be slower but matches non-empty space)
float t_target = t + distance_to_next_voxel(
x, y, z,
dir_x, dir_y, dir_z,
idir_x, idir_y, idir_z,
resx, resy, resz, aabb
);
do {
t += dt_min;
} while (t < t_target);
return t;
}
__global__ void marching_steps_kernel(
// rays info
const uint32_t n_rays,
const float* rays_o, // shape (n_rays, 3)
const float* rays_d, // shape (n_rays, 3)
const float* t_min, // shape (n_rays,)
const float* t_max, // shape (n_rays,)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int resx,
const int resy,
const int resz,
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// sampling
const float dt,
// outputs
int* num_steps
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
num_steps += i;
const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
const float near = t_min[0], far = t_max[0];
int j = 0;
float t0 = near; // TODO(ruilongli): perturb `near` as in ngp_pl?
float t1 = t0 + dt;
float t_mid = (t0 + t1) * 0.5f;
while (t_mid < far) {
// current center
const float x = ox + t_mid * dx;
const float y = oy + t_mid * dy;
const float z = oz + t_mid * dz;
if (grid_occupied_at(x, y, z, resx, resy, resz, aabb, occ_binary)) {
++j;
// march to next sample
t0 = t1;
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
}
else {
// march to next sample
t_mid = advance_to_next_voxel(
t_mid, x, y, z, dx, dy, dz, rdx, rdy, rdz, resx, resy, resz, aabb, dt
);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
}
}
if (j == 0) return;
num_steps[0] = j;
return;
}
__global__ void marching_forward_kernel(
// rays info
const uint32_t n_rays,
const float* rays_o, // shape (n_rays, 3)
const float* rays_d, // shape (n_rays, 3)
const float* t_min, // shape (n_rays,)
const float* t_max, // shape (n_rays,)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int resx,
const int resy,
const int resz,
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// sampling
const float dt,
const int* packed_info,
// frustum outputs
float* frustum_starts,
float* frustum_ends
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
int base = packed_info[i * 2 + 0];
int steps = packed_info[i * 2 + 1];
const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
const float near = t_min[0], far = t_max[0];
// locate
frustum_starts += base;
frustum_ends += base;
int j = 0;
float t0 = near;
float t1 = t0 + dt;
float t_mid = (t0 + t1) * 0.5f;
while (t_mid < far) {
// current center
const float x = ox + t_mid * dx;
const float y = oy + t_mid * dy;
const float z = oz + t_mid * dz;
if (grid_occupied_at(x, y, z, resx, resy, resz, aabb, occ_binary)) {
frustum_starts[j] = t0;
frustum_ends[j] = t1;
++j;
// march to next sample
t0 = t1;
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
}
else {
// march to next sample
t_mid = advance_to_next_voxel(
t_mid, x, y, z, dx, dy, dz, rdx, rdy, rdz, resx, resy, resz, aabb, dt
);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
}
}
if (j != steps) {
printf("Error: sample count mismatch in marching_forward_kernel: %d vs. %d\n", j, steps);
}
return;
}
__global__ void ray_indices_kernel(
// input
const int n_rays,
const int* packed_info,
// output
int* ray_indices
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
ray_indices += base;
for (int j = 0; j < steps; ++j) {
ray_indices[j] = i;
}
}
__global__ void occ_query_kernel(
// rays info
const uint32_t n_samples,
const float* samples, // shape (n_samples, 3)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int resx,
const int resy,
const int resz,
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// outputs
bool* occs
) {
CUDA_GET_THREAD_ID(i, n_samples);
// locate
samples += i * 3;
occs += i;
occs[0] = grid_occupied_at(
samples[0], samples[1], samples[2],
resx, resy, resz, aabb, occ_binary
);
return;
}
std::vector<torch::Tensor> volumetric_marching(
// rays
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor t_min,
const torch::Tensor t_max,
// density grid
const torch::Tensor aabb,
const pybind11::list resolution,
const torch::Tensor occ_binary,
// sampling
const float dt
) {
DEVICE_GUARD(rays_o);
CHECK_INPUT(rays_o);
CHECK_INPUT(rays_d);
CHECK_INPUT(t_min);
CHECK_INPUT(t_max);
CHECK_INPUT(aabb);
CHECK_INPUT(occ_binary);
const int n_rays = rays_o.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// helper counter
torch::Tensor num_steps = torch::zeros(
{n_rays}, rays_o.options().dtype(torch::kInt32));
// count number of samples per ray
marching_steps_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution[0].cast<int>(),
resolution[1].cast<int>(),
resolution[2].cast<int>(),
occ_binary.data_ptr<bool>(),
// sampling
dt,
// outputs
num_steps.data_ptr<int>()
);
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
// output frustum samples
int total_steps = cum_steps[cum_steps.size(0) - 1].item<int>();
torch::Tensor frustum_starts = torch::zeros({total_steps, 1}, rays_o.options());
torch::Tensor frustum_ends = torch::zeros({total_steps, 1}, rays_o.options());
marching_forward_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution[0].cast<int>(),
resolution[1].cast<int>(),
resolution[2].cast<int>(),
occ_binary.data_ptr<bool>(),
// sampling
dt,
packed_info.data_ptr<int>(),
// outputs
frustum_starts.data_ptr<float>(),
frustum_ends.data_ptr<float>()
);
return {packed_info, frustum_starts, frustum_ends};
}
torch::Tensor unpack_to_ray_indices(const torch::Tensor packed_info) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
const int n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
int n_samples = packed_info[n_rays - 1].sum(0).item<int>();
torch::Tensor ray_indices = torch::zeros(
{n_samples}, packed_info.options().dtype(torch::kInt32));
ray_indices_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
ray_indices.data_ptr<int>()
);
return ray_indices;
}
torch::Tensor query_occ(
const torch::Tensor samples,
// density grid
const torch::Tensor aabb,
const pybind11::list resolution,
const torch::Tensor occ_binary
) {
DEVICE_GUARD(samples);
CHECK_INPUT(samples);
const int n_samples = samples.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_samples, threads);
torch::Tensor occs = torch::zeros(
{n_samples}, samples.options().dtype(torch::kBool));
occ_query_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_samples,
samples.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution[0].cast<int>(),
resolution[1].cast<int>(),
resolution[2].cast<int>(),
occ_binary.data_ptr<bool>(),
// outputs
occs.data_ptr<bool>()
);
return occs;
}
#include "include/helpers_cuda.h"
template <typename scalar_t>
__global__ void volumetric_rendering_steps_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
// output: should be all zero (false) initialized
int* num_steps,
bool* selector
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
starts += base;
ends += base;
sigmas += base;
num_steps += i;
selector += base;
// accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
int j = 0;
for (; j < steps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
const scalar_t weight = alpha * T;
T *= (1.f - alpha);
selector[j] = true;
}
num_steps[0] = j;
return;
}
template <typename scalar_t>
__global__ void volumetric_rendering_weights_forward_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
// should be all-zero initialized
scalar_t* weights // output
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
starts += base;
ends += base;
sigmas += base;
weights += base;
// accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
for (int j = 0; j < steps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
const scalar_t weight = alpha * T;
weights[j] = weight;
T *= (1.f - alpha);
}
}
template <typename scalar_t>
__global__ void volumetric_rendering_weights_backward_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
const scalar_t* weights, // forward output
const scalar_t* grad_weights, // input
scalar_t* grad_sigmas // output
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
starts += base;
ends += base;
sigmas += base;
weights += base;
grad_weights += base;
grad_sigmas += base;
scalar_t accum = 0;
for (int j = 0; j < steps; ++j) {
accum += grad_weights[j] * weights[j];
}
// backward of accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
for (int j = 0; j < steps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
grad_sigmas[j] = delta * (grad_weights[j] * T - accum);
accum -= grad_weights[j] * weights[j];
T *= (1.f - alpha);
}
}
std::vector<torch::Tensor> volumetric_rendering_steps(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas
) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 && sigmas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor selector = torch::zeros({n_samples}, packed_info.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_marching_steps",
([&]
{ volumetric_rendering_steps_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
num_steps.data_ptr<int>(),
selector.data_ptr<bool>()
);
}));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_packed_info, selector};
}
torch::Tensor volumetric_rendering_weights_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas
) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 && sigmas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor weights = torch::zeros({n_samples}, sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_rendering_weights_forward",
([&]
{ volumetric_rendering_weights_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
weights.data_ptr<scalar_t>()
);
}));
return weights;
}
torch::Tensor volumetric_rendering_weights_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas
) {
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_sigmas = torch::zeros(sigmas.sizes(), sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_rendering_weights_backward",
([&]
{ volumetric_rendering_weights_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
grad_sigmas.data_ptr<scalar_t>()
);
}));
return grad_sigmas;
}
""" Occupancy field for accelerating volumetric rendering. """
from typing import Callable, List, Tuple, Union
from typing import Callable, List, Union
import torch
from torch import nn
import torch.nn as nn
from .contraction import ContractionType, contract_inv
# TODO: add this to the dependency
# from torch_scatter import scatter_max
class Grid(nn.Module):
    """An abstract Grid class.

    The grid is used as a cache of the 3D space to indicate whether each voxel
    area is important or not for the differentiable rendering process. The
    ray marching function (see :func:`nerfacc.ray_marching`) would use the
    grid to skip the unimportant voxel areas.

    To work with :func:`nerfacc.ray_marching`, three attributes must exist:

        - :attr:`roi_aabb`: The axis-aligned bounding box of the region of interest.
        - :attr:`binary`: A 3D binarized tensor of shape {resx, resy, resz}, \
          with torch.bool data type.
        - :attr:`contraction_type`: The contraction type of the grid, indicating how \
          the 3D space is mapped to the grid.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        self._dummy = torch.nn.Parameter(torch.empty(0))

    @property
    def device(self) -> torch.device:
        return self._dummy.device

    @property
    def roi_aabb(self) -> torch.Tensor:
        """The axis-aligned bounding box of the region of interest.

        It is a shape (6,) tensor in the format of {minx, miny, minz, maxx, maxy, maxz}.
        """
        if hasattr(self, "_roi_aabb"):
            return getattr(self, "_roi_aabb")
        else:
            raise NotImplementedError("please set an attribute named _roi_aabb")

    @property
    def binary(self) -> torch.Tensor:
        """A 3D binarized tensor with torch.bool data type.

        The tensor is of shape (resx, resy, resz), in which each boolean value
        represents whether the corresponding voxel should be kept or not.
        """
        if hasattr(self, "_binary"):
            return getattr(self, "_binary")
        else:
            raise NotImplementedError("please set an attribute named _binary")

    @property
    def contraction_type(self) -> ContractionType:
        """The contraction type of the grid.

        The contraction type is an indicator of how the 3D space is contracted
        to this voxel grid. See :class:`nerfacc.ContractionType` for more details.
        """
        if hasattr(self, "_contraction_type"):
            return getattr(self, "_contraction_type")
        else:
            raise NotImplementedError(
                "please set an attribute named _contraction_type"
            )
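# A minimal sketch (illustrative, not part of this commit) of a custom Grid
# subclass that satisfies the three required attributes documented above.
# The class name and resolution below are hypothetical.
#
#   class DenseGrid(Grid):
#       def __init__(self, roi_aabb: torch.Tensor, res: int = 64):
#           super().__init__()
#           self.register_buffer("_roi_aabb", roi_aabb)
#           self.register_buffer(
#               "_binary", torch.ones(res, res, res, dtype=torch.bool)
#           )
#           self._contraction_type = ContractionType.AABB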
class OccupancyGrid(Grid):
    """Occupancy grid: whether each voxel area is occupied or not.

    Args:
        roi_aabb: The axis-aligned bounding box of the region of interest. Useful for mapping
            the 3D space to the grid.
        resolution: The resolution of the grid. If an integer is given, the grid is assumed to
            be a cube. Otherwise, a list or a tensor of shape (3,) is expected. Default: 128.
        contraction_type: The contraction type of the grid. See :class:`nerfacc.ContractionType`
            for more details. Default: :attr:`nerfacc.ContractionType.AABB`.
    """

    NUM_DIM: int = 3

    def __init__(
        self,
        roi_aabb: Union[List[int], torch.Tensor],
        resolution: Union[int, List[int], torch.Tensor] = 128,
        contraction_type: ContractionType = ContractionType.AABB,
    ) -> None:
        super().__init__()
        if isinstance(resolution, int):
            resolution = [resolution] * self.NUM_DIM
        if isinstance(resolution, (list, tuple)):
            resolution = torch.tensor(resolution, dtype=torch.int32)
        assert isinstance(
            resolution, torch.Tensor
        ), f"Invalid type: {type(resolution)}"
        assert resolution.shape == (
            self.NUM_DIM,
        ), f"Invalid shape: {resolution.shape}"

        if isinstance(roi_aabb, (list, tuple)):
            roi_aabb = torch.tensor(roi_aabb, dtype=torch.float32)
        assert isinstance(
            roi_aabb, torch.Tensor
        ), f"Invalid type: {type(roi_aabb)}"
        assert roi_aabb.shape == torch.Size(
            [self.NUM_DIM * 2]
        ), f"Invalid shape: {roi_aabb.shape}"

        # total number of voxels
        self.num_cells = int(resolution.prod().item())

        # required attributes
        self.register_buffer("_roi_aabb", roi_aabb)
        self.register_buffer(
            "_binary", torch.zeros(resolution.tolist(), dtype=torch.bool)
        )
        self._contraction_type = contraction_type

        # helper attributes
        self.register_buffer("resolution", resolution)
        self.register_buffer("occs", torch.zeros(self.num_cells))

        # Grid coords & indices
        grid_coords = _meshgrid3d(resolution).reshape(
            self.num_cells, self.NUM_DIM
        )
        self.register_buffer("grid_coords", grid_coords)
        grid_indices = torch.arange(self.num_cells)
@@ -116,13 +143,14 @@ class OccupancyField(nn.Module):
    @torch.no_grad()
    def _sample_uniform_and_occupied_cells(self, n: int) -> torch.Tensor:
        """Samples both n uniform and occupied cells."""
        uniform_indices = torch.randint(
            self.num_cells, (n,), device=self.device
        )
        occupied_indices = torch.nonzero(self._binary.flatten())[:, 0]
        if n < len(occupied_indices):
            selector = torch.randint(
                len(occupied_indices), (n,), device=self.device
            )
            occupied_indices = occupied_indices[selector]
        indices = torch.cat([uniform_indices, occupied_indices], dim=0)
        return indices
@@ -131,6 +159,7 @@ class OccupancyField(nn.Module):
    def _update(
        self,
        step: int,
        occ_eval_fn: Callable,
        occ_thre: float = 0.01,
        ema_decay: float = 0.95,
        warmup_steps: int = 256,
@@ -147,92 +176,47 @@ class OccupancyField(nn.Module):
        grid_coords = self.grid_coords[indices]
        x = (
            grid_coords + torch.rand_like(grid_coords, dtype=torch.float32)
        ) / self.resolution
        # voxel coordinates [0, 1]^3 -> world
        x = contract_inv(
            x,
            roi=self._roi_aabb,
            type=self._contraction_type,
        )
        occ = occ_eval_fn(x).squeeze(-1)
        # ema update
        self.occs[indices] = torch.maximum(self.occs[indices] * ema_decay, occ)
        # supposed to use scatter_max, but empirically it is almost the same.
        # self.occs, _ = scatter_max(
        #     occ, indices, dim=0, out=self.occs * ema_decay
        # )
        self._binary = (
            self.occs > torch.clamp(self.occs.mean(), max=occ_thre)
        ).reshape(self._binary.shape)
    @torch.no_grad()
    def every_n_step(
        self,
        step: int,
        occ_eval_fn: Callable,
        occ_thre: float = 1e-2,
        ema_decay: float = 0.95,
        warmup_steps: int = 256,
        n: int = 16,
    ) -> None:
        """Update the grid every n steps during training.

        Args:
            step: Current training step.
            occ_eval_fn: A function that takes in sample locations :math:`(N, 3)` and
                returns the occupancy values :math:`(N, 1)` at those locations.
            occ_thre: Threshold used to binarize the occupancy grid. Default: 1e-2.
            ema_decay: The decay rate for EMA updates. Default: 0.95.
            warmup_steps: Sample all cells during the warmup stage. After the warmup
                stage we change the sampling strategy to 1/4 uniformly sampled cells
                together with 1/4 occupied cells. Default: 256.
            n: Update the grid every n steps. Default: 16.
        """
        if not self.training:
            raise RuntimeError(
@@ -243,18 +227,31 @@ class OccupancyField(nn.Module):
        if step % n == 0 and self.training:
            self._update(
                step=step,
                occ_eval_fn=occ_eval_fn,
                occ_thre=occ_thre,
                ema_decay=ema_decay,
                warmup_steps=warmup_steps,
            )


def _meshgrid3d(
    res: torch.Tensor, device: Union[torch.device, str] = "cpu"
) -> torch.Tensor:
    """Create 3D grid coordinates."""
    assert len(res) == 3
    res = res.tolist()
    return (
        torch.stack(
            torch.meshgrid(
                [
                    torch.arange(res[0]),
                    torch.arange(res[1]),
                    torch.arange(res[2]),
                ],
                indexing="ij",
            ),
            dim=-1,
        )
        .long()
        .to(device)
    )
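# Usage sketch (assumed, not from this commit): update the grid inside a
# training loop. `radiance_field.query_density` and `max_steps` are
# hypothetical; any callable mapping (N, 3) points to (N, 1) densities works.
#
#   grid = OccupancyGrid(roi_aabb=[0., 0., 0., 1., 1., 1.], resolution=128)
#   for step in range(max_steps):
#       grid.every_n_step(
#           step=step,
#           occ_eval_fn=lambda x: radiance_field.query_density(x),
#       )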
from typing import Callable, Optional, Tuple
import torch
from .grid import Grid
from .ray_marching import ray_marching, unpack_to_ray_indices
from .vol_rendering import accumulate_along_rays, render_weight_from_density
def rendering(
# radiance field
rgb_sigma_fn: Callable,
# ray marching results
packed_info: torch.Tensor,
t_starts: torch.Tensor,
t_ends: torch.Tensor,
# rendering options
early_stop_eps: float = 1e-4,
render_bkgd: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Render the rays through the radience field defined by `rgb_sigma_fn`.
This function is differentiable to the outputs of `rgb_sigma_fn` so it can be used for
gradient-based optimization.
Warning:
This function is not differentiable to `t_starts`, `t_ends`.
Args:
rgb_sigma_fn: A function that takes in samples {t_starts (N, 1), t_ends (N, 1), \
ray indices (N,)} and returns the post-activation rgb (N, 3) and density \
values (N, 1).
packed_info: Packed ray marching info. See :func:`ray_marching` for details.
t_starts: Per-sample start distance. Tensor with shape (n_samples, 1).
t_ends: Per-sample end distance. Tensor with shape (n_samples, 1).
early_stop_eps: Early stop threshold during transmittance accumulation. Default: 1e-4.
render_bkgd: Optional. Background color. Tensor with shape (3,).
Returns:
Ray colors (n_rays, 3), opacities (n_rays, 1) and depths (n_rays, 1).
Examples:
.. code-block:: python
import torch
from nerfacc import OccupancyGrid, ray_marching, rendering
device = "cuda:0"
batch_size = 128
rays_o = torch.rand((batch_size, 3), device=device)
rays_d = torch.randn((batch_size, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
# Ray marching.
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
)
# Rendering.
def rgb_sigma_fn(t_starts, t_ends, ray_indices):
# This is a dummy function that returns random values.
rgbs = torch.rand((t_starts.shape[0], 3), device=device)
sigmas = torch.rand((t_starts.shape[0], 1), device=device)
return rgbs, sigmas
colors, opacities, depths = rendering(rgb_sigma_fn, packed_info, t_starts, t_ends)
# torch.Size([128, 3]) torch.Size([128, 1]) torch.Size([128, 1])
print(colors.shape, opacities.shape, depths.shape)
"""
n_rays = packed_info.shape[0]
ray_indices = unpack_to_ray_indices(packed_info)
# Query sigma and color with gradients
rgbs, sigmas = rgb_sigma_fn(t_starts, t_ends, ray_indices)
assert rgbs.shape[-1] == 3, "rgbs must have 3 channels, got {}".format(
rgbs.shape
)
assert (
sigmas.shape == t_starts.shape
), "sigmas must have shape of (N, 1)! Got {}".format(sigmas.shape)
# Rendering: compute weights and ray indices.
weights = render_weight_from_density(
packed_info, t_starts, t_ends, sigmas, early_stop_eps
)
# Rendering: accumulate rgbs, opacities, and depths along the rays.
colors = accumulate_along_rays(
weights, ray_indices, values=rgbs, n_rays=n_rays
)
opacities = accumulate_along_rays(
weights, ray_indices, values=None, n_rays=n_rays
)
depths = accumulate_along_rays(
weights,
ray_indices,
values=(t_starts + t_ends) / 2.0,
n_rays=n_rays,
)
# Background composition.
if render_bkgd is not None:
colors = colors + render_bkgd * (1.0 - opacities)
return colors, opacities, depths
def volumetric_rendering(
# radiance field
sigma_fn: Callable,
rgb_sigma_fn: Callable,
# rays
rays_o: torch.Tensor,
rays_d: torch.Tensor,
t_min: Optional[torch.Tensor] = None,
t_max: Optional[torch.Tensor] = None,
# bounding box of the scene
scene_aabb: Optional[torch.Tensor] = None,
# grid for skipping samples
grid: Optional[Grid] = None,
# rendering options
near_plane: Optional[float] = None,
far_plane: Optional[float] = None,
render_step_size: float = 1e-3,
stratified: bool = False,
cone_angle: float = 0.0,
early_stop_eps: float = 1e-4,
render_bkgd: Optional[torch.Tensor] = None,
return_extra_info: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Differentiable volumetric rendering pipeline.
This function integrates the following individual functions:
- ray_aabb_intersect: ray AABB intersection.
- ray_marching: ray marching with grid-based skipping.
- compute_weights: compute transmittance and compress samples.
- accumulate_along_rays: accumulate samples along rays to get final per-ray RGB etc.
Args:
sigma_fn: A function that takes in samples {t_starts (N, 1), t_ends (N, 1),
ray indices (N,)} and returns the post-activation density values (N, 1).
rgb_sigma_fn: A function that takes in samples {t_starts (N, 1), t_ends (N, 1),
ray indices (N,)} and returns the post-activation rgb (N, 3) and density
values (N, 1).
rays_o: Ray origins. Tensor with shape (n_rays, 3).
rays_d: Normalized ray directions. Tensor with shape (n_rays, 3).
t_min: Optional. Per-ray minimum distance. Tensor with shape (n_rays).
t_max: Optional. Per-ray maximum distance. Tensor with shape (n_rays).
scene_aabb: Optional. Scene bounding box for computing t_min and t_max.
A tensor with shape (6,) {xmin, ymin, zmin, xmax, ymax, zmax}.
scene_aabb will be ignored if both t_min and t_max are provided.
grid: Optional. Grid that indicates where to skip during marching.
See :class:`nerfacc.Grid` for details.
near_plane: Optional. Near plane distance. If provided, it will be used
to clip t_min.
far_plane: Optional. Far plane distance. If provided, it will be used
to clip t_max.
render_step_size: Step size for marching. Default: 1e-3.
stratified: Whether to use stratified sampling. Default: False.
cone_angle: Cone angle for linearly-increased step size. 0. means
constant step size. Default: 0.0.
early_stop_eps: Early stop threshold for marching. Default: 1e-4.
render_bkgd: Optional. Background color. If provided, it will be used
to fill the background. Default: None.
return_extra_info: Whether to return extra info. Default: False.
Returns:
Ray colors (n_rays, 3), opacities (n_rays, 1) and depths (n_rays, 1).
If return_extra_info is True, it will also return a dictionary of extra info,
including:
- "n_marching_samples": Total number of samples kept after marching.
- "n_rendering_samples": Total number of samples used for actual rendering.
"""
assert rays_o.shape == rays_d.shape and rays_o.dim() == 2, "Invalid rays."
n_rays = rays_o.shape[0]
rays_o = rays_o.contiguous()
rays_d = rays_d.contiguous()
extra_info = {}
with torch.no_grad():
# Ray marching with skipping.
packed_info, t_starts, t_ends = ray_marching(
rays_o,
rays_d,
t_min=t_min,
t_max=t_max,
scene_aabb=scene_aabb,
grid=grid,
sigma_fn=sigma_fn,
early_stop_eps=early_stop_eps,
near_plane=near_plane,
far_plane=far_plane,
render_step_size=render_step_size,
stratified=stratified,
cone_angle=cone_angle,
)
extra_info["n_rendering_samples"] = len(t_starts)
colors, opacities, depths = rendering(
rgb_sigma_fn,
packed_info=packed_info,
t_starts=t_starts,
t_ends=t_ends,
early_stop_eps=early_stop_eps,
render_bkgd=render_bkgd,
)
if return_extra_info:
return colors, opacities, depths, extra_info
else:
return colors, opacities, depths
from typing import Callable, Optional, Tuple
import torch
from torch import Tensor
import nerfacc.cuda as _C
from .grid import Grid
from .vol_rendering import render_visibility
@torch.no_grad()
def ray_aabb_intersect(
rays_o: Tensor, rays_d: Tensor, aabb: Tensor
) -> Tuple[Tensor, Tensor]:
"""Ray AABB Test.
Note:
this function is not differentiable to any inputs.
Args:
rays_o: Ray origins of shape (n_rays, 3).
rays_d: Normalized ray directions of shape (n_rays, 3).
aabb: Scene bounding box {xmin, ymin, zmin, xmax, ymax, zmax}. \
Tensor with shape (6)
Returns:
Ray AABB intersection {t_min, t_max} with shape (n_rays) respectively. \
Note that t_min is clipped to a minimum of zero. A value of 1e10 means no intersection.
Examples:
.. code-block:: python
aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device="cuda:0")
rays_o = torch.rand((128, 3), device="cuda:0")
rays_d = torch.randn((128, 3), device="cuda:0")
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
t_min, t_max = ray_aabb_intersect(rays_o, rays_d, aabb)
"""
if rays_o.is_cuda and rays_d.is_cuda and aabb.is_cuda:
rays_o = rays_o.contiguous()
rays_d = rays_d.contiguous()
aabb = aabb.contiguous()
t_min, t_max = _C.ray_aabb_intersect(rays_o, rays_d, aabb)
else:
raise NotImplementedError("Only support cuda inputs.")
return t_min, t_max
@torch.no_grad()
def unpack_to_ray_indices(packed_info: Tensor) -> Tensor:
"""Unpack `packed_info` to `ray_indices`. Useful for converting per ray data to per sample data.
Note:
this function is not differentiable to any inputs.
Args:
packed_info: Stores information on which samples belong to the same ray. \
See :func:`nerfacc.ray_marching` for details. Tensor with shape (n_rays, 2).
Returns:
Ray index of each sample. LongTensor with shape (n_samples).
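For example, a packed_info of [[0, 2], [2, 3]] means ray 0 owns the 2 \
samples starting at index 0 and ray 1 owns the 3 samples starting at \
index 2, so it unpacks to ray_indices [0, 0, 1, 1, 1].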
Examples:
.. code-block:: python
rays_o = torch.rand((128, 3), device="cuda:0")
rays_d = torch.randn((128, 3), device="cuda:0")
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
# Ray marching with near far plane.
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
)
# torch.Size([128, 2]) torch.Size([115200, 1]) torch.Size([115200, 1])
print(packed_info.shape, t_starts.shape, t_ends.shape)
# Unpack per-ray info to per-sample info.
ray_indices = unpack_to_ray_indices(packed_info)
# torch.Size([115200]) torch.int64
print(ray_indices.shape, ray_indices.dtype)
"""
if packed_info.is_cuda:
ray_indices = _C.unpack_to_ray_indices(packed_info.contiguous())
else:
raise NotImplementedError("Only support cuda inputs.")
return ray_indices.long()
@torch.no_grad()
def ray_marching(
# rays
rays_o: Tensor,
rays_d: Tensor,
t_min: Optional[Tensor] = None,
t_max: Optional[Tensor] = None,
# bounding box of the scene
scene_aabb: Optional[Tensor] = None,
# binarized grid for skipping empty space
grid: Optional[Grid] = None,
# sigma function for skipping invisible space
sigma_fn: Optional[Callable] = None,
early_stop_eps: float = 1e-4,
# rendering options
near_plane: Optional[float] = None,
far_plane: Optional[float] = None,
render_step_size: float = 1e-3,
stratified: bool = False,
cone_angle: float = 0.0,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Ray marching with space skipping.
Note:
The logic for computing `t_min` and `t_max`:
1. If `t_min` and `t_max` are given, use them with highest priority.
2. If `t_min` and `t_max` are not given, but `scene_aabb` is given, use \
:func:`ray_aabb_intersect` to compute `t_min` and `t_max`.
3. If `t_min` and `t_max` are not given, and `scene_aabb` is not given, \
set `t_min` to 0.0, and `t_max` to 1e10. (the case of unbounded scene)
4. Always clip `t_min` with `near_plane` and `t_max` with `far_plane` if given.
Warning:
This function is not differentiable to any inputs.
Args:
rays_o: Ray origins of shape (n_rays, 3).
rays_d: Normalized ray directions of shape (n_rays, 3).
t_min: Optional. Per-ray minimum distance. Tensor with shape (n_rays).
t_max: Optional. Per-ray maximum distance. Tensor with shape (n_rays).
scene_aabb: Optional. Scene bounding box for computing t_min and t_max.
A tensor with shape (6,) {xmin, ymin, zmin, xmax, ymax, zmax}.
`scene_aabb` will be ignored if both `t_min` and `t_max` are provided.
grid: Optional. Grid that indicates where to skip during marching.
See :class:`nerfacc.Grid` for details.
sigma_fn: Optional. If provided, the marching will skip the invisible space
by evaluating the density along the ray with `sigma_fn`. It should be a
function that takes in samples {t_starts (N, 1), t_ends (N, 1),
ray indices (N,)} and returns the post-activation density values (N, 1).
early_stop_eps: Early stop threshold for skipping invisible space. Default: 1e-4.
near_plane: Optional. Near plane distance. If provided, it will be used
to clip t_min.
far_plane: Optional. Far plane distance. If provided, it will be used
to clip t_max.
render_step_size: Step size for marching. Default: 1e-3.
stratified: Whether to use stratified sampling. Default: False.
cone_angle: Cone angle for linearly-increased step size. 0. means
constant step size. Default: 0.0.
Returns:
A tuple of tensors.
- **packed_info**: Stores information on which samples belong to the same ray. \
Tensor with shape (n_rays, 2). The first column stores the index of the \
first sample of each ray. The second column stores the number of samples \
of each ray.
- **t_starts**: Per-sample start distance. Tensor with shape (n_samples, 1).
- **t_ends**: Per-sample end distance. Tensor with shape (n_samples, 1).
Examples:
.. code-block:: python
import torch
from nerfacc import OccupancyGrid, ray_marching, unpack_to_ray_indices
device = "cuda:0"
batch_size = 128
rays_o = torch.rand((batch_size, 3), device=device)
rays_d = torch.randn((batch_size, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
# Ray marching with near far plane.
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
)
# Ray marching with aabb.
scene_aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device=device)
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, scene_aabb=scene_aabb, render_step_size=1e-3
)
# Ray marching with per-ray t_min and t_max.
t_min = torch.zeros((batch_size,), device=device)
t_max = torch.ones((batch_size,), device=device)
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, t_min=t_min, t_max=t_max, render_step_size=1e-3
)
# Ray marching with aabb and skip areas based on occupancy grid.
scene_aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device=device)
grid = OccupancyGrid(roi_aabb=[0.0, 0.0, 0.0, 0.5, 0.5, 0.5]).to(device)
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, scene_aabb=scene_aabb, grid=grid, render_step_size=1e-3
)
# Convert t_starts and t_ends to sample locations.
ray_indices = unpack_to_ray_indices(packed_info)
t_mid = (t_starts + t_ends) / 2.0
sample_locs = rays_o[ray_indices] + t_mid * rays_d[ray_indices]
"""
if not rays_o.is_cuda:
raise NotImplementedError("Only support cuda inputs.")
# logic for t_min and t_max:
# 1. if t_min and t_max are given, use them with highest priority.
# 2. if t_min and t_max are not given, but scene_aabb is given, use
# ray_aabb_intersect to compute t_min and t_max.
# 3. if t_min and t_max are not given, and scene_aabb is not given,
# set t_min to 0.0, and t_max to 1e10. (the case of unbounded scene)
# 4. always clip t_min with near_plane and t_max with far_plane if given.
if t_min is None or t_max is None:
if scene_aabb is not None:
t_min, t_max = ray_aabb_intersect(rays_o, rays_d, scene_aabb)
else:
t_min = torch.zeros_like(rays_o[..., 0])
t_max = torch.ones_like(rays_o[..., 0]) * 1e10
if near_plane is not None:
t_min = torch.clamp(t_min, min=near_plane)
if far_plane is not None:
t_max = torch.clamp(t_max, max=far_plane)
# stratified sampling: jitter each ray's start by up to one step so sample
# locations vary across iterations (prevents overfitting during training)
if stratified:
t_min = t_min + torch.rand_like(t_min) * render_step_size
# use grid for skipping if given
if grid is not None:
grid_roi_aabb = grid.roi_aabb
grid_binary = grid.binary
contraction_type = _C.ContractionType(grid.contraction_type.value)
else:
grid_roi_aabb = torch.tensor(
[-1e10, -1e10, -1e10, 1e10, 1e10, 1e10],
dtype=torch.float32,
device=rays_o.device,
)
grid_binary = torch.ones(
[1, 1, 1], dtype=torch.bool, device=rays_o.device
)
contraction_type = _C.ContractionType.AABB
# marching with grid-based skipping
packed_info, t_starts, t_ends = _C.ray_marching(
# rays
rays_o.contiguous(),
rays_d.contiguous(),
t_min.contiguous(),
t_max.contiguous(),
# contraction and grid
grid_roi_aabb.contiguous(),
grid_binary.contiguous(),
contraction_type,
# sampling
render_step_size,
cone_angle,
)
# skip invisible space
if sigma_fn is not None:
# Query sigma without gradients
ray_indices = unpack_to_ray_indices(packed_info)
sigmas = sigma_fn(t_starts, t_ends, ray_indices)
assert (
sigmas.shape == t_starts.shape
), "sigmas must have shape of (N, 1)! Got {}".format(sigmas.shape)
alphas = 1.0 - torch.exp(-sigmas * (t_ends - t_starts))
# Compute visibility of the samples, and filter out invisible samples
visibility, packed_info_visible = render_visibility(
packed_info, alphas, early_stop_eps
)
t_starts, t_ends = t_starts[visibility], t_ends[visibility]
packed_info = packed_info_visible
return packed_info, t_starts, t_ends
""" Volumetric rendering utilities. """
from typing import Any, List, Optional, Tuple
import torch
from torch import Tensor
import nerfacc.cuda as nerfacc_cuda
@torch.no_grad()
def ray_aabb_intersect(
rays_o: Tensor, rays_d: Tensor, aabb: Tensor
) -> Tuple[Tensor, Tensor]:
"""Ray AABB Test.
Note: this function is not differentiable to inputs.
Args:
rays_o: Ray origins. Tensor with shape (n_rays, 3).
rays_d: Normalized ray directions. Tensor with shape (n_rays, 3).
aabb: Scene bounding box {xmin, ymin, zmin, xmax, ymax, zmax}. \
Tensor with shape (6)
Returns:
Ray AABB intersection {t_min, t_max} with shape (n_rays) respectively. \
Note that t_min is clipped to a minimum of zero. A value of 1e10 means no intersection.
"""
if rays_o.is_cuda and rays_d.is_cuda and aabb.is_cuda:
rays_o = rays_o.contiguous()
rays_d = rays_d.contiguous()
aabb = aabb.contiguous()
t_min, t_max = nerfacc_cuda.ray_aabb_intersect(rays_o, rays_d, aabb)
else:
raise NotImplementedError("Only support cuda inputs.")
return t_min, t_max
@torch.no_grad()
def volumetric_marching(
rays_o: Tensor,
rays_d: Tensor,
aabb: Tensor,
scene_resolution: List[int],
scene_occ_binary: Tensor,
t_min: Optional[Tensor] = None,
t_max: Optional[Tensor] = None,
render_step_size: float = 1e-3,
near_plane: float = 0.0,
stratified: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Volumetric marching with occupancy test.
Note: this function is not differentiable to inputs.
Args:
rays_o: Ray origins. Tensor with shape (n_rays, 3).
rays_d: Normalized ray directions. Tensor with shape (n_rays, 3).
aabb: Scene bounding box {xmin, ymin, zmin, xmax, ymax, zmax}. \
Tensor with shape (6)
scene_resolution: Shape of the `scene_occ_binary`. {resx, resy, resz}.
scene_occ_binary: Scene occupancy binary field. BoolTensor with \
shape (resx * resy * resz)
t_min: Optional. Ray near planes. Tensor with shape (n_ray,). \
If not given it will be calculated using aabb test. Default is None.
t_max: Optional. Ray far planes. Tensor with shape (n_ray,). \
If not given it will be calculated using aabb test. Default is None.
render_step_size: Marching step size. Default is 1e-3.
near_plane: Near plane of the camera. Default is 0.0.
stratified: Whether to use stratified sampling. Default is False.
Returns:
A tuple of tensors containing
- **packed_info**: Stores information on which samples belong to the same ray. \
It is a tensor with shape (n_rays, 2). For each ray, the two values \
indicate the start index and the number of samples for this ray, \
respectively.
- **frustum_starts**: Where each frustum sample starts along the ray. Tensor with shape (n_samples, 1).
- **frustum_ends**: Where each frustum sample ends along the ray. Tensor with shape (n_samples, 1).
"""
if not rays_o.is_cuda:
raise NotImplementedError("Only support cuda inputs.")
if t_min is None or t_max is None:
t_min, t_max = ray_aabb_intersect(rays_o, rays_d, aabb)
if near_plane > 0.0:
t_min = torch.clamp(t_min, min=near_plane)
assert (
scene_occ_binary.numel()
== scene_resolution[0] * scene_resolution[1] * scene_resolution[2]
), f"Shape {scene_occ_binary.shape} is not right!"
if stratified:
t_min = t_min + torch.rand_like(t_min) * render_step_size
(
packed_info,
frustum_starts,
frustum_ends,
) = nerfacc_cuda.volumetric_marching(
# rays
rays_o.contiguous(),
rays_d.contiguous(),
t_min.contiguous(),
t_max.contiguous(),
# density grid
aabb.contiguous(),
scene_resolution,
scene_occ_binary.contiguous(),
# sampling
render_step_size,
)
return packed_info, frustum_starts, frustum_ends
@torch.no_grad()
def volumetric_rendering_steps(
packed_info: Tensor,
sigmas: Tensor,
frustum_starts: Tensor,
frustum_ends: Tensor,
*args,
) -> Tuple[Tensor, ...]:
"""Compute rendering marching steps.
This function will compact the samples by terminating the marching once the \
accumulated opacity reaches 0.9999. It is recommended that, before running your \
network with gradients enabled, you first run this function without gradients \
(torch.no_grad()) to quickly filter out some samples.
Note: this function is not differentiable to inputs.
Args:
packed_info: Stores information on which samples belong to the same ray. \
See volumetric_marching for details. Tensor with shape (n_rays, 2).
sigmas: Densities at those samples. Tensor with shape (n_samples, 1).
frustum_starts: Where the frustum-shape sample starts along a ray. Tensor with \
shape (n_samples, 1).
frustum_ends: Where the frustum-shape sample ends along a ray. Tensor with \
shape (n_samples, 1).
Returns:
A tuple of tensors containing
- **compact_packed_info**: Compacted version of input packed_info.
- **compact_frustum_starts**: Compacted version of input frustum_starts.
- **compact_frustum_ends**: Compacted version of input frustum_ends.
"""
if (
packed_info.is_cuda
and frustum_starts.is_cuda
and frustum_ends.is_cuda
and sigmas.is_cuda
):
packed_info = packed_info.contiguous()
frustum_starts = frustum_starts.contiguous()
frustum_ends = frustum_ends.contiguous()
sigmas = sigmas.contiguous()
(
compact_packed_info,
compact_selector,
) = nerfacc_cuda.volumetric_rendering_steps(
packed_info, frustum_starts, frustum_ends, sigmas
)
compact_frustum_starts = frustum_starts[compact_selector]
compact_frustum_ends = frustum_ends[compact_selector]
extras = (arg[compact_selector] for arg in args)
else:
raise NotImplementedError("Only support cuda inputs.")
return (
compact_packed_info,
compact_frustum_starts,
compact_frustum_ends,
*extras,
)
def volumetric_rendering_weights(
packed_info: Tensor,
sigmas: Tensor,
frustum_starts: Tensor,
frustum_ends: Tensor,
) -> Tensor:
"""Compute weights for volumetric rendering.
Note: this function is only differentiable to `sigmas`.
Args:
packed_info: Stores information on which samples belong to the same ray. \
See ``volumetric_marching`` for details. Tensor with shape (n_rays, 2).
sigmas: Densities at those samples. Tensor with shape (n_samples, 1).
frustum_starts: Where the frustum-shape sample starts along a ray. Tensor with \
shape (n_samples, 1).
frustum_ends: Where the frustum-shape sample ends along a ray. Tensor with \
shape (n_samples, 1).
Returns:
Volumetric rendering weights for those samples. Tensor with shape (n_samples).
"""
if (
packed_info.is_cuda
and frustum_starts.is_cuda
and frustum_ends.is_cuda
and sigmas.is_cuda
):
packed_info = packed_info.contiguous()
frustum_starts = frustum_starts.contiguous()
frustum_ends = frustum_ends.contiguous()
sigmas = sigmas.contiguous()
weights = _VolumetricRenderingWeights.apply(
packed_info, frustum_starts, frustum_ends, sigmas
)
else:
raise NotImplementedError("Only support cuda inputs.")
return weights
def volumetric_rendering_accumulate(
weights: Tensor,
ray_indices: Tensor,
values: Optional[Tensor] = None,
n_rays: Optional[int] = None,
) -> Tensor:
"""Accumulate volumetric values along the ray.
Note: this function is only differentiable to weights and values.
Args:
weights: Volumetric rendering weights for those samples. Tensor with shape \
(n_samples).
ray_indices: Ray index of each sample. IntTensor with shape (n_samples).
values: The values to be accumulated. Tensor with shape (n_samples, D). If \
None, the accumulated values are just weights. Default is None.
n_rays: Total number of rays. This will decide the shape of the outputs. If \
None, it will be inferred from `ray_indices.max() + 1`. If specified, \
it must be larger than `ray_indices.max()`. Default is None.
Returns:
Accumulated values with shape (n_rays, D). If `values` is not given then we return \
the accumulated weights, in which case D == 1.
"""
assert ray_indices.dim() == 1 and weights.dim() == 1
if not weights.is_cuda:
raise NotImplementedError("Only support cuda inputs.")
if values is not None:
assert values.dim() == 2 and values.shape[0] == weights.shape[0]
src = weights[:, None] * values
else:
src = weights[:, None]
if ray_indices.numel() == 0:
assert n_rays is not None
return torch.zeros((n_rays, src.shape[-1]), device=weights.device)
if n_rays is None:
n_rays = int(ray_indices.max()) + 1
else:
assert n_rays > ray_indices.max()
index = ray_indices[:, None].long().expand(-1, src.shape[-1])
outputs = torch.zeros((n_rays, src.shape[-1]), device=weights.device)
outputs.scatter_add_(0, index, src)
return outputs
@torch.no_grad()
def unpack_to_ray_indices(packed_info: Tensor) -> Tensor:
"""Unpack `packed_info` to ray indices. Useful for converting per ray data to per sample data.
Note: this function is not differentiable to inputs.
Args:
packed_info: Stores information on which samples belong to the same ray. \
See ``volumetric_marching`` for details. Tensor with shape (n_rays, 2).
Returns:
Ray index of each sample. IntTensor with shape (n_samples).
"""
if packed_info.is_cuda:
packed_info = packed_info.contiguous()
ray_indices = nerfacc_cuda.unpack_to_ray_indices(packed_info)
else:
raise NotImplementedError("Only support cuda inputs.")
return ray_indices
class _VolumetricRenderingWeights(torch.autograd.Function):
@staticmethod
def forward(
ctx, packed_info, frustum_starts, frustum_ends, sigmas
): # pylint: disable=arguments-differ
weights = nerfacc_cuda.volumetric_rendering_weights_forward(
packed_info, frustum_starts, frustum_ends, sigmas
)
ctx.save_for_backward(
packed_info,
frustum_starts,
frustum_ends,
sigmas,
weights,
)
return weights
@staticmethod
def backward(ctx, grad_weights): # pylint: disable=arguments-differ
(
packed_info,
frustum_starts,
frustum_ends,
sigmas,
weights,
) = ctx.saved_tensors
grad_sigmas = nerfacc_cuda.volumetric_rendering_weights_backward(
weights,
grad_weights,
packed_info,
frustum_starts,
frustum_ends,
sigmas,
)
return None, None, None, grad_sigmas
@staticmethod
def jvp(ctx: Any, *grad_inputs: Any) -> Any:
raise NotImplementedError("Not implemented.")
from typing import Optional, Tuple
import torch
from torch import Tensor
import nerfacc.cuda as _C
def accumulate_along_rays(
weights: Tensor,
ray_indices: Tensor,
values: Optional[Tensor] = None,
n_rays: Optional[int] = None,
) -> Tensor:
"""Accumulate volumetric values along the ray.
Note:
This function is only differentiable to `weights` and `values`.
Args:
weights: Volumetric rendering weights for those samples. Tensor with shape \
(n_samples,).
ray_indices: Ray index of each sample. IntTensor with shape (n_samples). \
It can be obtained from `unpack_to_ray_indices(packed_info)`.
values: The values to be accumulated. Tensor with shape (n_samples, D). If \
None, the accumulated values are just weights. Default is None.
n_rays: Total number of rays. This will decide the shape of the outputs. If \
None, it will be inferred from `ray_indices.max() + 1`. If specified, \
it must be larger than `ray_indices.max()`. Default is None.
Returns:
Accumulated values with shape (n_rays, D). If `values` is not given then we return \
the accumulated weights, in which case D == 1.
Examples:
.. code-block:: python
# Rendering: accumulate rgbs, opacities, and depths along the rays.
colors = accumulate_along_rays(weights, ray_indices, values=rgbs, n_rays=n_rays)
opacities = accumulate_along_rays(weights, ray_indices, values=None, n_rays=n_rays)
depths = accumulate_along_rays(
weights,
ray_indices,
values=(t_starts + t_ends) / 2.0,
n_rays=n_rays,
)
# (n_rays, 3), (n_rays, 1), (n_rays, 1)
print(colors.shape, opacities.shape, depths.shape)
"""
assert ray_indices.dim() == 1 and weights.dim() == 1
if not weights.is_cuda:
raise NotImplementedError("Only support cuda inputs.")
if values is not None:
assert (
values.dim() == 2 and values.shape[0] == weights.shape[0]
), "Invalid shapes: {} vs {}".format(values.shape, weights.shape)
src = weights[:, None] * values
else:
src = weights[:, None]
if ray_indices.numel() == 0:
assert n_rays is not None
return torch.zeros((n_rays, src.shape[-1]), device=weights.device)
if n_rays is None:
n_rays = int(ray_indices.max()) + 1
else:
assert n_rays > ray_indices.max()
ray_indices = ray_indices.int()
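# Scatter-add each weighted sample into the output slot of its ray.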
index = ray_indices[:, None].long().expand(-1, src.shape[-1])
outputs = torch.zeros((n_rays, src.shape[-1]), device=weights.device)
outputs.scatter_add_(0, index, src)
return outputs
def render_weight_from_density(
packed_info,
t_starts,
t_ends,
sigmas,
early_stop_eps: float = 1e-4,
) -> torch.Tensor:
"""Compute transmittance weights from density.
Args:
packed_info: Stores information on which samples belong to the same ray. \
See :func:`nerfacc.ray_marching` for details. Tensor with shape (n_rays, 2).
t_starts: Where the frustum-shape sample starts along a ray. Tensor with \
shape (n_samples, 1).
t_ends: Where the frustum-shape sample ends along a ray. Tensor with \
shape (n_samples, 1).
sigmas: The density values of the samples. Tensor with shape (n_samples, 1).
early_stop_eps: The epsilon value for early stopping. Default is 1e-4.
Returns:
transmittance weights with shape (n_samples,).
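Each weight follows standard volume rendering: \
`w_i = T_i * (1 - exp(-sigmas_i * (t_ends_i - t_starts_i)))`, where \
`T_i = exp(-sum_{j < i} sigmas_j * (t_ends_j - t_starts_j))` is the \
transmittance accumulated along the ray up to sample `i`.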
Examples:
.. code-block:: python
rays_o = torch.rand((128, 3), device="cuda:0")
rays_d = torch.randn((128, 3), device="cuda:0")
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
# Ray marching with near far plane.
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
)
# pseudo density
sigmas = torch.rand((t_starts.shape[0], 1), device="cuda:0")
# Rendering: compute weights and ray indices.
weights = render_weight_from_density(
packed_info, t_starts, t_ends, sigmas, early_stop_eps=1e-4
)
# torch.Size([115200, 1]) torch.Size([115200])
print(sigmas.shape, weights.shape)
"""
if not sigmas.is_cuda:
raise NotImplementedError("Only support cuda inputs.")
weights = _RenderingDensity.apply(
packed_info, t_starts, t_ends, sigmas, early_stop_eps
)
return weights
def render_weight_from_alpha(
packed_info,
alphas,
early_stop_eps: float = 1e-4,
) -> torch.Tensor:
"""Compute transmittance weights from alpha (opacity).
Args:
packed_info: Stores information on which samples belong to the same ray. \
See :func:`nerfacc.ray_marching` for details. Tensor with shape (n_rays, 2).
alphas: The opacity values of the samples. Tensor with shape (n_samples, 1).
early_stop_eps: The epsilon value for early stopping. Default is 1e-4.
Returns:
transmittance weights with shape (n_samples,).
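Each weight is `w_i = T_i * alphas_i`, where \
`T_i = prod_{j < i} (1 - alphas_j)` is the transmittance accumulated \
along the ray up to sample `i`.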
Examples:
.. code-block:: python
rays_o = torch.rand((128, 3), device="cuda:0")
rays_d = torch.randn((128, 3), device="cuda:0")
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
# Ray marching with near far plane.
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
)
# pseudo opacity
alphas = torch.rand((t_starts.shape[0], 1), device="cuda:0")
# Rendering: compute weights and ray indices.
weights = render_weight_from_alpha(
packed_info, alphas, early_stop_eps=1e-4
)
# torch.Size([115200, 1]) torch.Size([115200])
print(alphas.shape, weights.shape)
"""
if not alphas.is_cuda:
raise NotImplementedError("Only support cuda inputs.")
weights = _RenderingAlpha.apply(packed_info, alphas, early_stop_eps)
return weights
@torch.no_grad()
def render_visibility(
packed_info: torch.Tensor,
alphas: torch.Tensor,
early_stop_eps: float = 1e-4,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Filter out invisible samples given alpha (opacity).
Args:
packed_info: Stores information on which samples belong to the same ray. \
See :func:`nerfacc.ray_marching` for details. Tensor with shape (n_rays, 2).
alphas: The opacity values of the samples. Tensor with shape (n_samples, 1).
early_stop_eps: The epsilon value for early stopping. Default is 1e-4.
Returns:
A tuple of tensors.
- **visibility**: The visibility mask for samples. Boolean tensor of shape \
(n_samples,).
- **packed_info_visible**: The new packed_info for visible samples. \
Tensor shape (n_rays, 2). It should be used if you use the visibility \
mask to filter out invisible samples.
Examples:
.. code-block:: python
rays_o = torch.rand((128, 3), device="cuda:0")
rays_d = torch.randn((128, 3), device="cuda:0")
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
# Ray marching with near far plane.
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
)
# pesudo opacity
alphas = torch.rand((t_starts.shape[0], 1), device="cuda:0")
# Rendering but only for computing visibility of each sample.
visibility, packed_info_visible = render_visibility(
packed_info, alphas, early_stop_eps=1e-4
)
t_starts_visible = t_starts[visibility]
t_ends_visible = t_ends[visibility]
# torch.Size([115200, 1]) torch.Size([1283, 1])
print(t_starts.shape, t_starts_visible.shape)
"""
visibility, packed_info_visible = _C.rendering_alphas_forward(
packed_info.contiguous(),
alphas.contiguous(),
early_stop_eps,
True, # compute visibility instead of weights
)
return visibility, packed_info_visible
class _RenderingDensity(torch.autograd.Function):
"""Rendering transmittance weights from density."""
@staticmethod
def forward(
ctx,
packed_info,
t_starts,
t_ends,
sigmas,
early_stop_eps: float = 1e-4,
):
packed_info = packed_info.contiguous()
t_starts = t_starts.contiguous()
t_ends = t_ends.contiguous()
sigmas = sigmas.contiguous()
weights = _C.rendering_forward(
packed_info,
t_starts,
t_ends,
sigmas,
early_stop_eps,
False, # not doing filtering
)[0]
if ctx.needs_input_grad[3]: # sigmas
ctx.save_for_backward(
packed_info,
t_starts,
t_ends,
sigmas,
weights,
)
ctx.early_stop_eps = early_stop_eps
return weights
@staticmethod
def backward(ctx, grad_weights):
grad_weights = grad_weights.contiguous()
early_stop_eps = ctx.early_stop_eps
(
packed_info,
t_starts,
t_ends,
sigmas,
weights,
) = ctx.saved_tensors
grad_sigmas = _C.rendering_backward(
weights,
grad_weights,
packed_info,
t_starts,
t_ends,
sigmas,
early_stop_eps,
)
return None, None, None, grad_sigmas, None
class _RenderingAlpha(torch.autograd.Function):
"""Rendering transmittance weights from alpha."""
@staticmethod
def forward(
ctx,
packed_info,
alphas,
early_stop_eps: float = 1e-4,
):
packed_info = packed_info.contiguous()
alphas = alphas.contiguous()
weights = _C.rendering_alphas_forward(
packed_info,
alphas,
early_stop_eps,
False, # not doing filtering
)[0]
if ctx.needs_input_grad[1]: # alphas
ctx.save_for_backward(
packed_info,
alphas,
weights,
)
ctx.early_stop_eps = early_stop_eps
return weights
@staticmethod
def backward(ctx, grad_weights):
grad_weights = grad_weights.contiguous()
early_stop_eps = ctx.early_stop_eps
(
packed_info,
alphas,
weights,
) = ctx.saved_tensors
# Note: assumes the backend exposes `rendering_alphas_backward` as the
# counterpart of the `rendering_alphas_forward` call used in forward().
grad_alphas = _C.rendering_alphas_backward(
weights,
grad_weights,
packed_info,
alphas,
early_stop_eps,
)
return None, grad_alphas, None
""" Full volumetric rendering pipeline. """
from typing import Callable, List, Optional, Tuple
import torch
from .utils import (
unpack_to_ray_indices,
volumetric_marching,
volumetric_rendering_accumulate,
volumetric_rendering_steps,
volumetric_rendering_weights,
)
def volumetric_rendering_pipeline(
sigma_fn: Callable,
rgb_sigma_fn: Callable,
rays_o: torch.Tensor,
rays_d: torch.Tensor,
scene_aabb: torch.Tensor,
scene_resolution: Optional[List[int]] = None,
scene_occ_binary: Optional[torch.Tensor] = None,
render_bkgd: Optional[torch.Tensor] = None,
render_step_size: float = 1e-3,
near_plane: float = 0.0,
stratified: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, int, int]:
"""Differentiable volumetric rendering pipeline.
This function is the integration of those individual functions:
- ray_aabb_intersect
- volumetric_marching
- volumetric_rendering_steps
- volumetric_rendering_weights
- volumetric_rendering_accumulate
Args:
sigma_fn: A function that takes in the {frustum starts (N, 1), frustum ends (N, 1), and
ray indices (N,)} and returns the post-activation sigma values (N, 1).
rgb_sigma_fn: A function that takes in the {frustum starts (N, 1), frustum ends (N, 1), and
ray indices (N,)} and returns the post-activation rgb values (N, 3) and sigma values (N, 1).
rays_o: The origin of the rays (n_rays, 3).
rays_d: The normalized direction of the rays (n_rays, 3).
scene_aabb: The scene axis-aligned bounding box {xmin, ymin, zmin, xmax, ymax, zmax}.
scene_resolution: The scene resolution (3,). Defaults to None.
scene_occ_binary: The scene occupancy binary tensor used to skip samples (n_cells,). Defaults to None.
render_bkgd: The background color (3,). Default: None.
render_step_size: The step size for the volumetric rendering. Default: 1e-3.
near_plane: The near plane for the volumetric rendering. Default: 0.0.
stratified: Whether to use stratified sampling. Default: False.
Returns:
Ray colors (n_rays, 3), and opacities (n_rays, 1), the number of marching steps, and the number of rendering steps.
"""
n_rays = rays_o.shape[0]
if scene_occ_binary is None:
scene_occ_binary = torch.ones(
(1),
dtype=torch.bool,
device=rays_o.device,
)
scene_resolution = [1, 1, 1]
if scene_resolution is None:
assert scene_occ_binary is not None and scene_occ_binary.dim() == 3
scene_resolution = scene_occ_binary.shape
rays_o = rays_o.contiguous()
rays_d = rays_d.contiguous()
scene_aabb = scene_aabb.contiguous()
scene_occ_binary = scene_occ_binary.contiguous()
with torch.no_grad():
# Ray marching and occupancy check.
assert scene_resolution is not None
packed_info, frustum_starts, frustum_ends = volumetric_marching(
rays_o,
rays_d,
aabb=scene_aabb,
scene_resolution=scene_resolution,
scene_occ_binary=scene_occ_binary,
render_step_size=render_step_size,
near_plane=near_plane,
stratified=stratified,
)
n_marching_samples = frustum_starts.shape[0]
ray_indices = unpack_to_ray_indices(packed_info)
# Query sigma without gradients
sigmas = sigma_fn(frustum_starts, frustum_ends, ray_indices)
# Ray marching and rendering check.
packed_info, frustum_starts, frustum_ends = volumetric_rendering_steps(
packed_info,
sigmas,
frustum_starts,
frustum_ends,
)
n_rendering_samples = frustum_starts.shape[0]
ray_indices = unpack_to_ray_indices(packed_info)
# Query sigma and color with gradients
rgbs, sigmas = rgb_sigma_fn(frustum_starts, frustum_ends, ray_indices)
assert rgbs.shape[-1] == 3, f"rgbs must have 3 channels, got {rgbs.shape}"
assert (
sigmas.shape[-1] == 1
), f"sigmas must have 1 channel, got {sigmas.shape}"
# Rendering: compute weights and ray indices.
weights = volumetric_rendering_weights(
packed_info, sigmas, frustum_starts, frustum_ends
)
# Rendering: accumulate rgbs and opacities along the rays.
colors = volumetric_rendering_accumulate(
weights, ray_indices, values=rgbs, n_rays=n_rays
)
opacities = volumetric_rendering_accumulate(
weights, ray_indices, values=None, n_rays=n_rays
)
# depths = volumetric_rendering_accumulate(
# weights,
# ray_indices,
# values=(frustum_starts + frustum_ends) / 2.0,
# n_rays=n_rays,
# )
if render_bkgd is not None:
render_bkgd = render_bkgd.contiguous()
colors = colors + render_bkgd * (1.0 - opacities)
return colors, opacities, n_marching_samples, n_rendering_samples
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "nerfacc"
version = "0.0.9"
version = "0.1.0"
authors = [{name = "Ruilong", email = "ruilongli94@gmail.com"}]
license = { text="MIT" }
requires-python = ">=3.8"
@@ -35,6 +35,11 @@ dev = [
[tool.black]
line-length = 80
[tool.isort]
multi_line_output = 3
line_length = 80
include_trailing_comma = true
# pylint
[tool.pylint.messages_control]
max-line-length = 80
import torch
import tqdm
from nerfacc import volumetric_rendering_pipeline
device = "cuda:0"
def sigma_fn(frustum_starts, frustum_ends, ray_indices):
return torch.rand_like(frustum_ends[:, :1])
def rgb_sigma_fn(frustum_starts, frustum_ends, ray_indices):
return torch.rand(
(frustum_ends.shape[0], 3), device=device
), torch.rand_like(frustum_ends)
def test_rendering():
scene_aabb = torch.tensor([0, 0, 0, 1, 1, 1], device=device).float()
scene_resolution = [128, 128, 128]
scene_occ_binary = torch.ones((128 * 128 * 128), device=device).bool()
rays_o = torch.rand((10000, 3), device=device)
rays_d = torch.randn((10000, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
render_bkgd = torch.ones(3, device=device)
for step in tqdm.tqdm(range(1000)):
volumetric_rendering_pipeline(
sigma_fn,
rgb_sigma_fn,
rays_o,
rays_d,
scene_aabb,
scene_resolution,
scene_occ_binary,
render_bkgd,
render_step_size=1e-3,
near_plane=0.0,
stratified=False,
)
if __name__ == "__main__":
test_rendering()
import pytest
import torch
from nerfacc.contraction import ContractionType, contract, contract_inv
device = "cuda:0"
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_identity():
samples = torch.rand([128, 3], device=device)
roi = torch.tensor([0, 0, 0, 1, 1, 1], dtype=torch.float32, device=device)
samples_out = contract(samples, roi=roi)
assert torch.allclose(samples_out, samples)
samples_inv = contract_inv(samples_out, roi=roi)
assert torch.allclose(samples_inv, samples)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_normalization():
samples = torch.rand([128, 3], device=device)
roi = torch.tensor(
[-1, -1, -1, 1, 1, 1], dtype=torch.float32, device=device
)
samples_out = contract(samples, roi=roi)
assert torch.allclose(samples_out, samples * 0.5 + 0.5)
samples_inv = contract_inv(samples_out, roi=roi)
assert torch.allclose(samples_inv, samples, atol=1e-6)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_contract():
x = torch.rand([128, 3], device=device)
roi = torch.tensor(
[0.2, 0.3, 0.4, 0.7, 0.8, 0.6], dtype=torch.float32, device=device
)
for type in [
ContractionType.UN_BOUNDED_SPHERE,
ContractionType.UN_BOUNDED_TANH,
]:
x_unit = contract(x, roi=roi, type=type)
assert x_unit.max() <= 1 and x_unit.min() >= 0
x_inv = contract_inv(x_unit, roi=roi, type=type)
assert torch.allclose(x_inv, x, atol=1e-3)
if __name__ == "__main__":
test_identity()
test_normalization()
test_contract()
import pytest
import torch
from nerfacc.contraction import ContractionType
from nerfacc.grid import OccupancyGrid
device = "cuda:0"
def occ_eval_fn(x: torch.Tensor) -> torch.Tensor:
"""Pseudo occupancy function: (N, 3) -> (N, 1)."""
return torch.rand_like(x[:, :1])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_occ_grid():
occ_grid = OccupancyGrid(roi_aabb=[0, 0, 0, 1, 1, 1], resolution=128).to(
device
)
occ_grid.every_n_step(0, occ_eval_fn, occ_thre=0.1)
assert occ_grid.roi_aabb.shape == (6,)
assert occ_grid.binary.shape == (128, 128, 128)
if __name__ == "__main__":
test_occ_grid()
import torch
import tqdm
from nerfacc import volumetric_marching
device = "cuda:0"
def test_marching():
torch.manual_seed(42)
scene_aabb = torch.tensor([0, 0, 0, 1, 1, 1], device=device).float()
scene_occ_binary = torch.rand((128 * 128 * 128), device=device) > 0.5
rays_o = torch.rand((10000, 3), device=device)
rays_d = torch.randn((10000, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
for step in tqdm.tqdm(range(5000)):
volumetric_marching(
rays_o,
rays_d,
aabb=scene_aabb,
scene_resolution=[128, 128, 128],
scene_occ_binary=scene_occ_binary,
)
if __name__ == "__main__":
test_marching()
import torch
import tqdm
from nerfacc import OccupancyField
device = "cuda:0"
def occ_eval_fn(positions: torch.Tensor) -> torch.Tensor:
return torch.rand_like(positions[:, :1])
def test_occ_field():
occ_field = OccupancyField(occ_eval_fn, aabb=[0, 0, 0, 1, 1, 1]).to(device)
for step in tqdm.tqdm(range(50000)):
occ_field.every_n_step(step, occ_thre=0.1)
if __name__ == "__main__":
test_occ_field()
import pytest
import torch
from nerfacc.grid import OccupancyGrid
from nerfacc.ray_marching import ray_marching, unpack_to_ray_indices
device = "cuda:0"
batch_size = 128
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_marching_with_near_far():
rays_o = torch.rand((batch_size, 3), device=device)
rays_d = torch.randn((batch_size, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
packed_info, t_starts, t_ends = ray_marching(
rays_o,
rays_d,
near_plane=0.1,
far_plane=1.0,
render_step_size=1e-3,
)
return
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_marching_with_grid():
rays_o = torch.rand((batch_size, 3), device=device)
rays_d = torch.randn((batch_size, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
grid = OccupancyGrid(roi_aabb=[0, 0, 0, 1, 1, 1]).to(device)
grid._binary[:] = True
packed_info, t_starts, t_ends = ray_marching(
rays_o,
rays_d,
grid=grid,
near_plane=0.0,
far_plane=1.0,
render_step_size=1e-2,
)
ray_indices = unpack_to_ray_indices(packed_info).long()
samples = (
rays_o[ray_indices] + rays_d[ray_indices] * (t_starts + t_ends) / 2.0
)
assert (samples <= grid.roi_aabb[3:].unsqueeze(0)).all()
assert (samples >= grid.roi_aabb[:3].unsqueeze(0)).all()
return
if __name__ == "__main__":
test_marching_with_near_far()
test_marching_with_grid()
import pytest
import torch
import tqdm
from nerfacc import (
unpack_to_ray_indices,
volumetric_marching,
volumetric_rendering_accumulate,
volumetric_rendering_steps,
volumetric_rendering_weights,
)
from nerfacc.ray_marching import ray_marching
from nerfacc.vol_rendering import render_weight_from_density
device = "cuda:0"
batch_size = 128
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_transmittance_compress():
scene_aabb = torch.tensor([0, 0, 0, 1, 1, 1], device=device).float()
scene_occ_binary = torch.ones((128 * 128 * 128), device=device).bool()
rays_o = torch.rand((batch_size, 3), device=device)
rays_d = torch.randn((batch_size, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
for step in tqdm.tqdm(range(1000)):
(packed_info, frustum_starts, frustum_ends,) = volumetric_marching(
rays_o,
rays_d,
aabb=scene_aabb,
scene_resolution=[128, 128, 128],
scene_occ_binary=scene_occ_binary,
)
sigmas = torch.rand_like(frustum_ends[:, :1], requires_grad=True) * 100
(
packed_info,
frustum_starts,
frustum_ends,
) = volumetric_rendering_steps(
packed_info,
sigmas,
frustum_starts,
frustum_ends,
)
ray_indices = unpack_to_ray_indices(packed_info)
sigmas = torch.rand_like(frustum_ends[:, :1], requires_grad=True) * 100
values = torch.rand_like(frustum_starts, requires_grad=True)
weights = volumetric_rendering_weights(
packed_info,
sigmas,
frustum_starts,
frustum_ends,
)
accum_values = volumetric_rendering_accumulate(
weights,
ray_indices,
values,
n_rays=rays_o.shape[0],
)
accum_values.sum().backward()
packed_info, t_starts, t_ends = ray_marching(
rays_o,
rays_d,
near_plane=0.1,
far_plane=1.0,
render_step_size=1e-2,
)
sigmas = torch.rand_like(t_starts, requires_grad=True)
weights = render_weight_from_density(
packed_info,
t_starts,
t_ends,
sigmas * 1e2,
)
weights.sum().backward()
assert sigmas.grad is not None
if __name__ == "__main__":
test_transmittance_compress()