OpenDAS / nerfacc

Commit 4a4bbaba, authored Sep 12, 2022 by Ruilong Li
Parent: 224ba1a7

    cleanup

Showing 8 changed files with 105 additions and 557 deletions.
Changed files:

- README.md (+2, -2)
- nerfacc/cuda/__init__.py (+1, -72)
- nerfacc/cuda/csrc/pybind.cu (+2, -43)
- nerfacc/cuda/csrc/ray_marching.cu (+0, -1)
- nerfacc/cuda/csrc/vol_rendering.cu (+0, -368)
- nerfacc/cuda/csrc/volumetric_weights.cu (+92, -0)
- nerfacc/utils.py (+1, -1)
- nerfacc/volumetric_rendering.py (+7, -70)
README.md  (view file @ 4a4bbaba)

```diff
@@ -12,9 +12,9 @@ python examples/trainval.py

 | trainval (35k, 1<<16) | Lego  | Mic   | Materials |
 | -                     | -     | -     | -         |
-| Time                  | 325s  | 357s  | 360s      |
+| Time                  | 370s  | 357s  | 360s      |
 | PSNR                  | 36.20 | 36.55 | 29.59     |
-| FPS                   | 12.56 | 25.54 | 9.65      |
+| FPS                   | 11.94 | 25.54 | 9.65      |

 Tested with the default settings on the Lego test set.
```
nerfacc/cuda/__init__.py  (view file @ 4a4bbaba)

```python
import torch
from torch.cuda.amp import custom_bwd, custom_fwd

from ._backend import _C

# ray_aabb_intersect = _C.ray_aabb_intersect
ray_marching = _C.ray_marching
volumetric_rendering_forward = _C.volumetric_rendering_forward
volumetric_rendering_backward = _C.volumetric_rendering_backward
volumetric_rendering_inference = _C.volumetric_rendering_inference
# volumetric_weights_forward = _C.volumetric_weights_forward


class VolumeRenderer(torch.autograd.Function):
    """CUDA Volumetric Renderer"""

    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, packed_info, starts, ends, sigmas, rgbs):
        (
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            mask,
            steps_counter,
        ) = volumetric_rendering_forward(packed_info, starts, ends, sigmas, rgbs)
        ctx.save_for_backward(
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            packed_info,
            starts,
            ends,
            sigmas,
            rgbs,
        )
        return (
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            mask,
            steps_counter,
        )

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_weight, grad_depth, grad_color, _grad_mask, _grad_steps_counter):
        (
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            packed_info,
            starts,
            ends,
            sigmas,
            rgbs,
        ) = ctx.saved_tensors
        grad_sigmas, grad_rgbs = volumetric_rendering_backward(
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            grad_weight,
            grad_depth,
            grad_color,
            packed_info,
            starts,
            ends,
            sigmas,
            rgbs,
        )
        # corresponds to the input argument list of forward()
        return None, None, None, grad_sigmas, grad_rgbs


volumetric_rendering_steps = _C.volumetric_rendering_steps
```
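For orientation, here is a minimal sketch of how this autograd wrapper would be driven. It assumes the compiled `_C` backend is built and a GPU is available; the toy tensors are invented for illustration and follow the `packed_info` convention documented in `vol_rendering.cu` below (one row per ray: ray index, sample start offset, sample count).

```python
import torch
from nerfacc.cuda import VolumeRenderer  # assumes the CUDA extension is built

# Two rays, three samples each, flattened into shared sample arrays.
packed_info = torch.tensor([[0, 0, 3], [1, 3, 3]], dtype=torch.int32, device="cuda")
starts = torch.linspace(0.0, 1.0, 6, device="cuda")[:, None]   # [6, 1]
ends = starts + 0.1                                             # [6, 1]
sigmas = torch.rand(6, 1, device="cuda", requires_grad=True)    # [6, 1]
rgbs = torch.rand(6, 3, device="cuda", requires_grad=True)      # [6, 3]

weight, depth, color, mask, steps = VolumeRenderer.apply(
    packed_info, starts, ends, sigmas, rgbs
)
color.sum().backward()  # gradients flow back to sigmas and rgbs only
```

Only `sigmas` and `rgbs` receive gradients; `backward` returns `None` for the packed indices and the sample bounds.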
nerfacc/cuda/csrc/pybind.cu  (view file @ 4a4bbaba)

```diff
@@ -7,52 +7,13 @@ std::vector<torch::Tensor> ray_aabb_intersect(
    const torch::Tensor aabb);

// std::vector<torch::Tensor> ray_marching(
//     // rays
//     const torch::Tensor rays_o,
//     const torch::Tensor rays_d,
//     const torch::Tensor t_min,
//     const torch::Tensor t_max,
//     // density grid
//     const torch::Tensor aabb,
//     const pybind11::list resolution,
//     const torch::Tensor occ_binary,
//     // sampling
//     const int max_total_samples,
//     const int max_per_ray_samples,
//     const float dt
// );

-std::vector<torch::Tensor> volumetric_rendering_inference(
+std::vector<torch::Tensor> volumetric_rendering_steps(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas);

std::vector<torch::Tensor> volumetric_rendering_forward(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas,
    torch::Tensor rgbs);

std::vector<torch::Tensor> volumetric_rendering_backward(
    torch::Tensor accumulated_weight,
    torch::Tensor accumulated_depth,
    torch::Tensor accumulated_color,
    torch::Tensor grad_weight,
    torch::Tensor grad_depth,
    torch::Tensor grad_color,
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas,
    torch::Tensor rgbs);

std::vector<torch::Tensor> volumetric_weights_forward(
    torch::Tensor packed_info,
    torch::Tensor starts,
...

@@ -87,9 +48,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("ray_aabb_intersect", &ray_aabb_intersect);
    m.def("ray_marching", &ray_marching);
    m.def("volumetric_rendering_inference", &volumetric_rendering_inference);
    m.def("volumetric_rendering_forward", &volumetric_rendering_forward);
    m.def("volumetric_rendering_backward", &volumetric_rendering_backward);
    m.def("volumetric_rendering_steps", &volumetric_rendering_steps);
    m.def("volumetric_weights_forward", &volumetric_weights_forward);
    m.def("volumetric_weights_backward", &volumetric_weights_backward);
}
```
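As a side note (not part of this commit), bindings like these are what end up in the `_C` module imported by `nerfacc/cuda/__init__.py`. A hedged sketch of JIT-compiling them with PyTorch's generic extension loader; the module name and source list here are assumptions, since the package's real build setup is not shown in this diff.

```python
from torch.utils.cpp_extension import load

# Hypothetical JIT build; the package normally ships a prebuilt `_C` backend.
_C = load(
    name="nerfacc_cuda",  # assumed name
    sources=[
        "nerfacc/cuda/csrc/pybind.cu",
        "nerfacc/cuda/csrc/ray_marching.cu",
        "nerfacc/cuda/csrc/volumetric_weights.cu",
    ],
    verbose=True,
)
print(hasattr(_C, "volumetric_rendering_steps"))  # True once compiled
```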
nerfacc/cuda/csrc/ray_marching.cu  (view file @ 4a4bbaba)

```cpp
@@ -61,7 +61,6 @@ inline __device__ float advance_to_next_voxel(
}

__global__ void marching_steps_kernel(
    // rays info
    const uint32_t n_rays,
...
```
nerfacc/cuda/csrc/vol_rendering.cu  (deleted, 100644 → 0; view file @ 224ba1a7)

```cpp
#include "include/helpers_cuda.h"

template <typename scalar_t>
__global__ void volumetric_rendering_inference_kernel(
    const uint32_t n_rays,
    const int *packed_info,   // input ray & point indices.
    const scalar_t *starts,   // input start t
    const scalar_t *ends,     // input end t
    const scalar_t *sigmas,   // input density after activation
    int *compact_packed_info, // output: should be all zero initialized
    int *compact_selector,    // output: should be all zero initialized
    // writable helpers
    int *steps_counter)
{
    CUDA_GET_THREAD_ID(thread_id, n_rays);

    // locate
    const int base = packed_info[thread_id * 2 + 0];  // point idx start.
    const int steps = packed_info[thread_id * 2 + 1]; // point idx shift.
    if (steps == 0)
        return;

    starts += base;
    ends += base;
    sigmas += base;

    // accumulated rendering
    scalar_t T = 1.f;
    scalar_t EPSILON = 1e-4f;

    int j = 0;
    for (; j < steps; ++j)
    {
        if (T < EPSILON)
        {
            break;
        }
        const scalar_t delta = ends[j] - starts[j];
        const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
        const scalar_t weight = alpha * T;
        T *= (1.f - alpha);
    }

    int compact_base = atomicAdd(steps_counter, j);
    compact_selector += compact_base;
    for (int k = 0; k < j; ++k)
    {
        compact_selector[k] = base + k;
    }
    compact_packed_info[thread_id * 2 + 0] = compact_base; // compact point idx start.
    compact_packed_info[thread_id * 2 + 1] = j;            // compact point idx shift.
}

template <typename scalar_t>
__global__ void volumetric_rendering_forward_kernel(
    const uint32_t n_rays,
    const int *packed_info, // input ray & point indices.
    const scalar_t *starts, // input start t
    const scalar_t *ends,   // input end t
    const scalar_t *sigmas, // input density after activation
    const scalar_t *rgbs,   // input rgb after activation
    // should be all-zero initialized
    scalar_t *accumulated_weight, // output
    scalar_t *accumulated_depth,  // output
    scalar_t *accumulated_color,  // output
    bool *mask,                   // output
    // writable helpers
    int *steps_counter)
{
    CUDA_GET_THREAD_ID(thread_id, n_rays);

    // locate
    const int i = packed_info[thread_id * 3 + 0];        // ray idx in {rays_o, rays_d}
    const int base = packed_info[thread_id * 3 + 1];     // point idx start.
    const int numsteps = packed_info[thread_id * 3 + 2]; // point idx shift.
    if (numsteps == 0)
        return;

    starts += base;
    ends += base;
    sigmas += base;
    rgbs += base * 3;
    accumulated_weight += i;
    accumulated_depth += i;
    accumulated_color += i * 3;
    mask += i;

    // accumulated rendering
    scalar_t T = 1.f;
    scalar_t EPSILON = 1e-4f;

    int j = 0;
    for (; j < numsteps; ++j)
    {
        if (T < EPSILON)
        {
            break;
        }
        const scalar_t delta = ends[j] - starts[j];
        const scalar_t t = (ends[j] + starts[j]) * 0.5f;
        const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
        const scalar_t weight = alpha * T;
        accumulated_weight[0] += weight;
        accumulated_depth[0] += weight * t;
        accumulated_color[0] += weight * rgbs[j * 3 + 0];
        accumulated_color[1] += weight * rgbs[j * 3 + 1];
        accumulated_color[2] += weight * rgbs[j * 3 + 2];
        T *= (1.f - alpha);
    }
    mask[0] = true;
    atomicAdd(steps_counter, j);
}

template <typename scalar_t>
__global__ void volumetric_rendering_backward_kernel(
    const uint32_t n_rays,
    const int *packed_info,             // input ray & point indices.
    const scalar_t *starts,             // input start t
    const scalar_t *ends,               // input end t
    const scalar_t *sigmas,             // input density after activation
    const scalar_t *rgbs,               // input rgb after activation
    const scalar_t *accumulated_weight, // forward output
    const scalar_t *accumulated_depth,  // forward output
    const scalar_t *accumulated_color,  // forward output
    const scalar_t *grad_weight,        // input
    const scalar_t *grad_depth,         // input
    const scalar_t *grad_color,         // input
    scalar_t *grad_sigmas,              // output
    scalar_t *grad_rgbs                 // output
)
{
    CUDA_GET_THREAD_ID(thread_id, n_rays);

    // locate
    const int i = packed_info[thread_id * 3 + 0];        // ray idx in {rays_o, rays_d}
    const int base = packed_info[thread_id * 3 + 1];     // point idx start.
    const int numsteps = packed_info[thread_id * 3 + 2]; // point idx shift.
    if (numsteps == 0)
        return;

    starts += base;
    ends += base;
    sigmas += base;
    rgbs += base * 3;
    grad_sigmas += base;
    grad_rgbs += base * 3;
    accumulated_weight += i;
    accumulated_depth += i;
    accumulated_color += i * 3;
    grad_weight += i;
    grad_depth += i;
    grad_color += i * 3;

    // backward of accumulated rendering
    scalar_t T = 1.f;
    scalar_t EPSILON = 1e-4f;

    int j = 0;
    scalar_t r = 0, g = 0, b = 0, d = 0;
    for (; j < numsteps; ++j)
    {
        if (T < EPSILON)
        {
            break;
        }
        const scalar_t delta = ends[j] - starts[j];
        const scalar_t t = (ends[j] + starts[j]) * 0.5f;
        const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
        const scalar_t weight = alpha * T;
        r += weight * rgbs[j * 3 + 0];
        g += weight * rgbs[j * 3 + 1];
        b += weight * rgbs[j * 3 + 2];
        d += weight * t;
        T *= (1.f - alpha);

        grad_rgbs[j * 3 + 0] = grad_color[0] * weight;
        grad_rgbs[j * 3 + 1] = grad_color[1] * weight;
        grad_rgbs[j * 3 + 2] = grad_color[2] * weight;
        grad_sigmas[j] = delta * (
            grad_color[0] * (T * rgbs[j * 3 + 0] - (accumulated_color[0] - r)) +
            grad_color[1] * (T * rgbs[j * 3 + 1] - (accumulated_color[1] - g)) +
            grad_color[2] * (T * rgbs[j * 3 + 2] - (accumulated_color[2] - b)) +
            grad_weight[0] * (1.f - accumulated_weight[0]) +
            grad_depth[0] * (t * T - (accumulated_depth[0] - d)));
    }
}

std::vector<torch::Tensor> volumetric_rendering_inference(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(starts);
    CHECK_INPUT(ends);
    CHECK_INPUT(sigmas);
    TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
    TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
    TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
    TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);

    const uint32_t n_rays = packed_info.size(0);
    const uint32_t n_samples = sigmas.size(0);

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // helper counter
    torch::Tensor steps_counter = torch::zeros(
        {1}, packed_info.options().dtype(torch::kInt32));

    // outputs
    torch::Tensor compact_packed_info = torch::zeros({n_rays, 2}, packed_info.options());
    torch::Tensor compact_selector = -torch::ones({n_samples}, packed_info.options());

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        sigmas.scalar_type(), "volumetric_rendering_inference", ([&]
        {
            volumetric_rendering_inference_kernel<scalar_t><<<blocks, threads>>>(
                n_rays,
                packed_info.data_ptr<int>(),
                starts.data_ptr<scalar_t>(),
                ends.data_ptr<scalar_t>(),
                sigmas.data_ptr<scalar_t>(),
                compact_packed_info.data_ptr<int>(),
                compact_selector.data_ptr<int>(),
                steps_counter.data_ptr<int>());
        }));

    return {compact_packed_info, compact_selector, steps_counter};
}

/**
 * @brief Volumetric Rendering: Accumulating samples in the forward pass.
 * The inputs, except for `sigmas` and `rgbs`, are the outputs of our
 * cuda ray marching function in `raymarching.cu`
 *
 * @param packed_info Stores how to index the ray samples from the returned values.
 * Shape of [n_rays, 3]. First value is the ray index. Second value is the sample
 * start index in the results for this ray. Third value is the number of samples for
 * this ray. Note for rays that have zero samples, we simply skip them so the `packed_info`
 * has some zero padding in the end.
 * @param starts Where the frustum-shape sample starts along a ray. [total_samples, 1]
 * @param ends Where the frustum-shape sample ends along a ray. [total_samples, 1]
 * @param sigmas Densities at those samples. [total_samples, 1]
 * @param rgbs RGBs at those samples. [total_samples, 3]
 * @return std::vector<torch::Tensor>
 * - accumulated_weight: Ray opacity. [n_rays, 1]
 * - accumulated_depth: Ray depth. [n_rays, 1]
 * - accumulated_color: Ray color. [n_rays, 3]
 * - mask: Boolean value storing whether this ray has valid samples in packed_info. [n_rays]
 */
std::vector<torch::Tensor> volumetric_rendering_forward(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas,
    torch::Tensor rgbs)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(starts);
    CHECK_INPUT(ends);
    CHECK_INPUT(sigmas);
    CHECK_INPUT(rgbs);
    TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 3);
    TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
    TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
    TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
    TORCH_CHECK(rgbs.ndimension() == 2 & rgbs.size(1) == 3);

    const uint32_t n_rays = packed_info.size(0);

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // helper counter
    torch::Tensor steps_counter = torch::zeros({1}, rgbs.options().dtype(torch::kInt32));

    // outputs
    torch::Tensor accumulated_weight = torch::zeros({n_rays, 1}, sigmas.options());
    torch::Tensor accumulated_depth = torch::zeros({n_rays, 1}, sigmas.options());
    torch::Tensor accumulated_color = torch::zeros({n_rays, 3}, sigmas.options());
    // The rays that are not skipped during sampling.
    torch::Tensor mask = torch::zeros({n_rays}, sigmas.options().dtype(torch::kBool));

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        sigmas.scalar_type(), "volumetric_rendering_forward", ([&]
        {
            volumetric_rendering_forward_kernel<scalar_t><<<blocks, threads>>>(
                n_rays,
                packed_info.data_ptr<int>(),
                starts.data_ptr<scalar_t>(),
                ends.data_ptr<scalar_t>(),
                sigmas.data_ptr<scalar_t>(),
                rgbs.data_ptr<scalar_t>(),
                accumulated_weight.data_ptr<scalar_t>(),
                accumulated_depth.data_ptr<scalar_t>(),
                accumulated_color.data_ptr<scalar_t>(),
                mask.data_ptr<bool>(),
                steps_counter.data_ptr<int>());
        }));

    return {accumulated_weight, accumulated_depth, accumulated_color, mask, steps_counter};
}

/**
 * @brief Volumetric Rendering: Accumulating samples in the backward pass.
 */
std::vector<torch::Tensor> volumetric_rendering_backward(
    torch::Tensor accumulated_weight,
    torch::Tensor accumulated_depth,
    torch::Tensor accumulated_color,
    torch::Tensor grad_weight,
    torch::Tensor grad_depth,
    torch::Tensor grad_color,
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas,
    torch::Tensor rgbs)
{
    DEVICE_GUARD(packed_info);

    const uint32_t n_rays = packed_info.size(0);

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // outputs
    torch::Tensor grad_sigmas = torch::zeros(sigmas.sizes(), sigmas.options());
    torch::Tensor grad_rgbs = torch::zeros(rgbs.sizes(), rgbs.options());

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        sigmas.scalar_type(), "volumetric_rendering_backward", ([&]
        {
            volumetric_rendering_backward_kernel<scalar_t><<<blocks, threads>>>(
                n_rays,
                packed_info.data_ptr<int>(),
                starts.data_ptr<scalar_t>(),
                ends.data_ptr<scalar_t>(),
                sigmas.data_ptr<scalar_t>(),
                rgbs.data_ptr<scalar_t>(),
                accumulated_weight.data_ptr<scalar_t>(),
                accumulated_depth.data_ptr<scalar_t>(),
                accumulated_color.data_ptr<scalar_t>(),
                grad_weight.data_ptr<scalar_t>(),
                grad_depth.data_ptr<scalar_t>(),
                grad_color.data_ptr<scalar_t>(),
                grad_sigmas.data_ptr<scalar_t>(),
                grad_rgbs.data_ptr<scalar_t>());
        }));

    return {grad_sigmas, grad_rgbs};
}
```
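The kernels in this deleted file are a packed form of standard alpha compositing: per sample, alpha_j = 1 - exp(-sigma_j * delta_j), weight_j = T_j * alpha_j, and the transmittance update T *= (1 - alpha_j), with early termination once T < 1e-4. Below is a slow pure-PyTorch reference for a single ray's forward pass; it is my own sketch for sanity-checking the CUDA loop, not repository code.

```python
import torch


def composite_one_ray(starts, ends, sigmas, rgbs, eps=1e-4):
    """Reference alpha compositing for one ray's samples.

    starts, ends, sigmas: [n, 1]; rgbs: [n, 3]. Mirrors the CUDA forward loop.
    """
    T = 1.0
    weight_sum = torch.zeros(1)
    depth = torch.zeros(1)
    color = torch.zeros(3)
    for j in range(starts.shape[0]):
        if T < eps:  # early termination once the ray is effectively opaque
            break
        delta = (ends[j] - starts[j]).item()
        t_mid = 0.5 * (ends[j] + starts[j]).item()
        alpha = 1.0 - float(torch.exp(-sigmas[j] * delta))
        w = alpha * T
        weight_sum += w
        depth += w * t_mid
        color += w * rgbs[j]
        T *= 1.0 - alpha
    return weight_sum, depth, color
```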
nerfacc/cuda/csrc/volumetric_weights.cu  (view file @ 4a4bbaba)

```cpp
#include "include/helpers_cuda.h"

template <typename scalar_t>
__global__ void volumetric_rendering_steps_kernel(
    const uint32_t n_rays,
    const int *packed_info, // input ray & point indices.
    const scalar_t *starts, // input start t
    const scalar_t *ends,   // input end t
    const scalar_t *sigmas, // input density after activation
    // output: should be all zero (false) initialized
    int *num_steps,
    bool *selector)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    // locate
    const int base = packed_info[i * 2 + 0];  // point idx start.
    const int steps = packed_info[i * 2 + 1]; // point idx shift.
    if (steps == 0)
        return;

    starts += base;
    ends += base;
    sigmas += base;
    num_steps += i;
    selector += base;

    // accumulated rendering
    scalar_t T = 1.f;
    scalar_t EPSILON = 1e-4f;

    int j = 0;
    for (; j < steps; ++j)
    {
        if (T < EPSILON)
        {
            break;
        }
        const scalar_t delta = ends[j] - starts[j];
        const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
        const scalar_t weight = alpha * T;
        T *= (1.f - alpha);
        selector[j] = true;
    }
    num_steps[0] = j;
    return;
}

template <typename scalar_t>
__global__ void volumetric_weights_forward_kernel(
    const uint32_t n_rays,
...

@@ -95,6 +138,55 @@ __global__ void volumetric_weights_backward_kernel(
}

std::vector<torch::Tensor> volumetric_rendering_steps(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(starts);
    CHECK_INPUT(ends);
    CHECK_INPUT(sigmas);
    TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
    TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
    TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
    TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);

    const uint32_t n_rays = packed_info.size(0);
    const uint32_t n_samples = sigmas.size(0);

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
    torch::Tensor selector = torch::zeros({n_samples}, packed_info.options().dtype(torch::kBool));

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        sigmas.scalar_type(), "volumetric_rendering_inference", ([&]
        {
            volumetric_rendering_steps_kernel<scalar_t><<<blocks, threads>>>(
                n_rays,
                packed_info.data_ptr<int>(),
                starts.data_ptr<scalar_t>(),
                ends.data_ptr<scalar_t>(),
                sigmas.data_ptr<scalar_t>(),
                num_steps.data_ptr<int>(),
                selector.data_ptr<bool>());
        }));

    torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
    torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
    return {compact_packed_info, selector};
}

std::vector<torch::Tensor> volumetric_weights_forward(
    torch::Tensor packed_info,
    torch::Tensor starts,
...
```
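In effect, `volumetric_rendering_steps` truncates each ray's sample list at the point where transmittance drops below 1e-4 and repacks the survivors with a cumulative sum. A rough per-ray PyTorch equivalent is sketched below; it is illustrative only (the real kernel does this in parallel on the GPU), but the outputs have the same meaning.

```python
import torch


def rendering_steps_reference(packed_info, starts, ends, sigmas, eps=1e-4):
    """packed_info: [n_rays, 2] rows of (start offset, num samples) per ray."""
    n_rays = packed_info.shape[0]
    num_steps = torch.zeros(n_rays, dtype=torch.int32)
    selector = torch.zeros(sigmas.shape[0], dtype=torch.bool)
    for i in range(n_rays):
        base, steps = packed_info[i].tolist()
        T = 1.0
        j = 0
        while j < steps and T >= eps:
            delta = (ends[base + j] - starts[base + j]).item()
            alpha = 1.0 - float(torch.exp(-sigmas[base + j] * delta))
            T *= 1.0 - alpha
            selector[base + j] = True  # this sample still contributes
            j += 1
        num_steps[i] = j
    cum_steps = torch.cumsum(num_steps, 0, dtype=torch.int32)
    compact_packed_info = torch.stack([cum_steps - num_steps, num_steps], dim=1)
    return compact_packed_info, selector
```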
nerfacc/utils.py  (view file @ 4a4bbaba)

```diff
@@ -78,7 +78,7 @@ def volumetric_accumulate(
     values: torch.Tensor = None,
     n_rays: int = None,
 ) -> torch.Tensor:
-    """Accumulate values along the ray.
+    """Accumulate volumetric values along the ray.

     Note: this function is only differentiable to `weights` and `values`.
```
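The accumulation that `volumetric_accumulate` describes can be pictured as a scatter-add of `weights * values` into per-ray buckets, which is also what the commented-out fallback in `volumetric_rendering.py` below used to do. The sketch uses an invented name and an approximate signature; the real helper's interface may differ.

```python
import torch


def accumulate_along_rays(weights, ray_indices, values=None, n_rays=None):
    """Sum weights (optionally weights * values) per ray.

    weights: [n_samples], ray_indices: [n_samples] int64, values: [n_samples, D].
    """
    if n_rays is None:
        n_rays = int(ray_indices.max().item()) + 1
    src = weights[:, None] if values is None else weights[:, None] * values
    out = torch.zeros(n_rays, src.shape[-1], device=src.device, dtype=src.dtype)
    out.index_add_(0, ray_indices, src)  # bucket each sample into its ray
    return out
```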
nerfacc/volumetric_rendering.py  (view file @ 4a4bbaba)

```diff
@@ -5,7 +5,7 @@ import torch
from .cuda import (
    # ComputeWeight,; VolumeRenderer,; ray_aabb_intersect,
    ray_marching,
-    volumetric_rendering_inference,
+    volumetric_rendering_steps,
)
from .utils import ray_aabb_intersect, volumetric_accumulate, volumetric_weights
...

@@ -35,16 +35,13 @@ def volumetric_rendering(
    render_bkgd = render_bkgd.contiguous()
    n_rays = rays_o.shape[0]
    if render_est_n_samples is None:
        render_total_samples = n_rays * render_n_samples
    else:
        render_total_samples = render_est_n_samples
    if render_step_size is None:
        # Note: CPU<->GPU is not ideal, try to pre-define it outside this function.
        render_step_size = (
            (scene_aabb[3:] - scene_aabb[:3]).max() * math.sqrt(3) / render_n_samples
        )
    # get packed samples from ray marching & occupancy check.
    with torch.no_grad():
        t_min, t_max = ray_aabb_intersect(rays_o, rays_d, scene_aabb)
...

@@ -73,50 +70,30 @@ def volumetric_rendering(
    )
    steps_counter = packed_info[:, -1].sum(0, keepdim=True)

    # compact the samples through volumetric rendering
    with torch.no_grad():
        densities = query_fn(frustum_positions, frustum_dirs, only_density=True, **kwargs)
-        (
-            compact_packed_info,
-            compact_selector,
-            compact_steps_counter,
-        ) = volumetric_rendering_inference(
+        compact_packed_info, compact_selector = volumetric_rendering_steps(
            packed_info.contiguous(),
            frustum_starts.contiguous(),
            frustum_ends.contiguous(),
            densities.contiguous(),
        )
        compact_selector = compact_selector[compact_selector >= 0].long()
        compact_pad = (
            int(math.ceil(len(compact_selector) / 256.0)) * 256 - len(compact_selector)
        )
        compact_selector = torch.nn.functional.pad(compact_selector, (0, compact_pad))
        compact_frustum_positions = frustum_positions[compact_selector]
        compact_frustum_dirs = frustum_dirs[compact_selector]
        compact_frustum_starts = frustum_starts[compact_selector]
        compact_frustum_ends = frustum_ends[compact_selector]
        # print(compact_selector.float().mean(), compact_steps_counter, steps_counter)
        compact_steps_counter = compact_packed_info[:, -1].sum(0, keepdim=True)

    # network
    compact_query_results = query_fn(compact_frustum_positions, compact_frustum_dirs, **kwargs)
    compact_rgbs, compact_densities = compact_query_results[0], compact_query_results[1]

    # (
    #     accumulated_weight,
    #     accumulated_depth,
    #     accumulated_color,
    #     alive_ray_mask,
    #     compact_steps_counter,
    # ) = VolumeRenderer.apply(
    #     compact_packed_info.contiguous(),
    #     compact_frustum_starts.contiguous(),
    #     compact_frustum_ends.contiguous(),
    #     compact_densities.contiguous(),
    #     compact_rgbs.contiguous(),
    # )

    # accumulation
    compact_weights, compact_ray_indices, alive_ray_mask = volumetric_weights(
        compact_packed_info,
        compact_frustum_starts,
...

@@ -136,46 +113,6 @@ def volumetric_rendering(
        n_rays,
    )

    # index = compact_ray_indices[:, None].long()
    # accumulated_color = torch.zeros((n_rays, 3), device=device)
    # accumulated_color.scatter_add_(
    #     dim=0,
    #     index=index.expand(-1, 3),
    #     src=compact_weights[:, None] * compact_rgbs,
    # )
    # accumulated_weight = torch.zeros((n_rays, 1), device=device)
    # accumulated_weight.scatter_add_(
    #     dim=0,
    #     index=index.expand(-1, 1),
    #     src=compact_weights[:, None],
    # )
    # accumulated_depth = torch.zeros((n_rays, 1), device=device)
    # accumulated_depth.scatter_add_(
    #     dim=0,
    #     index=index.expand(-1, 1),
    #     src=compact_weights[:, None]
    #     * (compact_frustum_starts + compact_frustum_ends)
    #     / 2.0,
    # )

    # query_results = query_fn(frustum_positions, frustum_dirs, **kwargs)
    # rgbs, densities = query_results[0], query_results[1]
    # (
    #     accumulated_weight,
    #     accumulated_depth,
    #     accumulated_color,
    #     alive_ray_mask,
    #     compact_steps_counter,
    # ) = VolumeRenderer.apply(
    #     packed_info.contiguous(),
    #     frustum_starts.contiguous(),
    #     frustum_ends.contiguous(),
    #     densities.contiguous(),
    #     rgbs.contiguous(),
    # )

    accumulated_depth = torch.clip(accumulated_depth, t_min[:, None], t_max[:, None])
    accumulated_color = accumulated_color + render_bkgd * (1.0 - accumulated_weight)
...
```
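Putting the pieces of this commit together, the render path now runs: ray marching, a cheap density-only query, `volumetric_rendering_steps` to drop samples hidden behind nearly opaque regions, then a full query plus weight accumulation on the compacted samples. A condensed, hedged outline of that flow as a helper function follows; the function name and exact argument handling are mine, not the repository's.

```python
import torch

from nerfacc.cuda import volumetric_rendering_steps  # assumes the built extension


def render_compact(query_fn, packed_info, positions, dirs, starts, ends, **kwargs):
    """Hedged outline of the post-cleanup sampling path; names mirror the diff."""
    with torch.no_grad():
        # 1. Density-only query, then drop samples behind opaque regions.
        densities = query_fn(positions, dirs, only_density=True, **kwargs)
        compact_packed_info, compact_selector = volumetric_rendering_steps(
            packed_info.contiguous(),
            starts.contiguous(),
            ends.contiguous(),
            densities.contiguous(),
        )
    # 2. Full, differentiable query on the surviving samples only.
    compact_rgbs, compact_densities = query_fn(
        positions[compact_selector], dirs[compact_selector], **kwargs
    )[:2]
    return compact_packed_info, compact_selector, compact_rgbs, compact_densities
```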