Unverified commit 601c4c34 authored by Ruilong Li (李瑞龙), committed by GitHub

Add new Utils: pack & unpack data; cdf sampling; query grid (#57)

* new utils

* proper test for pack

* add test for cdf and query occ

* add deprecated warning

* bump version

* fix list return to tuple
parent 40075646
nerfacc.pack\_data
==================
.. currentmodule:: nerfacc
.. autofunction:: pack_data
\ No newline at end of file
nerfacc.ray\_resampling
=======================
.. currentmodule:: nerfacc
.. autofunction:: ray_resampling
\ No newline at end of file
nerfacc.unpack\_data
====================
.. currentmodule:: nerfacc
.. autofunction:: unpack_data
\ No newline at end of file
nerfacc.unpack\_info
====================
.. currentmodule:: nerfacc
.. autofunction:: unpack_info
\ No newline at end of file
nerfacc.unpack\_to\_ray\_indices
================================
.. currentmodule:: nerfacc
.. autofunction:: unpack_to_ray_indices
\ No newline at end of file
......@@ -8,11 +8,14 @@ Utils
:toctree: generated/
ray_aabb_intersect
unpack_to_ray_indices
unpack_info
accumulate_along_rays
render_weight_from_density
render_weight_from_alpha
render_visibility
ray_resampling
pack_data
unpack_data
\ No newline at end of file
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""
import warnings
from .cdf import ray_resampling
from .contraction import ContractionType, contract, contract_inv
from .grid import Grid, OccupancyGrid
from .grid import Grid, OccupancyGrid, query_grid
from .intersection import ray_aabb_intersect
from .pack import unpack_to_ray_indices
from .pack import pack_data, unpack_data, unpack_info
from .ray_marching import ray_marching
from .version import __version__
from .vol_rendering import (
......@@ -16,19 +18,34 @@ from .vol_rendering import (
rendering,
)
# About to be deprecated
def unpack_to_ray_indices(*args, **kwargs):
warnings.warn(
"`unpack_to_ray_indices` will be deprecated. Please use `unpack_info` instead.",
DeprecationWarning,
stacklevel=2,
)
return unpack_info(*args, **kwargs)
__all__ = [
"__version__",
"Grid",
"OccupancyGrid",
"query_grid",
"ContractionType",
"contract",
"contract_inv",
"ray_aabb_intersect",
"ray_marching",
"unpack_to_ray_indices",
"accumulate_along_rays",
"render_visibility",
"render_weight_from_alpha",
"render_weight_from_density",
"rendering",
"__version__",
"pack_data",
"unpack_data",
"unpack_info",
"ray_resampling",
]
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""
from typing import Tuple
from torch import Tensor
import nerfacc.cuda as _C
def ray_resampling(
packed_info: Tensor,
t_starts: Tensor,
t_ends: Tensor,
weights: Tensor,
n_samples: int,
) -> Tuple[Tensor, Tensor, Tensor]:
"""Resample a set of rays based on the CDF of the weights.
Args:
packed_info (Tensor): Stores information on which samples belong to the same ray. \
See :func:`nerfacc.ray_marching` for details. Tensor with shape (n_rays, 2).
t_starts: Where the frustum-shaped sample starts along a ray. Tensor with \
shape (n_samples, 1).
t_ends: Where the frustum-shaped sample ends along a ray. Tensor with \
shape (n_samples, 1).
weights: Volumetric rendering weights for those samples. Tensor with shape \
(n_samples,).
n_samples (int): Number of samples per ray to resample.
Returns:
Resampled packed_info (n_rays, 2), and resampled t_starts and t_ends, each with \
shape (total_resampled_samples, 1), where total_resampled_samples is n_samples \
times the number of rays that had at least one input sample.
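Examples:

    A minimal usage sketch, assuming a CUDA device and uniform weights as a
    stand-in for real volumetric rendering weights:

    .. code-block:: python

        rays_o = torch.rand((128, 3), device="cuda:0")
        rays_d = torch.randn((128, 3), device="cuda:0")
        rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
        scene_aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device="cuda:0")
        packed_info, t_starts, t_ends = ray_marching(
            rays_o, rays_d, scene_aabb=scene_aabb, render_step_size=1e-2
        )
        weights = torch.ones_like(t_starts[:, 0])  # stand-in weights
        packed_info, t_starts, t_ends = ray_resampling(
            packed_info, t_starts, t_ends, weights, n_samples=64
        )
        # each ray that had samples now has exactly 64 resampled intervals
        print(t_starts.shape)  # torch.Size([n_rays_with_samples * 64, 1])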
"""
(
resampled_packed_info,
resampled_t_starts,
resampled_t_ends,
) = _C.ray_resampling(
packed_info.contiguous(),
t_starts.contiguous(),
t_ends.contiguous(),
weights.contiguous(),
n_samples,
)
return resampled_packed_info, resampled_t_starts, resampled_t_ends
......@@ -19,13 +19,17 @@ ContractionTypeGetter = _make_lazy_cuda_func("ContractionType")
contract = _make_lazy_cuda_func("contract")
contract_inv = _make_lazy_cuda_func("contract_inv")
query_occ = _make_lazy_cuda_func("query_occ")
grid_query = _make_lazy_cuda_func("grid_query")
ray_aabb_intersect = _make_lazy_cuda_func("ray_aabb_intersect")
ray_marching = _make_lazy_cuda_func("ray_marching")
unpack_to_ray_indices = _make_lazy_cuda_func("unpack_to_ray_indices")
ray_resampling = _make_lazy_cuda_func("ray_resampling")
rendering_forward = _make_lazy_cuda_func("rendering_forward")
rendering_backward = _make_lazy_cuda_func("rendering_backward")
rendering_alphas_forward = _make_lazy_cuda_func("rendering_alphas_forward")
rendering_alphas_backward = _make_lazy_cuda_func("rendering_alphas_backward")
unpack_data = _make_lazy_cuda_func("unpack_data")
unpack_info = _make_lazy_cuda_func("unpack_info")
unpack_info_to_mask = _make_lazy_cuda_func("unpack_info_to_mask")
/*
* Copyright (c) 2022 Ruilong Li, UC Berkeley.
*/
#include "include/helpers_cuda.h"
template <typename scalar_t>
__global__ void cdf_resampling_kernel(
const uint32_t n_rays,
const int *packed_info, // input ray & point indices.
const scalar_t *starts, // input start t
const scalar_t *ends, // input end t
const scalar_t *weights, // transmittance weights
const int *resample_packed_info,
scalar_t *resample_starts,
scalar_t *resample_ends)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
const int resample_base = resample_packed_info[i * 2 + 0];
const int resample_steps = resample_packed_info[i * 2 + 1];
if (steps == 0)
return;
starts += base;
ends += base;
weights += base;
resample_starts += resample_base;
resample_ends += resample_base;
// normalize weights **per ray**
scalar_t weights_sum = 0.0f;
for (int j = 0; j < steps; j++)
weights_sum += weights[j];
scalar_t padding = fmaxf(1e-5f - weights_sum, 0.0f);
scalar_t padding_step = padding / steps;
weights_sum += padding;
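// (the padding keeps the normalized per-ray CDF below well defined even
// for rays whose weights sum to ~0)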
int num_bins = resample_steps + 1;
scalar_t cdf_step_size = (1.0f - 1.0 / num_bins) / resample_steps;
int idx = 0, j = 0;
scalar_t cdf_prev = 0.0f, cdf_next = (weights[idx] + padding_step) / weights_sum;
scalar_t cdf_u = 1.0 / (2 * num_bins);
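// Invert the piecewise-linear CDF in a single forward walk: each of the
// num_bins evenly spaced query points cdf_u maps to one bin edge t, and
// consecutive edges form the resampled [start, end] intervals.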
while (j < num_bins)
{
if (cdf_u < cdf_next)
{
// printf("cdf_u: %f, cdf_next: %f\n", cdf_u, cdf_next);
// resample in this interval
scalar_t scaling = (ends[idx] - starts[idx]) / (cdf_next - cdf_prev);
scalar_t t = (cdf_u - cdf_prev) * scaling + starts[idx];
if (j < num_bins - 1)
resample_starts[j] = t;
if (j > 0)
resample_ends[j - 1] = t;
// going further to next resample
cdf_u += cdf_step_size;
j += 1;
}
else
{
// going to next interval
idx += 1;
cdf_prev = cdf_next;
cdf_next += (weights[idx] + padding_step) / weights_sum;
}
}
if (j != num_bins)
{
printf("Error: %d %d %f\n", j, num_bins, weights_sum);
}
return;
}
std::vector<torch::Tensor> ray_resampling(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor weights,
const int steps)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(weights);
TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
TORCH_CHECK(weights.ndimension() == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = weights.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
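// Output layout: every ray with at least one input sample gets exactly
// `steps` resampled intervals; the exclusive cumsum below gives each
// ray's start offset in the packed output.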
torch::Tensor num_steps = torch::split(packed_info, 1, 1)[1];
torch::Tensor resample_num_steps = (num_steps > 0).to(num_steps.options()) * steps;
torch::Tensor resample_cum_steps = resample_num_steps.cumsum(0, torch::kInt32);
torch::Tensor resample_packed_info = torch::cat(
{resample_cum_steps - resample_num_steps, resample_num_steps}, 1);
int total_steps = resample_cum_steps[resample_cum_steps.size(0) - 1].item<int>();
torch::Tensor resample_starts = torch::zeros({total_steps, 1}, starts.options());
torch::Tensor resample_ends = torch::zeros({total_steps, 1}, ends.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
weights.scalar_type(),
"ray_resampling",
([&]
{ cdf_resampling_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
weights.data_ptr<scalar_t>(),
resample_packed_info.data_ptr<int>(),
// outputs
resample_starts.data_ptr<scalar_t>(),
resample_ends.data_ptr<scalar_t>()); }));
return {resample_packed_info, resample_starts, resample_ends};
}
......@@ -4,7 +4,7 @@
#include "include/helpers_cuda.h"
__global__ void ray_indices_kernel(
__global__ void unpack_info_kernel(
// input
const int n_rays,
const int *packed_info,
......@@ -27,7 +27,61 @@ __global__ void ray_indices_kernel(
}
}
torch::Tensor unpack_to_ray_indices(const torch::Tensor packed_info)
__global__ void unpack_info_to_mask_kernel(
// input
const int n_rays,
const int *packed_info,
const int n_samples,
// output
bool *masks) // [n_rays, n_samples]
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
masks += i * n_samples;
for (int j = 0; j < steps; ++j)
{
masks[j] = true;
}
}
template <typename scalar_t>
__global__ void unpack_data_kernel(
const uint32_t n_rays,
const int *packed_info, // input ray & point indices.
const int data_dim,
const scalar_t *data,
const int n_samples_per_ray,
scalar_t *unpacked_data) // (n_rays, n_samples_per_ray, data_dim)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
data += base * data_dim;
unpacked_data += i * n_samples_per_ray * data_dim;
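// copy this ray's `steps` samples; slots beyond `steps` keep the zero
// initialization from torch::zeros on the host side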
for (int j = 0; j < steps; j++)
{
for (int k = 0; k < data_dim; k++)
{
unpacked_data[j * data_dim + k] = data[j * data_dim + k];
}
}
return;
}
torch::Tensor unpack_info(const torch::Tensor packed_info)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
......@@ -40,9 +94,71 @@ torch::Tensor unpack_to_ray_indices(const torch::Tensor packed_info)
torch::Tensor ray_indices = torch::zeros(
{n_samples}, packed_info.options().dtype(torch::kInt32));
ray_indices_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
unpack_info_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
ray_indices.data_ptr<int>());
return ray_indices;
}
torch::Tensor unpack_info_to_mask(
const torch::Tensor packed_info, const int n_samples)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
const int n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
torch::Tensor masks = torch::zeros(
{n_rays, n_samples}, packed_info.options().dtype(torch::kBool));
unpack_info_to_mask_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
n_samples,
masks.data_ptr<bool>());
return masks;
}
torch::Tensor unpack_data(
torch::Tensor packed_info,
torch::Tensor data,
int n_samples_per_ray)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(data);
TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 2);
TORCH_CHECK(data.ndimension() == 2);
const int n_rays = packed_info.size(0);
const int n_samples = data.size(0);
const int data_dim = data.size(1);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
torch::Tensor unpacked_data = torch::zeros(
{n_rays, n_samples_per_ray, data_dim}, data.options());
AT_DISPATCH_ALL_TYPES(
data.scalar_type(),
"unpack_data",
([&]
{ unpack_data_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
data_dim,
data.data_ptr<scalar_t>(),
n_samples_per_ray,
// outputs
unpacked_data.data_ptr<scalar_t>()); }));
return unpacked_data;
}
......@@ -44,14 +44,17 @@ std::vector<torch::Tensor> ray_marching(
const float step_size,
const float cone_angle);
torch::Tensor unpack_to_ray_indices(
torch::Tensor unpack_info(
const torch::Tensor packed_info);
torch::Tensor query_occ(
torch::Tensor unpack_info_to_mask(
const torch::Tensor packed_info, const int n_samples);
torch::Tensor grid_query(
const torch::Tensor samples,
// occupancy grid & contraction
const torch::Tensor roi,
const torch::Tensor grid_binary,
const torch::Tensor grid_value,
const ContractionType type);
torch::Tensor contract(
......@@ -81,6 +84,18 @@ std::vector<torch::Tensor> rendering_alphas_forward(
float alpha_thre,
bool compression);
std::vector<torch::Tensor> ray_resampling(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor weights,
const int steps);
torch::Tensor unpack_data(
torch::Tensor packed_info,
torch::Tensor data,
int n_samples_per_ray);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
// contraction
......@@ -92,16 +107,21 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
m.def("contract_inv", &contract_inv);
// grid
m.def("query_occ", &query_occ);
m.def("grid_query", &grid_query);
// marching
m.def("ray_aabb_intersect", &ray_aabb_intersect);
m.def("ray_marching", &ray_marching);
m.def("unpack_to_ray_indices", &unpack_to_ray_indices);
m.def("ray_resampling", &ray_resampling);
// rendering
m.def("rendering_forward", &rendering_forward);
m.def("rendering_backward", &rendering_backward);
m.def("rendering_alphas_forward", &rendering_alphas_forward);
m.def("rendering_alphas_backward", &rendering_alphas_backward);
// pack & unpack
m.def("unpack_data", &unpack_data);
m.def("unpack_info", &unpack_info);
m.def("unpack_info_to_mask", &unpack_info_to_mask);
}
\ No newline at end of file
......@@ -24,11 +24,12 @@ inline __device__ __host__ int grid_idx_at(
return idx;
}
inline __device__ __host__ bool grid_occupied_at(
template <typename scalar_t>
inline __device__ __host__ scalar_t grid_occupied_at(
const float3 xyz,
const float3 roi_min, const float3 roi_max,
ContractionType type,
const int3 grid_res, const bool *grid_binary)
const int3 grid_res, const scalar_t *grid_value)
{
if (type == ContractionType::AABB &&
(xyz.x < roi_min.x || xyz.x > roi_max.x ||
......@@ -40,7 +41,7 @@ inline __device__ __host__ bool grid_occupied_at(
float3 xyz_unit = apply_contraction(
xyz, roi_min, roi_max, type);
int idx = grid_idx_at(xyz_unit, grid_res);
return grid_binary[idx];
return grid_value[idx];
}
// dda like step
......@@ -283,6 +284,7 @@ std::vector<torch::Tensor> ray_marching(
// Query the occupancy grid
// ----------------------------------------------------------------------------
template <typename scalar_t>
__global__ void query_occ_kernel(
// rays info
const uint32_t n_samples,
......@@ -290,10 +292,10 @@ __global__ void query_occ_kernel(
// occupancy grid & contraction
const float *roi,
const int3 grid_res,
const bool *grid_binary, // shape (reso_x, reso_y, reso_z)
const scalar_t *grid_value, // shape (reso_x, reso_y, reso_z)
const ContractionType type,
// outputs
bool *occs)
scalar_t *occs)
{
CUDA_GET_THREAD_ID(i, n_samples);
......@@ -305,15 +307,15 @@ __global__ void query_occ_kernel(
const float3 roi_max = make_float3(roi[3], roi[4], roi[5]);
const float3 xyz = make_float3(samples[0], samples[1], samples[2]);
*occs = grid_occupied_at(xyz, roi_min, roi_max, type, grid_res, grid_binary);
*occs = grid_occupied_at(xyz, roi_min, roi_max, type, grid_res, grid_value);
return;
}
torch::Tensor query_occ(
torch::Tensor grid_query(
const torch::Tensor samples,
// occupancy grid & contraction
const torch::Tensor roi,
const torch::Tensor grid_binary,
const torch::Tensor grid_value,
const ContractionType type)
{
DEVICE_GUARD(samples);
......@@ -321,23 +323,28 @@ torch::Tensor query_occ(
const int n_samples = samples.size(0);
const int3 grid_res = make_int3(
grid_binary.size(0), grid_binary.size(1), grid_binary.size(2));
grid_value.size(0), grid_value.size(1), grid_value.size(2));
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_samples, threads);
torch::Tensor occs = torch::zeros({n_samples}, grid_value.options());
AT_DISPATCH_FLOATING_TYPES_AND(
at::ScalarType::Bool,
occs.scalar_type(),
"grid_query",
([&]
{ query_occ_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_samples,
samples.data_ptr<float>(),
// grid
roi.data_ptr<float>(),
grid_res,
grid_value.data_ptr<scalar_t>(),
type,
// outputs
occs.data_ptr<scalar_t>()); }));
return occs;
}
......@@ -7,12 +7,46 @@ from typing import Callable, List, Union
import torch
import torch.nn as nn
import nerfacc.cuda as _C
from .contraction import ContractionType, contract_inv
# TODO: add this to the dependency
# TODO: check torch.scatter_reduce_
# from torch_scatter import scatter_max
@torch.no_grad()
def query_grid(
samples: torch.Tensor,
grid_roi: torch.Tensor,
grid_values: torch.Tensor,
grid_type: ContractionType,
):
"""Query grid values given coordinates.
Args:
samples: (n_samples, 3) tensor of coordinates.
grid_roi: (6,) region of interest of the grid. Usually it should be
acquired from the grid itself using `grid.roi_aabb`.
grid_values: A 3D tensor of grid values with shape (resx, resy, resz).
grid_type: Contraction type of the grid. Usually it should be
acquired from the grid itself using `grid.contraction_type`.
Returns:
(n_samples) values for those samples queried from the grid.
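Examples:

    A minimal sketch, assuming an :class:`OccupancyGrid` on a CUDA device
    (its `roi_aabb`, `binary`, and `contraction_type` attributes supply the
    grid arguments):

    .. code-block:: python

        grid = OccupancyGrid(roi_aabb=[0, 0, 0, 1, 1, 1], resolution=128).to("cuda:0")
        samples = torch.rand((100, 3), device="cuda:0")
        occs = query_grid(
            samples,
            grid.roi_aabb,
            grid.binary,  # (128, 128, 128) boolean occupancy values
            grid.contraction_type,
        )
        print(occs.shape)  # torch.Size([100])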
"""
assert samples.dim() == 2 and samples.size(-1) == 3
assert grid_roi.dim() == 1 and grid_roi.size(0) == 6
assert grid_values.dim() == 3
assert isinstance(grid_type, ContractionType)
return _C.grid_query(
samples.contiguous(),
grid_roi.contiguous(),
grid_values.contiguous(),
grid_type.to_cpp_version(),
)
class Grid(nn.Module):
"""An abstract Grid class.
......@@ -242,6 +276,23 @@ class OccupancyGrid(Grid):
warmup_steps=warmup_steps,
)
@torch.no_grad()
def query_occ(self, samples: torch.Tensor) -> torch.Tensor:
"""Query the occupancy field at the given samples.
Args:
samples: Samples in the world coordinates. (n_samples, 3)
Returns:
Occupancy values at the given samples. (n_samples,)
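Examples:

    A minimal sketch, assuming a CUDA device:

    .. code-block:: python

        grid = OccupancyGrid(roi_aabb=[0, 0, 0, 1, 1, 1], resolution=128).to("cuda:0")
        samples = torch.rand((100, 3), device="cuda:0")
        occs = grid.query_occ(samples)
        print(occs.shape)  # torch.Size([100])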
"""
return query_grid(
samples,
self._roi_aabb,
self.occs.reshape(self.resolution.tolist()),
self.contraction_type,
)
def _meshgrid3d(
res: torch.Tensor, device: Union[torch.device, str] = "cpu"
......
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""
from typing import Optional, Tuple
import torch
from torch import Tensor
......@@ -8,8 +9,42 @@ from torch import Tensor
import nerfacc.cuda as _C
def pack_data(data: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
"""Pack per-ray data (n_rays, n_samples, D) to (all_samples, D) based on mask.
Args:
data: Tensor with shape (n_rays, n_samples, D).
mask: Boolean tensor with shape (n_rays, n_samples).
Returns:
Tuple of Tensors: the packed data with shape (all_samples, D), and \
packed_info with shape (n_rays, 2), which stores, for each ray, the index \
of its first sample and the number of samples kept.
Examples:
.. code-block:: python
data = torch.rand((10, 3, 4), device="cuda:0")
mask = torch.rand((10, 3), device="cuda:0") > 0.5
packed_data, packed_info = pack_data(data, mask)
print(packed_data.shape, packed_info.shape)
"""
assert data.dim() == 3, "data must be with shape of (n_rays, n_samples, D)."
assert (
mask.shape == data.shape[:2]
), "mask must be with shape of (n_rays, n_samples)."
assert mask.dtype == torch.bool, "mask must be a boolean tensor."
packed_data = data[mask]
num_steps = mask.long().sum(dim=-1)
cum_steps = num_steps.cumsum(dim=0, dtype=torch.long)
packed_info = torch.stack([cum_steps - num_steps, num_steps], dim=-1)
return packed_data, packed_info
@torch.no_grad()
def unpack_to_ray_indices(packed_info: Tensor) -> Tensor:
def unpack_info(packed_info: Tensor) -> Tensor:
"""Unpack `packed_info` to `ray_indices`. Useful for converting per ray data to per sample data.
Note:
......@@ -36,13 +71,85 @@ def unpack_to_ray_indices(packed_info: Tensor) -> Tensor:
# torch.Size([128, 2]) torch.Size([115200, 1]) torch.Size([115200, 1])
print(packed_info.shape, t_starts.shape, t_ends.shape)
# Unpack per-ray info to per-sample info.
ray_indices = unpack_to_ray_indices(packed_info)
ray_indices = unpack_info(packed_info)
# torch.Size([115200]) torch.int64
print(ray_indices.shape, ray_indices.dtype)
"""
assert (
packed_info.dim() == 2 and packed_info.shape[-1] == 2
), "packed_info must be a 2D tensor with shape (n_rays, 2)."
if packed_info.is_cuda:
ray_indices = _C.unpack_to_ray_indices(packed_info.contiguous().int())
ray_indices = _C.unpack_info(packed_info.contiguous().int())
else:
raise NotImplementedError("Only support cuda inputs.")
return ray_indices.long()
def unpack_data(
packed_info: Tensor,
data: Tensor,
n_samples: Optional[int] = None,
) -> Tensor:
"""Unpack packed data (all_samples, D) to per-ray data (n_rays, n_samples, D).
Args:
packed_info (Tensor): Stores information on which samples belong to the same ray. \
See :func:`nerfacc.ray_marching` for details. Tensor with shape (n_rays, 2).
data: Packed data to unpack. Tensor with shape (n_samples, D).
n_samples (int): Optional number of samples per ray. If not provided, it \
is inferred from `packed_info` as the maximum number of samples per ray.
Returns:
Unpacked data (n_rays, n_samples, D).
Examples:
.. code-block:: python
rays_o = torch.rand((128, 3), device="cuda:0")
rays_d = torch.randn((128, 3), device="cuda:0")
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
# Ray marching with aabb.
scene_aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device="cuda:0")
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, scene_aabb=scene_aabb, render_step_size=1e-2
)
print(t_starts.shape) # torch.Size([all_samples, 1])
t_starts = unpack_data(packed_info, t_starts, n_samples=1024)
print(t_starts.shape) # torch.Size([128, 1024, 1])
"""
assert (
packed_info.dim() == 2 and packed_info.shape[-1] == 2
), "packed_info must be a 2D tensor with shape (n_rays, 2)."
assert (
data.dim() == 2
), "data must be a 2D tensor with shape (n_samples, D)."
if n_samples is None:
n_samples = packed_info[:, 1].max().item()
return _UnpackData.apply(packed_info, data, n_samples)
class _UnpackData(torch.autograd.Function):
"""Unpack packed data (all_samples, D) to per-ray data (n_rays, n_samples, D)."""
@staticmethod
def forward(ctx, packed_info: Tensor, data: Tensor, n_samples: int):
# shape of the data should be (all_samples, D)
packed_info = packed_info.contiguous().int()
data = data.contiguous()
if ctx.needs_input_grad[1]:
ctx.save_for_backward(packed_info)
ctx.n_samples = n_samples
return _C.unpack_data(packed_info, data, n_samples)
@staticmethod
def backward(ctx, grad: Tensor):
# shape of the grad should be (n_rays, n_samples, D)
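# Unpacking is a pure gather: each packed sample is copied to exactly
# one slot of the padded (n_rays, n_samples, D) layout, so its gradient
# is the grad at that slot; zero-padded slots contribute nothing.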
packed_info = ctx.saved_tensors[0]
n_samples = ctx.n_samples
mask = _C.unpack_info_to_mask(packed_info, n_samples)
packed_grad = grad[mask].contiguous()
return None, packed_grad, None
......@@ -7,7 +7,7 @@ import nerfacc.cuda as _C
from .contraction import ContractionType
from .grid import Grid
from .intersection import ray_aabb_intersect
from .pack import unpack_to_ray_indices
from .pack import unpack_info
from .vol_rendering import render_visibility
......@@ -87,7 +87,7 @@ def ray_marching(
.. code-block:: python
import torch
from nerfacc import OccupancyGrid, ray_marching, unpack_to_ray_indices
from nerfacc import OccupancyGrid, ray_marching, unpack_info
device = "cuda:0"
batch_size = 128
......@@ -121,7 +121,7 @@ def ray_marching(
)
# Convert t_starts and t_ends to sample locations.
ray_indices = unpack_to_ray_indices(packed_info)
ray_indices = unpack_info(packed_info)
t_mid = (t_starts + t_ends) / 2.0
sample_locs = rays_o[ray_indices] + t_mid * rays_d[ray_indices]
......@@ -186,7 +186,7 @@ def ray_marching(
# skip invisible space
if sigma_fn is not None:
# Query sigma without gradients
ray_indices = unpack_to_ray_indices(packed_info)
ray_indices = unpack_info(packed_info)
sigmas = sigma_fn(t_starts, t_ends, ray_indices)
assert (
sigmas.shape == t_starts.shape
......
......@@ -9,7 +9,7 @@ from torch import Tensor
import nerfacc.cuda as _C
from .pack import unpack_to_ray_indices
from .pack import unpack_info
def rendering(
......@@ -77,7 +77,7 @@ def rendering(
"""
n_rays = packed_info.shape[0]
ray_indices = unpack_to_ray_indices(packed_info)
ray_indices = unpack_info(packed_info)
# Query sigma and color with gradients
rgbs, sigmas = rgb_sigma_fn(t_starts, t_ends, ray_indices)
......@@ -129,7 +129,7 @@ def accumulate_along_rays(
weights: Volumetric rendering weights for those samples. Tensor with shape \
(n_samples,).
ray_indices: Ray index of each sample. IntTensor with shape (n_samples). \
It can be obtained from `unpack_to_ray_indices(packed_info)`.
It can be obtained from `unpack_info(packed_info)`.
values: The values to be accumulated. Tensor with shape (n_samples, D). If \
None, the accumulated values are just weights. Default is None.
n_rays: Total number of rays. This will decide the shape of the outputs. If \
......
......@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "nerfacc"
version = "0.1.6"
version = "0.1.7"
description = "A General NeRF Acceleration Toolbox."
readme = "README.md"
authors = [{name = "Ruilong", email = "ruilongli94@gmail.com"}]
......
......@@ -9,7 +9,7 @@ device = "cuda:0"
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def occ_eval_fn(x: torch.Tensor) -> torch.Tensor:
"""Pesudo occupancy function: (N, 3) -> (N, 1)."""
return torch.rand_like(x[:, :1])
return ((x - 0.5).norm(dim=-1, keepdim=True) < 0.5).float()
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
......@@ -21,5 +21,16 @@ def test_occ_grid():
assert occ_grid.binary.shape == (128, 128, 128)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_query_grid():
roi_aabb = [0, 0, 0, 1, 1, 1]
occ_grid = OccupancyGrid(roi_aabb=roi_aabb, resolution=128).to(device)
occ_grid.every_n_step(0, occ_eval_fn, occ_thre=0.1)
samples = torch.rand((100, 3), device=device)
occs = occ_grid.query_occ(samples)
assert occs.shape == (100,)
if __name__ == "__main__":
test_occ_grid()
test_query_grid()
import pytest
import torch
from nerfacc import unpack_to_ray_indices
from nerfacc import pack_data, unpack_data, unpack_info
device = "cuda:0"
batch_size = 32
eps = 1e-6
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_pack_data():
n_rays = 2
n_samples = 3
data = torch.rand((n_rays, n_samples, 2), device=device, requires_grad=True)
mask = torch.rand((n_rays, n_samples), device=device) > 0.5
packed_data, packed_info = pack_data(data, mask)
unpacked_data = unpack_data(packed_info, packed_data, n_samples)
unpacked_data.sum().backward()
assert (data.grad[mask] == 1).all()
assert torch.allclose(
unpacked_data.sum(dim=1), (data * mask[..., None]).sum(dim=1)
)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_unpack_info():
packed_info = torch.tensor(
......@@ -16,9 +31,10 @@ def test_unpack_info():
ray_indices_tgt = torch.tensor(
[0, 2, 2, 2, 2], dtype=torch.int64, device=device
)
ray_indices = unpack_to_ray_indices(packed_info)
ray_indices = unpack_info(packed_info)
assert torch.allclose(ray_indices, ray_indices_tgt)
if __name__ == "__main__":
test_pack_data()
test_unpack_info()