inference first for vol render

64ad63cd · Ruilong Li · 16324602 · 64ad63cd · 64ad63cd · 64ad63cd
Commit 64ad63cd authored Sep 11, 2022 by Ruilong Li
7 changed files
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ python examples/trainval.py
 | - | - | - | - ｜
 | Time | 377s | 357s ｜ 354s |
 | PSNR | 36.08 | 36.58 ｜ 29.63 |
-
+35.37?

 Tested with the default settings on the Lego test set.

@@ -23,10 +23,7 @@ Tested with the default settings on the Lego test set.
 | instant-ngp (paper)            | trainval?            | 36.39  |  -   | -    | 3090    |
 | instant-ngp (code)             | train (35k steps)    | 36.08  |  308 sec  | 55.32 fps  | TITAN RTX  |  1734MB |
 | torch-ngp (`-O`)               | train (30K steps)    | 34.15  |  310 sec  | 7.8 fps    | V100 |
-| ours                           | train (30K steps)    | 34.40  |  296 sec  | 6.2 fps    | TITAN RTX  |
-| ours                           | trainval (30K steps) | 35.42  |  291 sec  | 6.4 fps    | TITAN RTX  |
-| ours (2**16 samples w preload) | trainval (35K steps) | 36.18  |  385 sec  | 8.3 fps    | TITAN RTX  |
-| ours (2**16 samples w preload) | train (35K steps) | 35.03  |  383 sec  | 8.0 fps    | TITAN RTX  |
+| ours                           | trainval (35K steps) | 36.08  |  343 sec  | 9.6 fps    | TITAN RTX  |

 ## Tips:


--- a/examples/radiance_fields/ngp.py
+++ b/examples/radiance_fields/ngp.py
@@ -58,9 +58,16 @@ class NGPradianceField(BaseRadianceField):
            self.direction_encoding = tcnn.Encoding(
                n_input_dims=num_dim,
                encoding_config={
+                    "otype": "Composite",
+                    "nested": [
+                        {
+                            "n_dims_to_encode": 3,
                            "otype": "SphericalHarmonics",
                            "degree": 4,
                        },
+                        # {"otype": "Identity", "n_bins": 4, "degree": 4},
+                    ],
+                },
            )

        self.mlp_base = tcnn.NetworkWithInputEncoding(
@@ -134,6 +141,7 @@ class NGPradianceField(BaseRadianceField):
        positions: torch.Tensor,
        directions: torch.Tensor = None,
        mask: torch.Tensor = None,
+        only_density: bool = False,
    ):
        if self.use_viewdirs and (directions is not None):
            assert (
@@ -143,12 +151,18 @@ class NGPradianceField(BaseRadianceField):
            density = torch.zeros_like(positions[..., :1])
            rgb = torch.zeros(list(positions.shape[:-1]) + [3], device=positions.device)
            density[mask], embedding = self.query_density(positions[mask])
+            if only_density:
+                return density
+
            rgb[mask] = self.query_rgb(
                directions[mask] if directions is not None else None,
                embedding=embedding,
            )
        else:
            density, embedding = self.query_density(positions, return_feat=True)
+            if only_density:
+                return density
+
            rgb = self._query_rgb(directions, embedding=embedding)

        return rgb, density
--- a/examples/trainval.py
+++ b/examples/trainval.py
@@ -12,6 +12,27 @@ from nerfacc import OccupancyField, volumetric_rendering

 TARGET_SAMPLE_BATCH_SIZE = 1 << 16

+# import tqdm
+
+# device = "cuda:0"
+# radiance_field = NGPradianceField(aabb=[0, 0, 0, 1, 1, 1]).to(device)
+# positions = torch.rand((TARGET_SAMPLE_BATCH_SIZE, 3), device=device)
+# directions = torch.rand(positions.shape, device=device)
+# optimizer = torch.optim.Adam(
+#     radiance_field.parameters(),
+#     lr=1e-10,
+#     # betas=(0.9, 0.99),
+#     eps=1e-15,
+#     # weight_decay=1e-6,
+# )
+# for _ in tqdm.tqdm(range(1000)):
+#     rgbs, sigmas = radiance_field(positions, directions)
+#     loss = rgbs.mean()
+#     optimizer.zero_grad()
+#     loss.backward()
+#     optimizer.step()
+# exit()
+

 def render_image(radiance_field, rays, render_bkgd, render_step_size):
    """Render the pixels of an image.

--- a/nerfacc/cuda/__init__.py
+++ b/nerfacc/cuda/__init__.py
@@ -7,6 +7,7 @@ ray_aabb_intersect = _C.ray_aabb_intersect
 ray_marching = _C.ray_marching
 volumetric_rendering_forward = _C.volumetric_rendering_forward
 volumetric_rendering_backward = _C.volumetric_rendering_backward
+volumetric_rendering_inference = _C.volumetric_rendering_inference


 class VolumeRenderer(torch.autograd.Function):

--- a/nerfacc/cuda/csrc/pybind.cu
+++ b/nerfacc/cuda/csrc/pybind.cu
@@ -24,6 +24,13 @@ std::vector<torch::Tensor> ray_marching(
    const float dt
 );

+std::vector<torch::Tensor> volumetric_rendering_inference(  // defined in vol_rendering.cu
+    torch::Tensor packed_info,  // 2-D int tensor with 3 columns, one row per ray
+    torch::Tensor starts,  // 2-D tensor with 1 column: sample start t
+    torch::Tensor ends,  // 2-D tensor with 1 column: sample end t
+    torch::Tensor sigmas  // 2-D tensor with 1 column: density after activation
+);
+
 std::vector<torch::Tensor> volumetric_rendering_forward(
    torch::Tensor packed_info, 
    torch::Tensor starts, 
@@ -51,6 +58,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
    m.def("ray_aabb_intersect", &ray_aabb_intersect);
    m.def("ray_marching", &ray_marching);
+    m.def("volumetric_rendering_inference", &volumetric_rendering_inference);
    m.def("volumetric_rendering_forward", &volumetric_rendering_forward);
    m.def("volumetric_rendering_backward", &volumetric_rendering_backward);
 }
\ No newline at end of file
--- a/nerfacc/cuda/csrc/vol_rendering.cu
+++ b/nerfacc/cuda/csrc/vol_rendering.cu
 #include "include/helpers_cuda.h"


+template <typename scalar_t>
+__global__ void volumetric_rendering_inference_kernel(
+    const uint32_t n_rays,
+    const int* packed_info,  // input: 3 ints per ray (ray idx, point idx start, number of points)
+    const scalar_t* starts,  // input start t
+    const scalar_t* ends,  // input end t
+    const scalar_t* sigmas,  // input density after activation
+    int* compact_packed_info,  // output: 3 ints per ray, should be all zero initialized
+    int* compact_selector,  // output: original index of each kept sample; the host pre-fills it with -1
+    // writable helpers
+    int* steps_counter  // running count of kept samples over all rays; the host zero-initializes it
+) {
+    CUDA_GET_THREAD_ID(thread_id, n_rays);
+    // Each thread handles one packed_info row and keeps only the leading samples visited before T < EPSILON.
+    // locate
+    const int i = packed_info[thread_id * 3 + 0];  // ray idx in {rays_o, rays_d}
+    const int base = packed_info[thread_id * 3 + 1];  // point idx start.
+    const int numsteps = packed_info[thread_id * 3 + 2];  // point idx shift.
+    if (numsteps == 0) return;  // no samples: this ray's output row stays as initialized
+
+    starts += base;
+    ends += base;
+    sigmas += base;
+
+    // march along the ray, stopping once the transmittance T has dropped below EPSILON
+    scalar_t T = 1.f;
+    scalar_t EPSILON = 1e-4f;
+    int j = 0;
+    for (; j < numsteps; ++j) {
+        if (T < EPSILON) {
+            break;
+        }
+        const scalar_t delta = ends[j] - starts[j];
+        const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
+        // only T is tracked here; the per-sample weight (alpha * T) is not needed.
+        T *= (1.f - alpha);
+    }
+    // reserve j consecutive slots of compact_selector for the samples kept on this ray
+    int compact_base = atomicAdd(steps_counter, j);
+
+    compact_selector += compact_base;
+    for (int k = 0; k < j; ++k) {
+        compact_selector[k] = base + k;
+    }
+
+    compact_packed_info += thread_id * 3;
+    compact_packed_info[0] = i; // ray idx in {rays_o, rays_d}
+    compact_packed_info[1] = compact_base; // compact point idx start.
+    compact_packed_info[2] = j;  // compact point idx shift.
+}
+
+
 template <typename scalar_t>
 __global__ void volumetric_rendering_forward_kernel(
    const uint32_t n_rays,
@@ -137,6 +189,57 @@ __global__ void volumetric_rendering_backward_kernel(
    }
 }

+// Host entry point: launches the inference kernel that keeps, per ray, the samples visited before T < EPSILON.
+std::vector<torch::Tensor> volumetric_rendering_inference(
+    torch::Tensor packed_info,  // (n_rays, 3) int tensor
+    torch::Tensor starts,  // 2-D, one column; read with the dtype of sigmas
+    torch::Tensor ends,  // 2-D, one column; read with the dtype of sigmas
+    torch::Tensor sigmas  // (n_samples, 1); its dtype selects the kernel instantiation
+) {
+    DEVICE_GUARD(packed_info);
+    CHECK_INPUT(packed_info);
+    CHECK_INPUT(starts);
+    CHECK_INPUT(ends);
+    CHECK_INPUT(sigmas);
+    TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 3);
+    TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
+    TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
+    TORCH_CHECK(sigmas.ndimension() == 2 && sigmas.size(1) == 1);
+
+    const uint32_t n_rays = packed_info.size(0);
+    const uint32_t n_samples = sigmas.size(0);
+
+    const int threads = 256;
+    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
+
+    // helper counter: the kernel atomically accumulates the number of kept samples into it
+    torch::Tensor steps_counter = torch::zeros(
+        {1}, packed_info.options().dtype(torch::kInt32));
+
+    // outputs: compact_selector starts at -1 so slots the kernel never writes can be filtered out
+    torch::Tensor compact_packed_info = torch::zeros({n_rays, 3}, packed_info.options());
+    torch::Tensor compact_selector = - torch::ones({n_samples}, packed_info.options());
+
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+        sigmas.scalar_type(),
+        "volumetric_rendering_inference",
+        ([&]
+         { volumetric_rendering_inference_kernel<scalar_t><<<blocks, threads>>>(
+                n_rays,
+                packed_info.data_ptr<int>(), 
+                starts.data_ptr<scalar_t>(),
+                ends.data_ptr<scalar_t>(),
+                sigmas.data_ptr<scalar_t>(),
+                compact_packed_info.data_ptr<int>(),
+                compact_selector.data_ptr<int>(),
+                steps_counter.data_ptr<int>()
+            ); 
+        }));
+    // returns {compact_packed_info (n_rays, 3), compact_selector (n_samples), steps_counter (1)}
+    return {compact_packed_info, compact_selector, steps_counter};
+}
+
+
 /**
 * @brief Volumetric Rendering: Accumulating samples in the forward pass.
 *  The inputs, excepct for `sigmas` and `rgbs`, are the outputs of our

--- a/nerfacc/volumetric_rendering.py
+++ b/nerfacc/volumetric_rendering.py
@@ -3,7 +3,12 @@ from typing import Callable, Tuple

 import torch

-from .cuda import VolumeRenderer, ray_aabb_intersect, ray_marching
+from .cuda import (
+    VolumeRenderer,
+    ray_aabb_intersect,
+    ray_marching,
+    volumetric_rendering_inference,
+)


 def volumetric_rendering(
@@ -69,6 +74,7 @@ def volumetric_rendering(

        # squeeze valid samples
        total_samples = max(packed_info[:, -1].sum(), 1)
+        total_samples = int(math.ceil(total_samples / 256.0)) * 256
        frustum_origins = frustum_origins[:total_samples]
        frustum_dirs = frustum_dirs[:total_samples]
        frustum_starts = frustum_starts[:total_samples]
@@ -78,8 +84,36 @@ def volumetric_rendering(
            frustum_origins + frustum_dirs * (frustum_starts + frustum_ends) / 2.0
        )

-    query_results = query_fn(frustum_positions, frustum_dirs, **kwargs)
-    rgbs, densities = query_results[0], query_results[1]
+    with torch.no_grad():
+        densities = query_fn(
+            frustum_positions, frustum_dirs, only_density=True, **kwargs
+        )
+        (
+            compact_packed_info,
+            compact_selector,
+            compact_steps_counter,
+        ) = volumetric_rendering_inference(
+            packed_info.contiguous(),
+            frustum_starts.contiguous(),
+            frustum_ends.contiguous(),
+            densities.contiguous(),
+        )
+        compact_selector = compact_selector[compact_selector >= 0].long()
+        compact_pad = int(math.ceil(len(compact_selector) / 256.0)) * 256 - len(
+            compact_selector
+        )
+        compact_selector = torch.nn.functional.pad(compact_selector, (0, compact_pad))
+        compact_frustum_positions = frustum_positions[compact_selector]
+        compact_frustum_dirs = frustum_dirs[compact_selector]
+        compact_frustum_starts = frustum_starts[compact_selector]
+        compact_frustum_ends = frustum_ends[compact_selector]
+        # print(compact_selector.float().mean(), compact_steps_counter, steps_counter)
+
+    compact_query_results = query_fn(
+        compact_frustum_positions, compact_frustum_dirs, **kwargs
+    )
+    compact_rgbs, compact_densities = compact_query_results[0], compact_query_results[1]
+
    (
        accumulated_weight,
        accumulated_depth,
@@ -87,13 +121,30 @@ def volumetric_rendering(
        alive_ray_mask,
        compact_steps_counter,
    ) = VolumeRenderer.apply(
-        packed_info,
-        frustum_starts,
-        frustum_ends,
-        densities.contiguous(),
-        rgbs.contiguous(),
+        compact_packed_info.contiguous(),
+        compact_frustum_starts.contiguous(),
+        compact_frustum_ends.contiguous(),
+        compact_densities.contiguous(),
+        compact_rgbs.contiguous(),
    )

+    # query_results = query_fn(frustum_positions, frustum_dirs, **kwargs)
+    # rgbs, densities = query_results[0], query_results[1]
+
+    # (
+    #     accumulated_weight,
+    #     accumulated_depth,
+    #     accumulated_color,
+    #     alive_ray_mask,
+    #     compact_steps_counter,
+    # ) = VolumeRenderer.apply(
+    #     packed_info.contiguous(),
+    #     frustum_starts.contiguous(),
+    #     frustum_ends.contiguous(),
+    #     densities.contiguous(),
+    #     rgbs.contiguous(),
+    # )
+
    accumulated_depth = torch.clip(accumulated_depth, t_min[:, None], t_max[:, None])
    accumulated_color = accumulated_color + render_bkgd * (1.0 - accumulated_weight)