Unverified Commit 674424ec authored by Ruilong Li (李瑞龙), committed by GitHub

Cub (#103)

- Faster rendering function via nvidia-cub, shipped with CUDA >= 11.0 (our use requires >= 11.6). ~10% speedup.
- Expose transmittance computation.
parent bca2d4dc
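For orientation, here is a minimal pure-PyTorch sketch of the quantity the newly exposed transmittance helpers compute: for each sample i along a ray, T_i = exp(-sum_{j<i} sigma_j * (t_end_j - t_start_j)). The function and variable names below are illustrative only; the actual entry points added by this commit are render_transmittance_from_density and render_transmittance_from_alpha (see the diff below).

```python
import torch

def transmittance_from_density_ref(packed_info, t_starts, t_ends, sigmas):
    # Exclusive cumulative sum of sigma * dt within each ray, then exp(-x).
    transmittance = torch.empty_like(sigmas)
    for start, steps in packed_info.tolist():
        sl = slice(start, start + steps)
        sdt = sigmas[sl, 0] * (t_ends[sl, 0] - t_starts[sl, 0])
        excl_cumsum = torch.cumsum(sdt, dim=0) - sdt  # exclusive scan
        transmittance[sl, 0] = torch.exp(-excl_cumsum)
    return transmittance

# Toy packed buffer: two rays with 3 and 2 samples.
packed_info = torch.tensor([[0, 3], [3, 2]])
t_starts = torch.tensor([[0.0], [0.1], [0.2], [0.0], [0.1]])
t_ends = t_starts + 0.1
sigmas = torch.rand_like(t_starts)
print(transmittance_from_density_ref(packed_info, t_starts, t_ends, sigmas))
```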
......@@ -87,16 +87,18 @@ def rgb_sigma_fn(
return rgbs, sigmas # (n_samples, 3), (n_samples, 1)
# Efficient Raymarching: Skip empty and occluded space, pack samples from all rays.
# packed_info: (n_rays, 2). t_starts: (n_samples, 1). t_ends: (n_samples, 1).
# ray_indices: (n_samples,). t_starts: (n_samples, 1). t_ends: (n_samples, 1).
with torch.no_grad():
packed_info, t_starts, t_ends = nerfacc.ray_marching(
ray_indices, t_starts, t_ends = nerfacc.ray_marching(
rays_o, rays_d, sigma_fn=sigma_fn, near_plane=0.2, far_plane=1.0,
early_stop_eps=1e-4, alpha_thre=1e-2,
)
# Differentiable Volumetric Rendering.
# colors: (n_rays, 3). opacity: (n_rays, 1). depth: (n_rays, 1).
color, opacity, depth = nerfacc.rendering(rgb_sigma_fn, packed_info, t_starts, t_ends)
color, opacity, depth = nerfacc.rendering(
t_starts, t_ends, ray_indices, n_rays=rays_o.shape[0], rgb_sigma_fn=rgb_sigma_fn
)
# Optimize: Both the network and rays will receive gradients
optimizer.zero_grad()
......
nerfacc.render\_transmittance\_from\_alpha
==========================================
.. currentmodule:: nerfacc
.. autofunction:: render_transmittance_from_alpha
\ No newline at end of file
nerfacc.render\_transmittance\_from\_density
============================================
.. currentmodule:: nerfacc
.. autofunction:: render_transmittance_from_density
\ No newline at end of file
......@@ -11,6 +11,8 @@ Utils
unpack_info
accumulate_along_rays
render_transmittance_from_density
render_transmittance_from_alpha
render_weight_from_density
render_weight_from_alpha
render_visibility
......
......@@ -90,16 +90,18 @@ A simple example is like this:
return rgbs, sigmas # (n_samples, 3), (n_samples, 1)
# Efficient Raymarching: Skip empty and occluded space, pack samples from all rays.
# packed_info: (n_rays, 2). t_starts: (n_samples, 1). t_ends: (n_samples, 1).
# ray_indices: (n_samples,). t_starts: (n_samples, 1). t_ends: (n_samples, 1).
with torch.no_grad():
packed_info, t_starts, t_ends = nerfacc.ray_marching(
ray_indices, t_starts, t_ends = nerfacc.ray_marching(
rays_o, rays_d, sigma_fn=sigma_fn, near_plane=0.2, far_plane=1.0,
early_stop_eps=1e-4, alpha_thre=1e-2,
)
# Differentiable Volumetric Rendering.
# colors: (n_rays, 3). opacity: (n_rays, 1). depth: (n_rays, 1).
color, opacity, depth = nerfacc.rendering(rgb_sigma_fn, packed_info, t_starts, t_ends)
color, opacity, depth = nerfacc.rendering(
t_starts, t_ends, ray_indices, n_rays=rays_o.shape[0], rgb_sigma_fn=rgb_sigma_fn
)
# Optimize: Both the network and rays will receive gradients
optimizer.zero_grad()
......
......@@ -85,7 +85,7 @@ def render_image(
)
for i in range(0, num_rays, chunk):
chunk_rays = namedtuple_map(lambda r: r[i : i + chunk], rays)
packed_info, t_starts, t_ends = ray_marching(
ray_indices, t_starts, t_ends = ray_marching(
chunk_rays.origins,
chunk_rays.viewdirs,
scene_aabb=scene_aabb,
......@@ -99,9 +99,10 @@ def render_image(
alpha_thre=alpha_thre,
)
rgb, opacity, depth = rendering(
packed_info,
t_starts,
t_ends,
ray_indices,
n_rays=chunk_rays.origins.shape[0],
rgb_sigma_fn=rgb_sigma_fn,
render_bkgd=render_bkgd,
)
......
......@@ -8,11 +8,13 @@ from .contraction import ContractionType, contract, contract_inv
from .grid import Grid, OccupancyGrid, query_grid
from .intersection import ray_aabb_intersect
from .losses import distortion as loss_distortion
from .pack import pack_data, unpack_data, unpack_info
from .pack import pack_data, pack_info, unpack_data, unpack_info
from .ray_marching import ray_marching
from .version import __version__
from .vol_rendering import (
accumulate_along_rays,
render_transmittance_from_alpha,
render_transmittance_from_density,
render_visibility,
render_weight_from_alpha,
render_weight_from_density,
......@@ -48,7 +50,10 @@ __all__ = [
"pack_data",
"unpack_data",
"unpack_info",
"pack_info",
"ray_resampling",
"loss_distortion",
"unpack_to_ray_indices",
"render_transmittance_from_density",
"render_transmittance_from_alpha",
]
......@@ -25,10 +25,45 @@ ray_aabb_intersect = _make_lazy_cuda_func("ray_aabb_intersect")
ray_marching = _make_lazy_cuda_func("ray_marching")
ray_resampling = _make_lazy_cuda_func("ray_resampling")
rendering_forward = _make_lazy_cuda_func("rendering_forward")
rendering_backward = _make_lazy_cuda_func("rendering_backward")
rendering_alphas_forward = _make_lazy_cuda_func("rendering_alphas_forward")
rendering_alphas_backward = _make_lazy_cuda_func("rendering_alphas_backward")
is_cub_available = _make_lazy_cuda_func("is_cub_available")
transmittance_from_sigma_forward_cub = _make_lazy_cuda_func(
"transmittance_from_sigma_forward_cub"
)
transmittance_from_sigma_backward_cub = _make_lazy_cuda_func(
"transmittance_from_sigma_backward_cub"
)
transmittance_from_alpha_forward_cub = _make_lazy_cuda_func(
"transmittance_from_alpha_forward_cub"
)
transmittance_from_alpha_backward_cub = _make_lazy_cuda_func(
"transmittance_from_alpha_backward_cub"
)
transmittance_from_sigma_forward_naive = _make_lazy_cuda_func(
"transmittance_from_sigma_forward_naive"
)
transmittance_from_sigma_backward_naive = _make_lazy_cuda_func(
"transmittance_from_sigma_backward_naive"
)
transmittance_from_alpha_forward_naive = _make_lazy_cuda_func(
"transmittance_from_alpha_forward_naive"
)
transmittance_from_alpha_backward_naive = _make_lazy_cuda_func(
"transmittance_from_alpha_backward_naive"
)
weight_from_sigma_forward_naive = _make_lazy_cuda_func(
"weight_from_sigma_forward_naive"
)
weight_from_sigma_backward_naive = _make_lazy_cuda_func(
"weight_from_sigma_backward_naive"
)
weight_from_alpha_forward_naive = _make_lazy_cuda_func(
"weight_from_alpha_forward_naive"
)
weight_from_alpha_backward_naive = _make_lazy_cuda_func(
"weight_from_alpha_backward_naive"
)
unpack_data = _make_lazy_cuda_func("unpack_data")
unpack_info = _make_lazy_cuda_func("unpack_info")
......
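The bindings above register both a cub path (parallel across samples, keyed by ray_indices) and a naive path (parallel across rays, keyed by packed_info). The Python-side dispatch is not part of this diff; a hypothetical sketch of how a wrapper might choose between them is shown below. The function and argument order follow the C++ declarations further down; the dispatch logic itself is an assumption, not the library's actual code.

```python
import nerfacc.cuda as _C

def _transmittance_from_density(t_starts, t_ends, sigmas, packed_info=None, ray_indices=None):
    # Hypothetical dispatcher: prefer the sample-parallel cub kernels when CUB >= 1.15.0
    # was available at build time and ray_indices is provided (int32, sorted by ray).
    if ray_indices is not None and _C.is_cub_available():
        return _C.transmittance_from_sigma_forward_cub(
            ray_indices.contiguous(), t_starts, t_ends, sigmas
        )
    # Fall back to the ray-parallel naive kernel, which needs per-ray offsets.
    return _C.transmittance_from_sigma_forward_naive(
        packed_info.contiguous(), t_starts, t_ends, sigmas
    )
```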
......@@ -3,9 +3,14 @@ Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""
import glob
import json
import os
import shutil
import urllib.request
import zipfile
from subprocess import DEVNULL, call
from packaging import version
from rich.console import Console
from torch.utils.cpp_extension import _get_build_directory, load
......@@ -21,32 +26,70 @@ def cuda_toolkit_available():
return False
def load_extention(name: str):
return load(
name=name,
sources=glob.glob(os.path.join(PATH, "csrc/*.cu")),
extra_cflags=["-O3"],
extra_cuda_cflags=["-O3"],
)
def cuda_toolkit_version():
"""Get the cuda toolkit version."""
cuda_home = os.path.join(os.path.dirname(shutil.which("nvcc")), "..")
if os.path.exists(os.path.join(cuda_home, "version.txt")):
with open(os.path.join(cuda_home, "version.txt")) as f:
cuda_version = f.read().strip().split()[-1]
elif os.path.exists(os.path.join(cuda_home, "version.json")):
with open(os.path.join(cuda_home, "version.json")) as f:
cuda_version = json.load(f)["cuda"]["version"]
else:
raise RuntimeError("Cannot find the cuda version.")
return cuda_version
_C = None
name = "nerfacc_cuda"
if os.listdir(_get_build_directory(name, verbose=False)) != []:
# If the build exists, we assume the extension has been built
# and we can load it.
_C = load_extention(name)
else:
# First time to build the extension
if cuda_toolkit_available():
build_dir = _get_build_directory(name, verbose=False)
extra_include_paths = []
extra_cflags = ["-O3"]
extra_cuda_cflags = ["-O3"]
_C = None
if cuda_toolkit_available():
# # We need cub >= 1.15.0, which ships with cuda >= 11.6, so download it if
# # necessary. (Compiling is still not guaranteed to succeed.)
# if version.parse(cuda_toolkit_version()) < version.parse("11.6"):
# target_path = os.path.join(build_dir, "cub-1.17.0")
# if not os.path.exists(target_path):
# zip_path, _ = urllib.request.urlretrieve(
# "https://github.com/NVIDIA/cub/archive/1.17.0.tar.gz",
# os.path.join(build_dir, "cub-1.17.0.tar.gz"),
# )
# shutil.unpack_archive(zip_path, build_dir)
# extra_include_paths.append(target_path)
# extra_cuda_cflags.append("-DTHRUST_IGNORE_CUB_VERSION_CHECK")
# print(
# f"download cub because the cuda version is {cuda_toolkit_version()}"
# )
if os.path.exists(os.path.join(build_dir, f"{name}.so")):
# If the build exists, we assume the extension has been built
# and we can load it.
_C = load(
name=name,
sources=glob.glob(os.path.join(PATH, "csrc/*.cu")),
extra_cflags=extra_cflags,
extra_cuda_cflags=extra_cuda_cflags,
extra_include_paths=extra_include_paths,
)
else:
with Console().status(
"[bold yellow]NerfAcc: Setting up CUDA (This may take a few minutes the first time)",
spinner="bouncingBall",
):
_C = load_extention(name)
else:
Console().print(
"[yellow]NerfAcc: No CUDA toolkit found. NerfAcc will be disabled.[/yellow]"
)
_C = load(
name=name,
sources=glob.glob(os.path.join(PATH, "csrc/*.cu")),
extra_cflags=extra_cflags,
extra_cuda_cflags=extra_cuda_cflags,
extra_include_paths=extra_include_paths,
)
else:
Console().print(
"[yellow]NerfAcc: No CUDA toolkit found. NerfAcc will be disabled.[/yellow]"
)
__all__ = ["_C"]
......@@ -6,6 +6,8 @@
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/cuda/cub_definitions.cuh>
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
......@@ -20,3 +22,13 @@
#define CUDA_N_BLOCKS_NEEDED(Q, CUDA_N_THREADS) ((Q - 1) / CUDA_N_THREADS + 1)
#define DEVICE_GUARD(_ten) \
const at::cuda::OptionalCUDAGuard device_guard(device_of(_ten));
// https://github.com/pytorch/pytorch/blob/233305a852e1cd7f319b15b5137074c9eac455f6/aten/src/ATen/cuda/cub.cuh#L38-L46
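// A note on the pattern: the first func(...) call passes a null temp-storage
// pointer so CUB only reports the scratch size it needs; the second call runs
// the actual algorithm with scratch taken from PyTorch's caching allocator,
// avoiding a raw cudaMalloc/cudaFree on every invocation.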
#define CUB_WRAPPER(func, ...) do { \
size_t temp_storage_bytes = 0; \
func(nullptr, temp_storage_bytes, __VA_ARGS__); \
auto& caching_allocator = *::c10::cuda::CUDACachingAllocator::get(); \
auto temp_storage = caching_allocator.allocate(temp_storage_bytes); \
func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__); \
AT_CUDA_CHECK(cudaGetLastError()); \
} while (false)
\ No newline at end of file
......@@ -81,7 +81,7 @@ __global__ void unpack_data_kernel(
return;
}
torch::Tensor unpack_info(const torch::Tensor packed_info)
torch::Tensor unpack_info(const torch::Tensor packed_info, const int n_samples)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
......@@ -90,7 +90,7 @@ torch::Tensor unpack_info(const torch::Tensor packed_info)
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
int n_samples = packed_info[n_rays - 1].sum(0).item<int>();
// int n_samples = packed_info[n_rays - 1].sum(0).item<int>();
torch::Tensor ray_indices = torch::empty(
{n_samples}, packed_info.options().dtype(torch::kInt32));
......
......@@ -6,24 +6,6 @@
#include "include/helpers_math.h"
#include "include/helpers_contraction.h"
std::vector<torch::Tensor> rendering_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
float early_stop_eps,
float alpha_thre,
bool compression);
torch::Tensor rendering_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
float early_stop_eps,
float alpha_thre);
std::vector<torch::Tensor> ray_aabb_intersect(
const torch::Tensor rays_o,
......@@ -45,7 +27,7 @@ std::vector<torch::Tensor> ray_marching(
const float cone_angle);
torch::Tensor unpack_info(
const torch::Tensor packed_info);
const torch::Tensor packed_info, const int n_samples);
torch::Tensor unpack_info_to_mask(
const torch::Tensor packed_info, const int n_samples);
......@@ -69,32 +51,82 @@ torch::Tensor contract_inv(
const torch::Tensor roi,
const ContractionType type);
torch::Tensor rendering_alphas_backward(
std::vector<torch::Tensor> ray_resampling(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor weights,
torch::Tensor grad_weights,
const int steps);
torch::Tensor unpack_data(
torch::Tensor packed_info,
torch::Tensor data,
int n_samples_per_ray);
// cub implementations: parallel across samples
bool is_cub_available() {
return (bool) CUB_SUPPORTS_SCAN_BY_KEY();
}
torch::Tensor transmittance_from_sigma_forward_cub(
torch::Tensor ray_indices,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas);
torch::Tensor transmittance_from_sigma_backward_cub(
torch::Tensor ray_indices,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor transmittance,
torch::Tensor transmittance_grad);
torch::Tensor transmittance_from_alpha_forward_cub(
torch::Tensor ray_indices, torch::Tensor alphas);
torch::Tensor transmittance_from_alpha_backward_cub(
torch::Tensor ray_indices,
torch::Tensor alphas,
float early_stop_eps,
float alpha_thre);
torch::Tensor transmittance,
torch::Tensor transmittance_grad);
std::vector<torch::Tensor> rendering_alphas_forward(
// naive implementations: parallel across rays
torch::Tensor transmittance_from_sigma_forward_naive(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas);
torch::Tensor transmittance_from_sigma_backward_naive(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor transmittance,
torch::Tensor transmittance_grad);
torch::Tensor transmittance_from_alpha_forward_naive(
torch::Tensor packed_info,
torch::Tensor alphas);
torch::Tensor transmittance_from_alpha_backward_naive(
torch::Tensor packed_info,
torch::Tensor alphas,
float early_stop_eps,
float alpha_thre,
bool compression);
torch::Tensor transmittance,
torch::Tensor transmittance_grad);
std::vector<torch::Tensor> ray_resampling(
torch::Tensor weight_from_sigma_forward_naive(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas);
torch::Tensor weight_from_sigma_backward_naive(
torch::Tensor weights,
const int steps);
torch::Tensor unpack_data(
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor data,
int n_samples_per_ray);
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas);
torch::Tensor weight_from_alpha_forward_naive(
torch::Tensor packed_info,
torch::Tensor alphas);
torch::Tensor weight_from_alpha_backward_naive(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor alphas);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
......@@ -115,10 +147,21 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
m.def("ray_resampling", &ray_resampling);
// rendering
m.def("rendering_forward", &rendering_forward);
m.def("rendering_backward", &rendering_backward);
m.def("rendering_alphas_forward", &rendering_alphas_forward);
m.def("rendering_alphas_backward", &rendering_alphas_backward);
m.def("is_cub_available", is_cub_available);
m.def("transmittance_from_sigma_forward_cub", transmittance_from_sigma_forward_cub);
m.def("transmittance_from_sigma_backward_cub", transmittance_from_sigma_backward_cub);
m.def("transmittance_from_alpha_forward_cub", transmittance_from_alpha_forward_cub);
m.def("transmittance_from_alpha_backward_cub", transmittance_from_alpha_backward_cub);
m.def("transmittance_from_sigma_forward_naive", transmittance_from_sigma_forward_naive);
m.def("transmittance_from_sigma_backward_naive", transmittance_from_sigma_backward_naive);
m.def("transmittance_from_alpha_forward_naive", transmittance_from_alpha_forward_naive);
m.def("transmittance_from_alpha_backward_naive", transmittance_from_alpha_backward_naive);
m.def("weight_from_sigma_forward_naive", weight_from_sigma_forward_naive);
m.def("weight_from_sigma_backward_naive", weight_from_sigma_backward_naive);
m.def("weight_from_alpha_forward_naive", weight_from_alpha_forward_naive);
m.def("weight_from_alpha_backward_naive", weight_from_alpha_backward_naive);
// pack & unpack
m.def("unpack_data", &unpack_data);
......
......@@ -95,6 +95,7 @@ __global__ void ray_marching_kernel(
// first round outputs
int *num_steps,
// second round outputs
int *ray_indices,
float *t_starts,
float *t_ends)
{
......@@ -118,6 +119,7 @@ __global__ void ray_marching_kernel(
int steps = packed_info[i * 2 + 1];
t_starts += base;
t_ends += base;
ray_indices += base;
}
const float3 origin = make_float3(rays_o[0], rays_o[1], rays_o[2]);
......@@ -148,6 +150,7 @@ __global__ void ray_marching_kernel(
{
t_starts[j] = t0;
t_ends[j] = t1;
ray_indices[j] = i;
}
++j;
// march to next sample
......@@ -245,6 +248,7 @@ std::vector<torch::Tensor> ray_marching(
nullptr, /* packed_info */
// outputs
num_steps.data_ptr<int>(),
nullptr, /* ray_indices */
nullptr, /* t_starts */
nullptr /* t_ends */);
......@@ -255,6 +259,7 @@ std::vector<torch::Tensor> ray_marching(
int total_steps = cum_steps[cum_steps.size(0) - 1].item<int>();
torch::Tensor t_starts = torch::empty({total_steps, 1}, rays_o.options());
torch::Tensor t_ends = torch::empty({total_steps, 1}, rays_o.options());
torch::Tensor ray_indices = torch::empty({total_steps}, cum_steps.options());
ray_marching_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
......@@ -274,10 +279,11 @@ std::vector<torch::Tensor> ray_marching(
packed_info.data_ptr<int>(),
// outputs
nullptr, /* num_steps */
ray_indices.data_ptr<int>(),
t_starts.data_ptr<float>(),
t_ends.data_ptr<float>());
return {packed_info, t_starts, t_ends};
return {packed_info, ray_indices, t_starts, t_ends};
}
// ----------------------------------------------------------------------------
......
/*
* Copyright (c) 2022 Ruilong Li, UC Berkeley.
*/
#include "include/helpers_cuda.h"
__global__ void transmittance_from_sigma_forward_kernel(
const uint32_t n_rays,
// inputs
const int *packed_info,
const float *starts,
const float *ends,
const float *sigmas,
// outputs
float *transmittance)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0];
const int steps = packed_info[i * 2 + 1];
if (steps == 0)
return;
starts += base;
ends += base;
sigmas += base;
transmittance += base;
// accumulation
float cumsum = 0.0f;
for (int j = 0; j < steps; ++j)
{
transmittance[j] = __expf(-cumsum);
cumsum += sigmas[j] * (ends[j] - starts[j]);
}
// // another way to impl:
// float T = 1.f;
// for (int j = 0; j < steps; ++j)
// {
// const float delta = ends[j] - starts[j];
// const float alpha = 1.f - __expf(-sigmas[j] * delta);
// transmittance[j] = T;
// T *= (1.f - alpha);
// }
return;
}
__global__ void transmittance_from_sigma_backward_kernel(
const uint32_t n_rays,
// inputs
const int *packed_info,
const float *starts,
const float *ends,
const float *transmittance,
const float *transmittance_grad,
// outputs
float *sigmas_grad)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0];
const int steps = packed_info[i * 2 + 1];
if (steps == 0)
return;
transmittance += base;
transmittance_grad += base;
starts += base;
ends += base;
sigmas_grad += base;
// accumulation
float cumsum = 0.0f;
for (int j = steps - 1; j >= 0; --j)
{
sigmas_grad[j] = cumsum * (ends[j] - starts[j]);
cumsum += -transmittance_grad[j] * transmittance[j];
}
return;
}
__global__ void transmittance_from_alpha_forward_kernel(
const uint32_t n_rays,
// inputs
const int *packed_info,
const float *alphas,
// outputs
float *transmittance)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0];
const int steps = packed_info[i * 2 + 1];
if (steps == 0)
return;
alphas += base;
transmittance += base;
// accumulation
float T = 1.0f;
for (int j = 0; j < steps; ++j)
{
transmittance[j] = T;
T *= (1.0f - alphas[j]);
}
return;
}
__global__ void transmittance_from_alpha_backward_kernel(
const uint32_t n_rays,
// inputs
const int *packed_info,
const float *alphas,
const float *transmittance,
const float *transmittance_grad,
// outputs
float *alphas_grad)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0];
const int steps = packed_info[i * 2 + 1];
if (steps == 0)
return;
alphas += base;
transmittance += base;
transmittance_grad += base;
alphas_grad += base;
// accumulation
float cumsum = 0.0f;
for (int j = steps - 1; j >= 0; --j)
{
alphas_grad[j] = cumsum / fmax(1.0f - alphas[j], 1e-10f);
cumsum += -transmittance_grad[j] * transmittance[j];
}
return;
}
torch::Tensor transmittance_from_sigma_forward_naive(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
const uint32_t n_samples = sigmas.size(0);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor transmittance = torch::empty_like(sigmas);
// parallel across rays
transmittance_from_sigma_forward_kernel<<<
blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<float>(),
ends.data_ptr<float>(),
sigmas.data_ptr<float>(),
// outputs
transmittance.data_ptr<float>());
return transmittance;
}
torch::Tensor transmittance_from_sigma_backward_naive(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor transmittance,
torch::Tensor transmittance_grad)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(transmittance);
CHECK_INPUT(transmittance_grad);
TORCH_CHECK(packed_info.ndimension() == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(transmittance.ndimension() == 2 & transmittance.size(1) == 1);
TORCH_CHECK(transmittance_grad.ndimension() == 2 & transmittance_grad.size(1) == 1);
const uint32_t n_samples = transmittance.size(0);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor sigmas_grad = torch::empty_like(transmittance);
// parallel across rays
transmittance_from_sigma_backward_kernel<<<
blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<float>(),
ends.data_ptr<float>(),
transmittance.data_ptr<float>(),
transmittance_grad.data_ptr<float>(),
// outputs
sigmas_grad.data_ptr<float>());
return sigmas_grad;
}
torch::Tensor transmittance_from_alpha_forward_naive(
torch::Tensor packed_info, torch::Tensor alphas)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(alphas);
TORCH_CHECK(alphas.ndimension() == 2 & alphas.size(1) == 1);
TORCH_CHECK(packed_info.ndimension() == 2);
const uint32_t n_samples = alphas.size(0);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor transmittance = torch::empty_like(alphas);
// parallel across rays
transmittance_from_alpha_forward_kernel<<<
blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
alphas.data_ptr<float>(),
// outputs
transmittance.data_ptr<float>());
return transmittance;
}
torch::Tensor transmittance_from_alpha_backward_naive(
torch::Tensor packed_info,
torch::Tensor alphas,
torch::Tensor transmittance,
torch::Tensor transmittance_grad)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(transmittance);
CHECK_INPUT(transmittance_grad);
TORCH_CHECK(packed_info.ndimension() == 2);
TORCH_CHECK(transmittance.ndimension() == 2 & transmittance.size(1) == 1);
TORCH_CHECK(transmittance_grad.ndimension() == 2 & transmittance_grad.size(1) == 1);
const uint32_t n_samples = transmittance.size(0);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor alphas_grad = torch::empty_like(alphas);
// parallel across rays
transmittance_from_alpha_backward_kernel<<<
blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
alphas.data_ptr<float>(),
transmittance.data_ptr<float>(),
transmittance_grad.data_ptr<float>(),
// outputs
alphas_grad.data_ptr<float>());
return alphas_grad;
}
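As a cross-check of the alpha-path kernels above, here is a small single-ray PyTorch reference: the forward pass is an exclusive cumulative product of (1 - alpha), and the backward pass is the reverse exclusive sum of -grad_T * T divided by (1 - alpha), matching transmittance_from_alpha_forward/backward_kernel. Illustrative only, not part of the library.

```python
import torch

def transmittance_from_alpha_ref(alphas):
    # T_j = prod_{k<j} (1 - alpha_k)  (exclusive cumprod)
    shifted = torch.cat([torch.ones_like(alphas[:1]), 1.0 - alphas[:-1]])
    return torch.cumprod(shifted, dim=0)

def transmittance_from_alpha_grad_ref(alphas, transmittance, transmittance_grad):
    # d alpha_j = (sum_{k>j} -dT_k * T_k) / max(1 - alpha_j, 1e-10)
    s = -transmittance_grad * transmittance
    suffix = torch.flip(torch.cumsum(torch.flip(s, [0]), dim=0), [0]) - s  # sum over k > j
    return suffix / (1.0 - alphas).clamp_min(1e-10)

alphas = torch.rand(5) * 0.9
T = transmittance_from_alpha_ref(alphas)
grad_T = torch.randn(5)
print(transmittance_from_alpha_grad_ref(alphas, T, grad_T))
```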
/*
* Copyright (c) 2022 Ruilong Li, UC Berkeley.
*/
// CUB is supported in CUDA >= 11.0
// ExclusiveScanByKey is supported in CUB >= 1.15.0 (CUDA >= 11.6)
// See: https://github.com/NVIDIA/cub/tree/main#releases
#include "include/helpers_cuda.h"
#if CUB_SUPPORTS_SCAN_BY_KEY()
#include <cub/cub.cuh>
#endif
struct Product
{
template <typename T>
__host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const { return a * b; }
};
#if CUB_SUPPORTS_SCAN_BY_KEY()
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT>
inline void exclusive_sum_by_key(
KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items)
{
TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
"cub ExclusiveSumByKey does not support more than INT_MAX elements");
CUB_WRAPPER(cub::DeviceScan::ExclusiveSumByKey, keys, input, output,
num_items, cub::Equality(), at::cuda::getCurrentCUDAStream());
}
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT>
inline void exclusive_prod_by_key(
KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items)
{
TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
"cub ExclusiveScanByKey does not support more than INT_MAX elements");
CUB_WRAPPER(cub::DeviceScan::ExclusiveScanByKey, keys, input, output, Product(), 1.0f,
num_items, cub::Equality(), at::cuda::getCurrentCUDAStream());
}
#endif
torch::Tensor transmittance_from_sigma_forward_cub(
torch::Tensor ray_indices,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas)
{
DEVICE_GUARD(ray_indices);
CHECK_INPUT(ray_indices);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(ray_indices.ndimension() == 1);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
const uint32_t n_samples = sigmas.size(0);
// parallel across samples
torch::Tensor sigmas_dt = sigmas * (ends - starts);
torch::Tensor sigmas_dt_cumsum = torch::empty_like(sigmas);
#if CUB_SUPPORTS_SCAN_BY_KEY()
exclusive_sum_by_key(
ray_indices.data_ptr<int>(),
sigmas_dt.data_ptr<float>(),
sigmas_dt_cumsum.data_ptr<float>(),
n_samples);
#else
throw std::runtime_error("CUB functions are only supported in CUDA >= 11.6.");
#endif
torch::Tensor transmittance = (-sigmas_dt_cumsum).exp();
return transmittance;
}
torch::Tensor transmittance_from_sigma_backward_cub(
torch::Tensor ray_indices,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor transmittance,
torch::Tensor transmittance_grad)
{
DEVICE_GUARD(ray_indices);
CHECK_INPUT(ray_indices);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(transmittance);
CHECK_INPUT(transmittance_grad);
TORCH_CHECK(ray_indices.ndimension() == 1);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(transmittance.ndimension() == 2 & transmittance.size(1) == 1);
TORCH_CHECK(transmittance_grad.ndimension() == 2 & transmittance_grad.size(1) == 1);
const uint32_t n_samples = transmittance.size(0);
// parallel across samples
torch::Tensor sigmas_dt_cumsum_grad = -transmittance_grad * transmittance;
torch::Tensor sigmas_dt_grad = torch::empty_like(transmittance_grad);
#if CUB_SUPPORTS_SCAN_BY_KEY()
exclusive_sum_by_key(
thrust::make_reverse_iterator(ray_indices.data_ptr<int>() + n_samples),
thrust::make_reverse_iterator(sigmas_dt_cumsum_grad.data_ptr<float>() + n_samples),
thrust::make_reverse_iterator(sigmas_dt_grad.data_ptr<float>() + n_samples),
n_samples);
#else
throw std::runtime_error("CUB functions are only supported in CUDA >= 11.6.");
#endif
torch::Tensor sigmas_grad = sigmas_dt_grad * (ends - starts);
return sigmas_grad;
}
torch::Tensor transmittance_from_alpha_forward_cub(
torch::Tensor ray_indices, torch::Tensor alphas)
{
DEVICE_GUARD(ray_indices);
CHECK_INPUT(ray_indices);
CHECK_INPUT(alphas);
TORCH_CHECK(alphas.ndimension() == 2 & alphas.size(1) == 1);
TORCH_CHECK(ray_indices.ndimension() == 1);
const uint32_t n_samples = alphas.size(0);
// parallel across samples
torch::Tensor transmittance = torch::empty_like(alphas);
#if CUB_SUPPORTS_SCAN_BY_KEY()
exclusive_prod_by_key(
ray_indices.data_ptr<int>(),
(1.0f - alphas).data_ptr<float>(),
transmittance.data_ptr<float>(),
n_samples);
#else
throw std::runtime_error("CUB functions are only supported in CUDA >= 11.6.");
#endif
return transmittance;
}
torch::Tensor transmittance_from_alpha_backward_cub(
torch::Tensor ray_indices,
torch::Tensor alphas,
torch::Tensor transmittance,
torch::Tensor transmittance_grad)
{
DEVICE_GUARD(ray_indices);
CHECK_INPUT(ray_indices);
CHECK_INPUT(transmittance);
CHECK_INPUT(transmittance_grad);
TORCH_CHECK(ray_indices.ndimension() == 1);
TORCH_CHECK(transmittance.ndimension() == 2 & transmittance.size(1) == 1);
TORCH_CHECK(transmittance_grad.ndimension() == 2 & transmittance_grad.size(1) == 1);
const uint32_t n_samples = transmittance.size(0);
// parallel across samples
torch::Tensor sigmas_dt_cumsum_grad = -transmittance_grad * transmittance;
torch::Tensor sigmas_dt_grad = torch::empty_like(transmittance_grad);
#if CUB_SUPPORTS_SCAN_BY_KEY()
exclusive_sum_by_key(
thrust::make_reverse_iterator(ray_indices.data_ptr<int>() + n_samples),
thrust::make_reverse_iterator(sigmas_dt_cumsum_grad.data_ptr<float>() + n_samples),
thrust::make_reverse_iterator(sigmas_dt_grad.data_ptr<float>() + n_samples),
n_samples);
#else
throw std::runtime_error("CUB functions are only supported in CUDA >= 11.6.");
#endif
torch::Tensor alphas_grad = sigmas_dt_grad / (1.0f - alphas).clamp_min(1e-10f);
return alphas_grad;
}
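The cub path above relies on cub::DeviceScan::ExclusiveSumByKey, i.e. an exclusive prefix sum that restarts at every change of ray index; the thrust::make_reverse_iterator calls in the backward functions correspond to running the same scan on flipped buffers. A small PyTorch equivalent for packed (sorted) ray_indices, for reference only:

```python
import torch

def exclusive_sum_by_key_ref(ray_indices, values):
    # Assumes samples are packed, i.e. ray_indices is sorted.
    excl = torch.cumsum(values, dim=0) - values          # flat exclusive scan
    first = torch.ones_like(ray_indices, dtype=torch.bool)
    first[1:] = ray_indices[1:] != ray_indices[:-1]      # first sample of each ray
    seg_id = torch.cumsum(first.long(), dim=0) - 1       # segment index per sample
    seg_start = torch.nonzero(first).squeeze(1)          # index of each ray's first sample
    return excl - excl[seg_start][seg_id]                # restart the scan at ray boundaries

ray_indices = torch.tensor([0, 0, 0, 2, 2])
values = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])
print(exclusive_sum_by_key_ref(ray_indices, values))     # [0., 1., 3., 0., 4.]
```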
/*
* Copyright (c) 2022 Ruilong Li, UC Berkeley.
*/
#include "include/helpers_cuda.h"
__global__ void weight_from_sigma_forward_kernel(
const uint32_t n_rays,
const int *packed_info,
const float *starts,
const float *ends,
const float *sigmas,
// outputs
float *weights)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0];
const int steps = packed_info[i * 2 + 1];
if (steps == 0)
return;
starts += base;
ends += base;
sigmas += base;
weights += base;
// accumulation
float T = 1.f;
for (int j = 0; j < steps; ++j)
{
const float delta = ends[j] - starts[j];
const float alpha = 1.f - __expf(-sigmas[j] * delta);
weights[j] = alpha * T;
T *= (1.f - alpha);
}
return;
}
__global__ void weight_from_sigma_backward_kernel(
const uint32_t n_rays,
const int *packed_info,
const float *starts,
const float *ends,
const float *sigmas,
const float *weights,
const float *grad_weights,
// outputs
float *grad_sigmas)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0];
const int steps = packed_info[i * 2 + 1];
if (steps == 0)
return;
starts += base;
ends += base;
sigmas += base;
weights += base;
grad_weights += base;
grad_sigmas += base;
float accum = 0;
for (int j = 0; j < steps; ++j)
{
accum += grad_weights[j] * weights[j];
}
// accumulation
float T = 1.f;
for (int j = 0; j < steps; ++j)
{
const float delta = ends[j] - starts[j];
const float alpha = 1.f - __expf(-sigmas[j] * delta);
grad_sigmas[j] = (grad_weights[j] * T - accum) * delta;
accum -= grad_weights[j] * weights[j];
T *= (1.f - alpha);
}
return;
}
__global__ void weight_from_alpha_forward_kernel(
const uint32_t n_rays,
const int *packed_info,
const float *alphas,
// outputs
float *weights)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0];
const int steps = packed_info[i * 2 + 1];
if (steps == 0)
return;
alphas += base;
weights += base;
// accumulation
float T = 1.f;
for (int j = 0; j < steps; ++j)
{
const float alpha = alphas[j];
weights[j] = alpha * T;
T *= (1.f - alpha);
}
return;
}
__global__ void weight_from_alpha_backward_kernel(
const uint32_t n_rays,
const int *packed_info,
const float *alphas,
const float *weights,
const float *grad_weights,
// outputs
float *grad_alphas)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0];
const int steps = packed_info[i * 2 + 1];
if (steps == 0)
return;
alphas += base;
weights += base;
grad_weights += base;
grad_alphas += base;
float accum = 0;
for (int j = 0; j < steps; ++j)
{
accum += grad_weights[j] * weights[j];
}
// accumulation
float T = 1.f;
for (int j = 0; j < steps; ++j)
{
const float alpha = alphas[j];
grad_alphas[j] = (grad_weights[j] * T - accum) / fmaxf(1.f - alpha, 1e-10f);
accum -= grad_weights[j] * weights[j];
T *= (1.f - alpha);
}
return;
}
torch::Tensor weight_from_sigma_forward_naive(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
const uint32_t n_samples = sigmas.size(0);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor weights = torch::empty_like(sigmas);
weight_from_sigma_forward_kernel<<<
blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<float>(),
ends.data_ptr<float>(),
sigmas.data_ptr<float>(),
// outputs
weights.data_ptr<float>());
return weights;
}
torch::Tensor weight_from_sigma_backward_naive(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(weights);
CHECK_INPUT(grad_weights);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
TORCH_CHECK(weights.ndimension() == 2 & weights.size(1) == 1);
TORCH_CHECK(grad_weights.ndimension() == 2 & grad_weights.size(1) == 1);
const uint32_t n_samples = sigmas.size(0);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_sigmas = torch::empty_like(sigmas);
weight_from_sigma_backward_kernel<<<
blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<float>(),
ends.data_ptr<float>(),
sigmas.data_ptr<float>(),
weights.data_ptr<float>(),
grad_weights.data_ptr<float>(),
// outputs
grad_sigmas.data_ptr<float>());
return grad_sigmas;
}
torch::Tensor weight_from_alpha_forward_naive(
torch::Tensor packed_info, torch::Tensor alphas)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(alphas);
TORCH_CHECK(packed_info.ndimension() == 2);
TORCH_CHECK(alphas.ndimension() == 2 & alphas.size(1) == 1);
const uint32_t n_samples = alphas.size(0);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor weights = torch::empty_like(alphas);
weight_from_alpha_forward_kernel<<<
blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
alphas.data_ptr<float>(),
// outputs
weights.data_ptr<float>());
return weights;
}
torch::Tensor weight_from_alpha_backward_naive(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor alphas)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(alphas);
CHECK_INPUT(weights);
CHECK_INPUT(grad_weights);
TORCH_CHECK(packed_info.ndimension() == 2);
TORCH_CHECK(alphas.ndimension() == 2 & alphas.size(1) == 1);
TORCH_CHECK(weights.ndimension() == 2 & weights.size(1) == 1);
TORCH_CHECK(grad_weights.ndimension() == 2 & grad_weights.size(1) == 1);
const uint32_t n_samples = alphas.size(0);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_alphas = torch::empty_like(alphas);
weight_from_alpha_backward_kernel<<<
blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
alphas.data_ptr<float>(),
weights.data_ptr<float>(),
grad_weights.data_ptr<float>(),
// outputs
grad_alphas.data_ptr<float>());
return grad_alphas;
}
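For the density path of the weight kernels above, the per-ray recurrence is w_j = alpha_j * T_j with alpha_j = 1 - exp(-sigma_j * dt_j) and T the exclusive product of (1 - alpha). A single-ray PyTorch reference, illustrative only:

```python
import torch

def weight_from_sigma_ref(t_starts, t_ends, sigmas):
    alphas = 1.0 - torch.exp(-sigmas * (t_ends - t_starts))
    shifted = torch.cat([torch.ones_like(alphas[:1]), 1.0 - alphas[:-1]])
    transmittance = torch.cumprod(shifted, dim=0)  # exclusive cumprod of (1 - alpha)
    return alphas * transmittance

t_starts = torch.linspace(0.0, 0.4, 5)
t_ends = t_starts + 0.1
sigmas = torch.rand(5) * 5.0
print(weight_from_sigma_ref(t_starts, t_ends, sigmas))
```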
/*
* Copyright (c) 2022 Ruilong Li, UC Berkeley.
*/
#include "include/helpers_cuda.h"
template <typename scalar_t>
__global__ void rendering_forward_kernel(
const uint32_t n_rays,
const int *packed_info, // input ray & point indices.
const scalar_t *starts, // input start t
const scalar_t *ends, // input end t
const scalar_t *sigmas, // input density after activation
const scalar_t *alphas, // input alpha (opacity) values.
const scalar_t early_stop_eps, // transmittance threshold for early stop
const scalar_t alpha_thre, // alpha threshold for empty space
// outputs: should be all-zero initialized
int *num_steps, // the number of valid steps for each ray
scalar_t *weights, // the rendering weight of each sample
bool *compact_selector // the samples for which we need to compute gradients
)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
if (alphas != nullptr)
{
// rendering with alpha
alphas += base;
}
else
{
// rendering with density
starts += base;
ends += base;
sigmas += base;
}
if (num_steps != nullptr)
{
num_steps += i;
}
if (weights != nullptr)
{
weights += base;
}
if (compact_selector != nullptr)
{
compact_selector += base;
}
// accumulated rendering
scalar_t T = 1.f;
int cnt = 0;
for (int j = 0; j < steps; ++j)
{
if (T < early_stop_eps)
{
break;
}
scalar_t alpha;
if (alphas != nullptr)
{
// rendering with alpha
alpha = alphas[j];
}
else
{
// rendering with density
scalar_t delta = ends[j] - starts[j];
alpha = 1.f - __expf(-sigmas[j] * delta);
}
if (alpha < alpha_thre)
{
// empty space
continue;
}
const scalar_t weight = alpha * T;
T *= (1.f - alpha);
if (weights != nullptr)
{
weights[j] = weight;
}
if (compact_selector != nullptr)
{
compact_selector[j] = true;
}
cnt += 1;
}
if (num_steps != nullptr)
{
*num_steps = cnt;
}
return;
}
template <typename scalar_t>
__global__ void rendering_backward_kernel(
const uint32_t n_rays,
const int *packed_info, // input ray & point indices.
const scalar_t *starts, // input start t
const scalar_t *ends, // input end t
const scalar_t *sigmas, // input density after activation
const scalar_t *alphas, // input alpha (opacity) values.
const scalar_t early_stop_eps, // transmittance threshold for early stop
const scalar_t alpha_thre, // alpha threshold for empty space
const scalar_t *weights, // forward output
const scalar_t *grad_weights, // input gradients
// if alphas was given, we compute the gradients for alphas.
// otherwise, we compute the gradients for sigmas.
scalar_t *grad_sigmas, // output gradients
scalar_t *grad_alphas // output gradients
)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
if (alphas != nullptr)
{
// rendering with alpha
alphas += base;
grad_alphas += base;
}
else
{
// rendering with density
starts += base;
ends += base;
sigmas += base;
grad_sigmas += base;
}
weights += base;
grad_weights += base;
scalar_t accum = 0;
for (int j = 0; j < steps; ++j)
{
accum += grad_weights[j] * weights[j];
}
// backward of accumulated rendering
scalar_t T = 1.f;
for (int j = 0; j < steps; ++j)
{
if (T < early_stop_eps)
{
break;
}
scalar_t alpha;
if (alphas != nullptr)
{
// rendering with alpha
alpha = alphas[j];
if (alpha < alpha_thre)
{
// empty space
continue;
}
grad_alphas[j] = (grad_weights[j] * T - accum) / fmaxf(1.f - alpha, 1e-10f);
}
else
{
// rendering with density
scalar_t delta = ends[j] - starts[j];
alpha = 1.f - __expf(-sigmas[j] * delta);
if (alpha < alpha_thre)
{
// empty space
continue;
}
grad_sigmas[j] = (grad_weights[j] * T - accum) * delta;
}
accum -= grad_weights[j] * weights[j];
T *= (1.f - alpha);
}
}
std::vector<torch::Tensor> rendering_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
float early_stop_eps,
float alpha_thre,
bool compression)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
if (compression)
{
// compress the samples to get rid of invisible ones.
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor compact_selector = torch::zeros(
{n_samples}, sigmas.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
alpha_thre,
// outputs
num_steps.data_ptr<int>(),
nullptr,
compact_selector.data_ptr<bool>()); }));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_packed_info, compact_selector};
}
else
{
// just do the forward rendering.
torch::Tensor weights = torch::zeros({n_samples}, sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
alpha_thre,
// outputs
nullptr,
weights.data_ptr<scalar_t>(),
nullptr); }));
return {weights};
}
}
torch::Tensor rendering_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
float early_stop_eps,
float alpha_thre)
{
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_sigmas = torch::zeros(sigmas.sizes(), sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_backward",
([&]
{ rendering_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
alpha_thre,
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
// outputs
grad_sigmas.data_ptr<scalar_t>(),
nullptr // alphas gradients
); }));
return grad_sigmas;
}
// -- rendering with alphas -- //
std::vector<torch::Tensor> rendering_alphas_forward(
torch::Tensor packed_info,
torch::Tensor alphas,
float early_stop_eps,
float alpha_thre,
bool compression)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(alphas);
TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
TORCH_CHECK(alphas.ndimension() == 2 & alphas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = alphas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
if (compression)
{
// compress the samples to get rid of invisible ones.
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor compact_selector = torch::zeros(
{n_samples}, alphas.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_alphas_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
alpha_thre,
// outputs
num_steps.data_ptr<int>(),
nullptr,
compact_selector.data_ptr<bool>()); }));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_selector, compact_packed_info};
}
else
{
// just do the forward rendering.
torch::Tensor weights = torch::zeros({n_samples}, alphas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
alpha_thre,
// outputs
nullptr,
weights.data_ptr<scalar_t>(),
nullptr); }));
return {weights};
}
}
torch::Tensor rendering_alphas_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor alphas,
float early_stop_eps,
float alpha_thre)
{
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = alphas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_alphas = torch::zeros(alphas.sizes(), alphas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_alphas_backward",
([&]
{ rendering_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
alpha_thre,
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
// outputs
nullptr, // sigma gradients
grad_alphas.data_ptr<scalar_t>()); }));
return grad_alphas;
}
......@@ -44,7 +44,41 @@ def pack_data(data: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
@torch.no_grad()
def unpack_info(packed_info: Tensor) -> Tensor:
def pack_info(ray_indices: Tensor, n_rays: int = None) -> Tensor:
"""Pack `ray_indices` to `packed_info`. Useful for converting per sample data to per ray data.
Note:
This function is not differentiable with respect to any of its inputs.
Args:
ray_indices: Ray index of each sample. LongTensor with shape (n_samples).
Returns:
packed_info: Stores information on which samples belong to the same ray. \
See :func:`nerfacc.ray_marching` for details. Tensor with shape (n_rays, 2).
"""
assert (
ray_indices.dim() == 1
), "ray_indices must be a 1D tensor with shape (n_samples)."
if ray_indices.is_cuda:
ray_indices = ray_indices.contiguous().int()
device = ray_indices.device
if n_rays is None:
n_rays = int(ray_indices.max()) + 1
# else:
# assert n_rays > ray_indices.max()
src = torch.ones_like(ray_indices)
num_steps = torch.zeros((n_rays,), device=device, dtype=torch.int)
num_steps.scatter_add_(0, ray_indices.long(), src)
cum_steps = num_steps.cumsum(dim=0, dtype=torch.int)
packed_info = torch.stack([cum_steps - num_steps, num_steps], dim=-1)
else:
raise NotImplementedError("Only support cuda inputs.")
return packed_info.int()
@torch.no_grad()
def unpack_info(packed_info: Tensor, n_samples: int) -> Tensor:
"""Unpack `packed_info` to `ray_indices`. Useful for converting per ray data to per sample data.
Note:
......@@ -53,6 +87,7 @@ def unpack_info(packed_info: Tensor) -> Tensor:
Args:
packed_info: Stores information on which samples belong to the same ray. \
See :func:`nerfacc.ray_marching` for details. Tensor with shape (n_rays, 2).
n_samples: Total number of samples.
Returns:
Ray index of each sample. LongTensor with shape (n_samples).
......@@ -71,7 +106,7 @@ def unpack_info(packed_info: Tensor) -> Tensor:
# torch.Size([128, 2]) torch.Size([115200, 1]) torch.Size([115200, 1])
print(packed_info.shape, t_starts.shape, t_ends.shape)
# Unpack per-ray info to per-sample info.
ray_indices = unpack_info(packed_info)
ray_indices = unpack_info(packed_info, t_starts.shape[0])
# torch.Size([115200]) torch.int64
print(ray_indices.shape, ray_indices.dtype)
......@@ -80,7 +115,7 @@ def unpack_info(packed_info: Tensor) -> Tensor:
packed_info.dim() == 2 and packed_info.shape[-1] == 2
), "packed_info must be a 2D tensor with shape (n_rays, 2)."
if packed_info.is_cuda:
ray_indices = _C.unpack_info(packed_info.contiguous().int())
ray_indices = _C.unpack_info(packed_info.contiguous().int(), n_samples)
else:
raise NotImplementedError("Only support cuda inputs.")
return ray_indices.long()
......
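pack_info and unpack_info are inverses on packed sample buffers; a short round-trip sketch on a CUDA device (both implementations above are CUDA-only), using toy indices:

```python
import torch
import nerfacc

ray_indices = torch.tensor([0, 0, 0, 2, 2], device="cuda", dtype=torch.long)
packed_info = nerfacc.pack_info(ray_indices, n_rays=3)
# -> [[0, 3], [3, 0], [3, 2]]: ray 1 has no samples, so its chunk is empty.
recovered = nerfacc.unpack_info(packed_info, n_samples=ray_indices.shape[0])
assert torch.equal(recovered, ray_indices)
```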
......@@ -7,7 +7,6 @@ import nerfacc.cuda as _C
from .contraction import ContractionType
from .grid import Grid
from .intersection import ray_aabb_intersect
from .pack import unpack_info
from .vol_rendering import render_visibility
......@@ -82,10 +81,7 @@ def ray_marching(
Returns:
A tuple of tensors.
- **packed_info**: Stores information on which samples belong to the same ray. \
Tensor with shape (n_rays, 2). The first column stores the index of the \
first sample of each ray. The second column stores the number of samples \
of each ray.
- **ray_indices**: Ray index of each sample. IntTensor with shape (n_samples).
- **t_starts**: Per-sample start distance. Tensor with shape (n_samples, 1).
- **t_ends**: Per-sample end distance. Tensor with shape (n_samples, 1).
......@@ -103,32 +99,31 @@ def ray_marching(
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
# Ray marching with near far plane.
packed_info, t_starts, t_ends = ray_marching(
ray_indices, t_starts, t_ends = ray_marching(
rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
)
# Ray marching with aabb.
scene_aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device=device)
packed_info, t_starts, t_ends = ray_marching(
ray_indices, t_starts, t_ends = ray_marching(
rays_o, rays_d, scene_aabb=scene_aabb, render_step_size=1e-3
)
# Ray marching with per-ray t_min and t_max.
t_min = torch.zeros((batch_size,), device=device)
t_max = torch.ones((batch_size,), device=device)
packed_info, t_starts, t_ends = ray_marching(
ray_indices, t_starts, t_ends = ray_marching(
rays_o, rays_d, t_min=t_min, t_max=t_max, render_step_size=1e-3
)
# Ray marching with aabb and skip areas based on occupancy grid.
scene_aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device=device)
grid = OccupancyGrid(roi_aabb=[0.0, 0.0, 0.0, 0.5, 0.5, 0.5]).to(device)
packed_info, t_starts, t_ends = ray_marching(
ray_indices, t_starts, t_ends = ray_marching(
rays_o, rays_d, scene_aabb=scene_aabb, grid=grid, render_step_size=1e-3
)
# Convert t_starts and t_ends to sample locations.
ray_indices = unpack_info(packed_info)
t_mid = (t_starts + t_ends) / 2.0
sample_locs = rays_o[ray_indices] + t_mid * rays_d[ray_indices]
......@@ -179,7 +174,7 @@ def ray_marching(
contraction_type = ContractionType.AABB.to_cpp_version()
# marching with grid-based skipping
packed_info, t_starts, t_ends = _C.ray_marching(
packed_info, ray_indices, t_starts, t_ends = _C.ray_marching(
# rays
rays_o.contiguous(),
rays_d.contiguous(),
......@@ -197,7 +192,6 @@ def ray_marching(
# skip invisible space
if sigma_fn is not None or alpha_fn is not None:
# Query sigma without gradients
ray_indices = unpack_info(packed_info)
if sigma_fn is not None:
sigmas = sigma_fn(t_starts, t_ends, ray_indices.long())
assert (
......@@ -211,10 +205,16 @@ def ray_marching(
), "alphas must have shape of (N, 1)! Got {}".format(alphas.shape)
# Compute visibility of the samples, and filter out invisible samples
visibility, packed_info_visible = render_visibility(
packed_info, alphas, early_stop_eps, alpha_thre
masks = render_visibility(
alphas,
ray_indices=ray_indices,
early_stop_eps=early_stop_eps,
alpha_thre=alpha_thre,
)
ray_indices, t_starts, t_ends = (
ray_indices[masks],
t_starts[masks],
t_ends[masks],
)
t_starts, t_ends = t_starts[visibility], t_ends[visibility]
packed_info = packed_info_visible
return packed_info, t_starts, t_ends
return ray_indices, t_starts, t_ends