Commit 298ffd02 authored by Ruilong Li

init
# Visual Studio Code configs.
.vscode/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
# lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.DS_Store
# Direnv config.
.envrc
# line_profiler
*.lprof
# vscode
.vscode
import math
from typing import Callable, Optional, Tuple
import torch
from .cuda import VolumeRenderer, ray_aabb_intersect, ray_marching
def volumetric_rendering(
    query_fn: Callable,
    rays_o: torch.Tensor,
    rays_d: torch.Tensor,
    scene_aabb: torch.Tensor,
    scene_occ_binary: torch.Tensor,
    scene_resolution: Tuple[int, int, int],
    render_bkgd: Optional[torch.Tensor] = None,
    render_n_samples: int = 1024,
    **kwargs,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """A *fast* version of differentiable volumetric rendering.

    Returns the accumulated color, depth, opacity (weight), and a mask of the
    rays that received at least one valid sample.
    """
    device = rays_o.device
    if render_bkgd is None:
        render_bkgd = torch.ones(3, device=device)
    scene_resolution = torch.tensor(scene_resolution, dtype=torch.int, device=device)

    rays_o = rays_o.contiguous()
    rays_d = rays_d.contiguous()
    scene_aabb = scene_aabb.contiguous()
    scene_occ_binary = scene_occ_binary.contiguous()
    render_bkgd = render_bkgd.contiguous()

    n_rays = rays_o.shape[0]
    render_total_samples = n_rays * render_n_samples
    render_step_size = (
        (scene_aabb[3:] - scene_aabb[:3]).max() * math.sqrt(3) / render_n_samples
    )

    with torch.no_grad():
        # TODO: avoid clamp here. kinda stupid
        t_min, t_max = ray_aabb_intersect(rays_o, rays_d, scene_aabb)
        t_min = torch.clamp(t_min, max=1e10)
        t_max = torch.clamp(t_max, max=1e10)
        (
            packed_info,
            frustum_origins,
            frustum_dirs,
            frustum_starts,
            frustum_ends,
        ) = ray_marching(
            # rays
            rays_o,
            rays_d,
            t_min,
            t_max,
            # density grid
            scene_aabb,
            scene_resolution,
            scene_occ_binary,
            # sampling
            render_total_samples,
            render_n_samples,
            render_step_size,
        )
        # squeeze valid samples
        total_samples = max(packed_info[:, -1].sum(), 1)
        frustum_origins = frustum_origins[:total_samples]
        frustum_dirs = frustum_dirs[:total_samples]
        frustum_starts = frustum_starts[:total_samples]
        frustum_ends = frustum_ends[:total_samples]

    # mid-point of each frustum sample along the ray
    frustum_positions = (
        frustum_origins + frustum_dirs * (frustum_starts + frustum_ends) / 2.0
    )
    query_results = query_fn(frustum_positions, frustum_dirs, **kwargs)
    rgbs, densities = query_results[0], query_results[1]
    (
        accumulated_weight,
        accumulated_depth,
        accumulated_color,
        alive_ray_mask,
    ) = VolumeRenderer.apply(
        packed_info,
        frustum_starts,
        frustum_ends,
        densities.contiguous(),
        rgbs.contiguous(),
    )
    accumulated_depth = torch.clip(accumulated_depth, t_min[:, None], t_max[:, None])
    # composite onto the background color
    accumulated_color = accumulated_color + render_bkgd * (1.0 - accumulated_weight)
    return accumulated_color, accumulated_depth, accumulated_weight, alive_ray_mask
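A minimal usage sketch (not part of this file): `toy_query_fn` is a hypothetical stand-in for a radiance field, and the `nerfacc` import path is an assumption based on the package name in `setup.py` below; only the tensor shapes and argument order follow the code above.

```python
import torch

from nerfacc import volumetric_rendering  # assumed import path, see setup.py


def toy_query_fn(positions, directions):
    # Hypothetical radiance field: constant density, position-derived color.
    densities = torch.full((positions.shape[0], 1), 0.5, device=positions.device)
    rgbs = torch.sigmoid(positions)  # [n_samples, 3]
    return rgbs, densities  # query_fn must return (rgbs, densities) in this order


device = "cuda"
n_rays = 1024
rays_o = torch.zeros(n_rays, 3, device=device)
rays_d = torch.nn.functional.normalize(torch.randn(n_rays, 3, device=device), dim=-1)
scene_aabb = torch.tensor([-1.0, -1.0, -1.0, 1.0, 1.0, 1.0], device=device)
resolution = (128, 128, 128)
# Fully-occupied grid: every voxel is marked occupied, so no space is skipped.
scene_occ_binary = torch.ones(
    resolution[0] * resolution[1] * resolution[2], dtype=torch.bool, device=device
)

color, depth, opacity, alive = volumetric_rendering(
    toy_query_fn, rays_o, rays_d, scene_aabb, scene_occ_binary, resolution
)
print(color.shape, depth.shape, opacity.shape, alive.shape)
```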
import torch
from torch.cuda.amp import custom_bwd, custom_fwd
from ._backend import _C
ray_aabb_intersect = _C.ray_aabb_intersect
ray_marching = _C.ray_marching
volumetric_rendering_forward = _C.volumetric_rendering_forward
volumetric_rendering_backward = _C.volumetric_rendering_backward
class VolumeRenderer(torch.autograd.Function):
    """CUDA Volumetric Renderer."""

    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, packed_info, starts, ends, sigmas, rgbs):
        (
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            mask,
        ) = volumetric_rendering_forward(packed_info, starts, ends, sigmas, rgbs)
        ctx.save_for_backward(
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            packed_info,
            starts,
            ends,
            sigmas,
            rgbs,
        )
        return accumulated_weight, accumulated_depth, accumulated_color, mask

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_weight, grad_depth, grad_color, _grad_mask):
        (
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            packed_info,
            starts,
            ends,
            sigmas,
            rgbs,
        ) = ctx.saved_tensors
        grad_sigmas, grad_rgbs = volumetric_rendering_backward(
            accumulated_weight,
            accumulated_depth,
            accumulated_color,
            grad_weight,
            grad_depth,
            grad_color,
            packed_info,
            starts,
            ends,
            sigmas,
            rgbs,
        )
        # corresponds to the input argument list of forward()
        return None, None, None, grad_sigmas, grad_rgbs
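For sanity-checking the custom op, here is a slow pure-PyTorch sketch of the same packed accumulation that `volumetric_rendering_forward` computes: `alpha_j = 1 - exp(-sigma_j * (end_j - start_j))`, transmittance `T_j = prod_{k<j}(1 - alpha_k)`, and per-sample weights `w_j = alpha_j * T_j`. This is a reference only, not the library API, and it omits the early-termination threshold (`T < 1e-4`) that the CUDA kernel applies.

```python
import torch


def reference_render(packed_info, starts, ends, sigmas, rgbs, n_rays):
    """Slow PyTorch reference of the packed volume-rendering accumulation."""
    weight = torch.zeros(n_rays, 1, device=sigmas.device, dtype=sigmas.dtype)
    depth = torch.zeros(n_rays, 1, device=sigmas.device, dtype=sigmas.dtype)
    color = torch.zeros(n_rays, 3, device=sigmas.device, dtype=sigmas.dtype)
    for ray_id, base, num in packed_info.tolist():
        if num == 0:  # zero padding at the end of packed_info
            continue
        s = slice(base, base + num)
        delta = (ends[s] - starts[s]).squeeze(-1)                # [num]
        t_mid = 0.5 * (starts[s] + ends[s]).squeeze(-1)          # [num]
        alpha = 1.0 - torch.exp(-sigmas[s].squeeze(-1) * delta)  # [num]
        # T_j = prod_{k<j} (1 - alpha_k), with T_0 = 1
        trans = torch.cumprod(
            torch.cat([alpha.new_ones(1), 1.0 - alpha[:-1]]), dim=0
        )
        w = alpha * trans                                        # per-sample weights
        weight[ray_id] = w.sum()
        depth[ray_id] = (w * t_mid).sum()
        color[ray_id] = (w[:, None] * rgbs[s]).sum(dim=0)
    return weight, depth, color
```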
"""Setup cuda backend."""
import glob
import os
from subprocess import DEVNULL, call
from torch.utils.cpp_extension import load
PATH = os.path.dirname(os.path.abspath(__file__))
def cuda_toolkit_available():
    """Check if nvcc is available on the machine."""
    # https://github.com/idiap/fast-transformers/blob/master/setup.py
    try:
        call(["nvcc"], stdout=DEVNULL, stderr=DEVNULL)
        return True
    except FileNotFoundError:
        return False


if cuda_toolkit_available():
    sources = glob.glob(os.path.join(PATH, "csrc/*.cu"))
else:
    sources = glob.glob(os.path.join(PATH, "csrc/*.cpp"))

extra_cflags = ["-O3"]
extra_cuda_cflags = ["-O3"]

_C = load(
    name="nerfacc_cuda",
    sources=sources,
    extra_cflags=extra_cflags,
    extra_cuda_cflags=extra_cuda_cflags,
)

__all__ = ["_C"]
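One practical note on this JIT setup: `torch.utils.cpp_extension.load` caches compiled objects under the torch extensions directory, so edits to `csrc/*.cu` may not be picked up until that cache is refreshed. A hedged sketch (the path and import below are examples, not part of this repo):

```python
import os
import shutil

# Example only: redirect (or clear) the torch extension build cache before importing
# the backend, so that edits to csrc/*.cu are actually recompiled.
os.environ["TORCH_EXTENSIONS_DIR"] = "/tmp/nerfacc_build"  # hypothetical location
shutil.rmtree("/tmp/nerfacc_build", ignore_errors=True)

from nerfacc.cuda import _backend  # assumed module path, matching the imports above

print(_backend._C)
```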
#pragma once
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#else
#define CUDA_HOSTDEV
#endif
#include <torch/extension.h>
inline constexpr CUDA_HOSTDEV float __SQRT3() { return 1.73205080757f; }
template <typename scalar_t>
inline CUDA_HOSTDEV void __swap(scalar_t &a, scalar_t &b)
{
scalar_t c = a;
a = b;
b = c;
}
inline CUDA_HOSTDEV float __clamp(float f, float a, float b) { return fmaxf(a, fminf(f, b)); }
inline CUDA_HOSTDEV int __clamp(int f, int a, int b) { return std::max(a, std::min(f, b)); }
inline CUDA_HOSTDEV float __sign(float x) { return copysignf(1.0, x); }
inline CUDA_HOSTDEV uint32_t __expand_bits(uint32_t v)
{
v = (v * 0x00010001u) & 0xFF0000FFu;
v = (v * 0x00000101u) & 0x0F00F00Fu;
v = (v * 0x00000011u) & 0xC30C30C3u;
v = (v * 0x00000005u) & 0x49249249u;
return v;
}
inline CUDA_HOSTDEV uint32_t __morton3D(uint32_t x, uint32_t y, uint32_t z)
{
uint32_t xx = __expand_bits(x);
uint32_t yy = __expand_bits(y);
uint32_t zz = __expand_bits(z);
return xx | (yy << 1) | (zz << 2);
}
inline CUDA_HOSTDEV uint32_t __morton3D_invert(uint32_t x)
{
x = x & 0x49249249;
x = (x | (x >> 2)) & 0xc30c30c3;
x = (x | (x >> 4)) & 0x0f00f00f;
x = (x | (x >> 8)) & 0xff0000ff;
x = (x | (x >> 16)) & 0x0000ffff;
return x;
}
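The Morton helpers above interleave the bits of three 10-bit coordinates into a single index and invert a single axis back out. A small Python sketch of the same bit manipulation, useful as a readable cross-check of the C++ (not part of the extension):

```python
def expand_bits(v: int) -> int:
    """Spread the low 10 bits of v so there are two zero bits between each bit."""
    v = (v * 0x00010001) & 0xFF0000FF
    v = (v * 0x00000101) & 0x0F00F00F
    v = (v * 0x00000011) & 0xC30C30C3
    v = (v * 0x00000005) & 0x49249249
    return v


def morton3d(x: int, y: int, z: int) -> int:
    """Interleave x, y, z (each < 1024) into a 30-bit Morton index."""
    return expand_bits(x) | (expand_bits(y) << 1) | (expand_bits(z) << 2)


def morton3d_invert(x: int) -> int:
    """Recover one coordinate from a Morton index (pass index >> 1 / >> 2 for y / z)."""
    x = x & 0x49249249
    x = (x | (x >> 2)) & 0xC30C30C3
    x = (x | (x >> 4)) & 0x0F00F00F
    x = (x | (x >> 8)) & 0xFF0000FF
    x = (x | (x >> 16)) & 0x0000FFFF
    return x


code = morton3d(3, 5, 7)
assert morton3d_invert(code) == 3
assert morton3d_invert(code >> 1) == 5
assert morton3d_invert(code >> 2) == 7
```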
#pragma once
#include "helpers.h"
#include <c10/cuda/CUDAGuard.h>
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
#define CUDA_GET_THREAD_ID(tid, Q) \
const int tid = blockIdx.x * blockDim.x + threadIdx.x; \
if (tid >= Q) \
return
#define CUDA_N_BLOCKS_NEEDED(Q, CUDA_N_THREADS) ((Q - 1) / CUDA_N_THREADS + 1)
#define DEVICE_GUARD(_ten) \
const at::cuda::OptionalCUDAGuard device_guard(device_of(_ten));
#include "include/helpers_cuda.h"
template <typename scalar_t>
inline __host__ __device__ void _ray_aabb_intersect(
const scalar_t* rays_o,
const scalar_t* rays_d,
const scalar_t* aabb,
scalar_t* near,
scalar_t* far
) {
// aabb is [xmin, ymin, zmin, xmax, ymax, zmax]
scalar_t tmin = (aabb[0] - rays_o[0]) / rays_d[0];
scalar_t tmax = (aabb[3] - rays_o[0]) / rays_d[0];
if (tmin > tmax) __swap(tmin, tmax);
scalar_t tymin = (aabb[1] - rays_o[1]) / rays_d[1];
scalar_t tymax = (aabb[4] - rays_o[1]) / rays_d[1];
if (tymin > tymax) __swap(tymin, tymax);
if (tmin > tymax || tymin > tmax){
*near = std::numeric_limits<scalar_t>::max();
*far = std::numeric_limits<scalar_t>::max();
return;
}
if (tymin > tmin) tmin = tymin;
if (tymax < tmax) tmax = tymax;
scalar_t tzmin = (aabb[2] - rays_o[2]) / rays_d[2];
scalar_t tzmax = (aabb[5] - rays_o[2]) / rays_d[2];
if (tzmin > tzmax) __swap(tzmin, tzmax);
if (tmin > tzmax || tzmin > tmax){
*near = std::numeric_limits<scalar_t>::max();
*far = std::numeric_limits<scalar_t>::max();
return;
}
if (tzmin > tmin) tmin = tzmin;
if (tzmax < tmax) tmax = tzmax;
*near = tmin;
*far = tmax;
return;
}
template <typename scalar_t>
__global__ void kernel_ray_aabb_intersect(
const int N,
const scalar_t* rays_o,
const scalar_t* rays_d,
const scalar_t* aabb,
scalar_t* t_min,
scalar_t* t_max
){
// aabb is [xmin, ymin, zmin, xmax, ymax, zmax]
CUDA_GET_THREAD_ID(thread_id, N);
// locate
rays_o += thread_id * 3;
rays_d += thread_id * 3;
t_min += thread_id;
t_max += thread_id;
_ray_aabb_intersect<scalar_t>(rays_o, rays_d, aabb, t_min, t_max);
scalar_t zero = static_cast<scalar_t>(0.f);
*t_min = *t_min > zero ? *t_min : zero;
return;
}
/**
* @brief Ray AABB Test
*
* @param rays_o Ray origins. Tensor with shape [N, 3].
* @param rays_d Normalized ray directions. Tensor with shape [N, 3].
* @param aabb Scene AABB [xmin, ymin, zmin, xmax, ymax, zmax]. Tensor with shape [6].
* @return std::vector<torch::Tensor>
* Ray AABB intersection {t_min, t_max}, each with shape [N]. Note that t_min is
* clamped to a minimum of zero.
*/
std::vector<torch::Tensor> ray_aabb_intersect(
const torch::Tensor rays_o, const torch::Tensor rays_d, const torch::Tensor aabb
) {
DEVICE_GUARD(rays_o);
CHECK_INPUT(rays_o);
CHECK_INPUT(rays_d);
CHECK_INPUT(aabb);
TORCH_CHECK(rays_o.ndimension() == 2 && rays_o.size(1) == 3);
TORCH_CHECK(rays_d.ndimension() == 2 && rays_d.size(1) == 3);
TORCH_CHECK(aabb.ndimension() == 1 && aabb.size(0) == 6);
const int N = rays_o.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(N, threads);
torch::Tensor t_min = torch::empty({N}, rays_o.options());
torch::Tensor t_max = torch::empty({N}, rays_o.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
rays_o.scalar_type(), "ray_aabb_intersect",
([&] {
kernel_ray_aabb_intersect<scalar_t><<<blocks, threads>>>(
N,
rays_o.data_ptr<scalar_t>(),
rays_d.data_ptr<scalar_t>(),
aabb.data_ptr<scalar_t>(),
t_min.data_ptr<scalar_t>(),
t_max.data_ptr<scalar_t>()
);
})
);
return {t_min, t_max};
}
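A hedged, vectorized PyTorch reference of the same slab test, including the clamp of `t_min` to zero done in the kernel; useful for spot-checking the CUDA output on a handful of rays (a sketch, not part of the extension):

```python
import torch


def ray_aabb_intersect_ref(rays_o, rays_d, aabb):
    """Slab test. rays_o/rays_d: [N, 3]; aabb: [xmin, ymin, zmin, xmax, ymax, zmax]."""
    # Like the kernel, this assumes no exactly-zero direction components.
    inv_d = 1.0 / rays_d
    t0 = (aabb[:3] - rays_o) * inv_d                 # [N, 3]
    t1 = (aabb[3:] - rays_o) * inv_d                 # [N, 3]
    t_near = torch.minimum(t0, t1).amax(dim=-1)      # latest per-axis entry
    t_far = torch.maximum(t0, t1).amin(dim=-1)       # earliest per-axis exit
    miss = t_near > t_far
    big = torch.finfo(rays_o.dtype).max
    t_near = torch.where(miss, torch.full_like(t_near, big), t_near)
    t_far = torch.where(miss, torch.full_like(t_far, big), t_far)
    t_near = t_near.clamp(min=0.0)                   # the kernel clips t_min to zero
    return t_near, t_far
```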
#include "include/helpers_cuda.h"
std::vector<torch::Tensor> ray_aabb_intersect(
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor aabb
);
std::vector<torch::Tensor> ray_marching(
// rays
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor t_min,
const torch::Tensor t_max,
// density grid
const torch::Tensor aabb,
const torch::Tensor resolution,
const torch::Tensor occ_binary,
// sampling
const int max_total_samples,
const int max_per_ray_samples,
const float dt
);
std::vector<torch::Tensor> volumetric_rendering_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
torch::Tensor rgbs
);
std::vector<torch::Tensor> volumetric_rendering_backward(
torch::Tensor accumulated_weight,
torch::Tensor accumulated_depth,
torch::Tensor accumulated_color,
torch::Tensor grad_weight,
torch::Tensor grad_depth,
torch::Tensor grad_color,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
torch::Tensor rgbs
);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("ray_aabb_intersect", &ray_aabb_intersect);
m.def("ray_marching", &ray_marching);
m.def("volumetric_rendering_forward", &volumetric_rendering_forward);
m.def("volumetric_rendering_backward", &volumetric_rendering_backward);
}
#include "include/helpers_cuda.h"
inline __device__ int cascaded_grid_idx_at(
const float x, const float y, const float z,
const int* resolution, const float* aabb
) {
// TODO(ruilongli): if the x, y, z is outside the aabb, it will be clipped into aabb!!! We should just return false
int ix = (int)(((x - aabb[0]) / (aabb[3] - aabb[0])) * resolution[0]);
int iy = (int)(((y - aabb[1]) / (aabb[4] - aabb[1])) * resolution[1]);
int iz = (int)(((z - aabb[2]) / (aabb[5] - aabb[2])) * resolution[2]);
ix = __clamp(ix, 0, resolution[0]-1);
iy = __clamp(iy, 0, resolution[1]-1);
iz = __clamp(iz, 0, resolution[2]-1);
int idx = ix * resolution[1] * resolution[2] + iy * resolution[2] + iz;
return idx;
}
inline __device__ bool grid_occupied_at(
const float x, const float y, const float z,
const int* resolution, const float* aabb, const bool* occ_binary
) {
int idx = cascaded_grid_idx_at(x, y, z, resolution, aabb);
return occ_binary[idx];
}
inline __device__ float distance_to_next_voxel(
float x, float y, float z,
float dir_x, float dir_y, float dir_z,
float idir_x, float idir_y, float idir_z,
const int* resolution
) { // dda like step
// scale the position into per-axis voxel units
x = resolution[0] * x;
y = resolution[1] * y;
z = resolution[2] * z;
float tx = ((floorf(x + 0.5f + 0.5f * __sign(dir_x)) - x) * idir_x) / resolution[0];
float ty = ((floorf(y + 0.5f + 0.5f * __sign(dir_y)) - y) * idir_y) / resolution[1];
float tz = ((floorf(z + 0.5f + 0.5f * __sign(dir_z)) - z) * idir_z) / resolution[2];
float t = min(min(tx, ty), tz);
return fmaxf(t, 0.0f);
}
inline __device__ float advance_to_next_voxel(
float t,
float x, float y, float z,
float dir_x, float dir_y, float dir_z,
float idir_x, float idir_y, float idir_z,
const int* resolution, float dt_min) {
// Regular stepping (may be slower but matches non-empty space)
float t_target = t + distance_to_next_voxel(
x, y, z, dir_x, dir_y, dir_z, idir_x, idir_y, idir_z, resolution
);
do {
t += dt_min;
} while (t < t_target);
return t;
}
__global__ void kernel_raymarching(
// rays info
const uint32_t n_rays,
const float* rays_o, // shape (n_rays, 3)
const float* rays_d, // shape (n_rays, 3)
const float* t_min, // shape (n_rays,)
const float* t_max, // shape (n_rays,)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int* resolution, // [reso_x, reso_y, reso_z]
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// sampling
const int max_total_samples,
const int max_per_ray_samples,
const float dt,
// writable helpers
int* steps_counter,
int* rays_counter,
// frustum outputs
int* packed_info,
float* frustum_origins,
float* frustum_dirs,
float* frustum_starts,
float* frustum_ends
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
const float near = t_min[0], far = t_max[0];
uint32_t ray_idx, base, marching_samples;
uint32_t j;
float t0, t1, t_mid;
// first pass to compute an accurate number of steps
j = 0;
t0 = near; // TODO(ruilongli): perturb `near` as in ngp_pl?
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
while (t_mid < far && j < max_per_ray_samples) {
// current center
const float x = ox + t_mid * dx;
const float y = oy + t_mid * dy;
const float z = oz + t_mid * dz;
if (grid_occupied_at(x, y, z, resolution, aabb, occ_binary)) {
++j;
// march to next sample
t0 = t1;
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
}
else {
// march to next sample
t_mid = advance_to_next_voxel(
t_mid, x, y, z, dx, dy, dz, rdx, rdy, rdz, resolution, dt
);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
}
}
if (j == 0) return;
marching_samples = j;
base = atomicAdd(steps_counter, marching_samples);
if (base + marching_samples > max_total_samples) return;
ray_idx = atomicAdd(rays_counter, 1);
// locate
frustum_origins += base * 3;
frustum_dirs += base * 3;
frustum_starts += base;
frustum_ends += base;
// Second round
j = 0;
t0 = near;
t1 = t0 + dt;
t_mid = (t0 + t1) / 2.;
while (t_mid < far && j < marching_samples) {
// current center
const float x = ox + t_mid * dx;
const float y = oy + t_mid * dy;
const float z = oz + t_mid * dz;
if (grid_occupied_at(x, y, z, resolution, aabb, occ_binary)) {
frustum_origins[j * 3 + 0] = ox;
frustum_origins[j * 3 + 1] = oy;
frustum_origins[j * 3 + 2] = oz;
frustum_dirs[j * 3 + 0] = dx;
frustum_dirs[j * 3 + 1] = dy;
frustum_dirs[j * 3 + 2] = dz;
frustum_starts[j] = t0;
frustum_ends[j] = t1;
++j;
// march to next sample
t0 = t1;
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
}
else {
// march to next sample
t_mid = advance_to_next_voxel(
t_mid, x, y, z, dx, dy, dz, rdx, rdy, rdz, resolution, dt
);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
}
}
packed_info[ray_idx * 3 + 0] = i; // ray idx in {rays_o, rays_d}
packed_info[ray_idx * 3 + 1] = base; // point idx start.
packed_info[ray_idx * 3 + 2] = j; // point idx shift (actual marching samples).
return;
}
/**
 * @brief Sample points by ray marching.
 *
 * @param rays_o Ray origins. Shape of [n_rays, 3].
 * @param rays_d Normalized ray directions. Shape of [n_rays, 3].
 * @param t_min Near planes of rays. Shape of [n_rays].
 * @param t_max Far planes of rays. Shape of [n_rays].
 * @param aabb Scene AABB [xmin, ymin, zmin, xmax, ymax, zmax]. Shape of [6].
 * @param resolution Occupancy grid resolution [reso_x, reso_y, reso_z]. Shape of [3].
 * @param occ_binary Occupancy grid binary values. Shape of [reso_x, reso_y, reso_z].
 * @param max_total_samples Maximum total number of samples in this batch.
 * @param max_per_ray_samples Maximum number of samples per ray.
 * @param dt Marching step size.
 * @return std::vector<torch::Tensor>
 * - packed_info: Stores how to index the ray samples from the returned values.
 *   Shape of [n_rays, 3]. First value is the ray index. Second value is the sample
 *   start index in the results for this ray. Third value is the number of samples for
 *   this ray. Note that rays with zero samples are skipped, so `packed_info` has some
 *   zero padding at the end.
 * - origins: Ray origins for those samples. [max_total_samples, 3]
 * - dirs: Ray directions for those samples. [max_total_samples, 3]
 * - starts: Where the frustum-shape sample starts along a ray. [max_total_samples, 1]
 * - ends: Where the frustum-shape sample ends along a ray. [max_total_samples, 1]
 */
std::vector<torch::Tensor> ray_marching(
// rays
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor t_min,
const torch::Tensor t_max,
// density grid
const torch::Tensor aabb,
const torch::Tensor resolution,
const torch::Tensor occ_binary,
// sampling
const int max_total_samples,
const int max_per_ray_samples,
const float dt
) {
DEVICE_GUARD(rays_o);
CHECK_INPUT(rays_o);
CHECK_INPUT(rays_d);
CHECK_INPUT(t_min);
CHECK_INPUT(t_max);
CHECK_INPUT(aabb);
CHECK_INPUT(resolution);
CHECK_INPUT(occ_binary);
const int n_rays = rays_o.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// helper counter
torch::Tensor steps_counter = torch::zeros(
{1}, rays_o.options().dtype(torch::kInt32));
torch::Tensor rays_counter = torch::zeros(
{1}, rays_o.options().dtype(torch::kInt32));
// output frustum samples
torch::Tensor packed_info = torch::zeros(
{n_rays, 3}, rays_o.options().dtype(torch::kInt32)); // ray_id, sample_id, num_samples
torch::Tensor frustum_origins = torch::zeros({max_total_samples, 3}, rays_o.options());
torch::Tensor frustum_dirs = torch::zeros({max_total_samples, 3}, rays_o.options());
torch::Tensor frustum_starts = torch::zeros({max_total_samples, 1}, rays_o.options());
torch::Tensor frustum_ends = torch::zeros({max_total_samples, 1}, rays_o.options());
kernel_raymarching<<<blocks, threads>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution.data_ptr<int>(),
occ_binary.data_ptr<bool>(),
// sampling
max_total_samples,
max_per_ray_samples,
dt,
// writable helpers
steps_counter.data_ptr<int>(), // total samples.
rays_counter.data_ptr<int>(), // total rays.
packed_info.data_ptr<int>(),
frustum_origins.data_ptr<float>(),
frustum_dirs.data_ptr<float>(),
frustum_starts.data_ptr<float>(),
frustum_ends.data_ptr<float>()
);
return {packed_info, frustum_origins, frustum_dirs, frustum_starts, frustum_ends};
}
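Because `packed_info` is the only bridge between the flattened sample buffers and the rays they belong to, here is a small hedged sketch of how a consumer can group the output of `ray_marching` back into per-ray chunks (pure PyTorch, assuming only the shapes documented above):

```python
def unpack_samples(packed_info, frustum_starts, frustum_ends):
    """Group the flattened samples back into one (starts, ends) pair per ray."""
    per_ray = {}
    for ray_id, base, num in packed_info.tolist():
        if num == 0:  # zero padding at the end of packed_info
            continue
        per_ray[ray_id] = (
            frustum_starts[base:base + num, 0],
            frustum_ends[base:base + num, 0],
        )
    return per_ray
```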
#include "include/helpers_cuda.h"
template <typename scalar_t>
__global__ void volumetric_rendering_forward_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
const scalar_t* rgbs, // input rgb after activation
// should be all-zero initialized
scalar_t* accumulated_weight, // output
scalar_t* accumulated_depth, // output
scalar_t* accumulated_color, // output
bool* mask // output
) {
CUDA_GET_THREAD_ID(thread_id, n_rays);
// locate
const int i = packed_info[thread_id * 3 + 0]; // ray idx in {rays_o, rays_d}
const int base = packed_info[thread_id * 3 + 1]; // point idx start.
const int numsteps = packed_info[thread_id * 3 + 2]; // point idx shift.
if (numsteps == 0) return;
starts += base;
ends += base;
sigmas += base;
rgbs += base * 3;
accumulated_weight += i;
accumulated_depth += i;
accumulated_color += i * 3;
mask += i;
// accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
int j = 0;
for (; j < numsteps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t t = (ends[j] + starts[j]) * 0.5f;
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
const scalar_t weight = alpha * T;
accumulated_weight[0] += weight;
accumulated_depth[0] += weight * t;
accumulated_color[0] += weight * rgbs[j * 3 + 0];
accumulated_color[1] += weight * rgbs[j * 3 + 1];
accumulated_color[2] += weight * rgbs[j * 3 + 2];
T *= (1.f - alpha);
}
mask[0] = true;
}
template <typename scalar_t>
__global__ void volumetric_rendering_backward_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
const scalar_t* rgbs, // input rgb after activation
const scalar_t* accumulated_weight, // forward output
const scalar_t* accumulated_depth, // forward output
const scalar_t* accumulated_color, // forward output
const scalar_t* grad_weight, // input
const scalar_t* grad_depth, // input
const scalar_t* grad_color, // input
scalar_t* grad_sigmas, // output
scalar_t* grad_rgbs // output
) {
CUDA_GET_THREAD_ID(thread_id, n_rays);
// locate
const int i = packed_info[thread_id * 3 + 0]; // ray idx in {rays_o, rays_d}
const int base = packed_info[thread_id * 3 + 1]; // point idx start.
const int numsteps = packed_info[thread_id * 3 + 2]; // point idx shift.
if (numsteps == 0) return;
starts += base;
ends += base;
sigmas += base;
rgbs += base * 3;
grad_sigmas += base;
grad_rgbs += base * 3;
accumulated_weight += i;
accumulated_depth += i;
accumulated_color += i * 3;
grad_weight += i;
grad_depth += i;
grad_color += i * 3;
// backward of accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
int j = 0;
scalar_t r = 0, g = 0, b = 0, d = 0;
for (; j < numsteps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t t = (ends[j] + starts[j]) * 0.5f;
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
const scalar_t weight = alpha * T;
r += weight * rgbs[j * 3 + 0];
g += weight * rgbs[j * 3 + 1];
b += weight * rgbs[j * 3 + 2];
d += weight * t;
T *= (1.f - alpha);
grad_rgbs[j * 3 + 0] = grad_color[0] * weight;
grad_rgbs[j * 3 + 1] = grad_color[1] * weight;
grad_rgbs[j * 3 + 2] = grad_color[2] * weight;
grad_sigmas[j] = delta * (
grad_color[0] * (T * rgbs[j * 3 + 0] - (accumulated_color[0] - r)) +
grad_color[1] * (T * rgbs[j * 3 + 1] - (accumulated_color[1] - g)) +
grad_color[2] * (T * rgbs[j * 3 + 2] - (accumulated_color[2] - b)) +
grad_weight[0] * (1.f - accumulated_weight[0]) +
grad_depth[0] * (t * T - (accumulated_depth[0] - d))
);
}
}
/**
* @brief Volumetric Rendering: Accumulating samples in the forward pass.
* The inputs, except for `sigmas` and `rgbs`, are the outputs of our
* CUDA ray marching function in `raymarching.cu`.
*
* @param packed_info Stores how to index the ray samples from the returned values.
* Shape of [n_rays, 3]. First value is the ray index. Second value is the sample
* start index in the results for this ray. Third value is the number of samples for
* this ray. Note for rays that have zero samples, we simply skip them so the `packed_info`
* has some zero padding in the end.
* @param starts: Where the frustum-shape sample starts along a ray. [total_samples, 1]
* @param ends: Where the frustum-shape sample ends along a ray. [total_samples, 1]
* @param sigmas Densities at those samples. [total_samples, 1]
* @param rgbs RGBs at those samples. [total_samples, 3]
* @return std::vector<torch::Tensor>
* - accumulated_weight: Ray opacity. [n_rays, 1]
* - accumulated_depth: Ray depth. [n_rays, 1]
* - accumulated_color: Ray color. [n_rays, 3]
* - mask: Boolean value storing whether this ray has valid samples in `packed_info`. [n_rays]
*/
std::vector<torch::Tensor> volumetric_rendering_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
torch::Tensor rgbs
) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
CHECK_INPUT(rgbs);
TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 3);
TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 && sigmas.size(1) == 1);
TORCH_CHECK(rgbs.ndimension() == 2 && rgbs.size(1) == 3);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor accumulated_weight = torch::zeros({n_rays, 1}, sigmas.options());
torch::Tensor accumulated_depth = torch::zeros({n_rays, 1}, sigmas.options());
torch::Tensor accumulated_color = torch::zeros({n_rays, 3}, sigmas.options());
// The rays that are not skipped during sampling.
torch::Tensor mask = torch::zeros({n_rays}, sigmas.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_rendering_forward",
([&]
{ volumetric_rendering_forward_kernel<scalar_t><<<blocks, threads>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
rgbs.data_ptr<scalar_t>(),
accumulated_weight.data_ptr<scalar_t>(),
accumulated_depth.data_ptr<scalar_t>(),
accumulated_color.data_ptr<scalar_t>(),
mask.data_ptr<bool>()
);
}));
return {accumulated_weight, accumulated_depth, accumulated_color, mask};
}
/**
* @brief Volumetric Rendering: Accumulating samples in the backward pass.
*/
std::vector<torch::Tensor> volumetric_rendering_backward(
torch::Tensor accumulated_weight,
torch::Tensor accumulated_depth,
torch::Tensor accumulated_color,
torch::Tensor grad_weight,
torch::Tensor grad_depth,
torch::Tensor grad_color,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
torch::Tensor rgbs
) {
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_sigmas = torch::zeros(sigmas.sizes(), sigmas.options());
torch::Tensor grad_rgbs = torch::zeros(rgbs.sizes(), rgbs.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_rendering_backward",
([&]
{ volumetric_rendering_backward_kernel<scalar_t><<<blocks, threads>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
rgbs.data_ptr<scalar_t>(),
accumulated_weight.data_ptr<scalar_t>(),
accumulated_depth.data_ptr<scalar_t>(),
accumulated_color.data_ptr<scalar_t>(),
grad_weight.data_ptr<scalar_t>(),
grad_depth.data_ptr<scalar_t>(),
grad_color.data_ptr<scalar_t>(),
grad_sigmas.data_ptr<scalar_t>(),
grad_rgbs.data_ptr<scalar_t>()
);
}));
return {grad_sigmas, grad_rgbs};
}
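Since the backward kernel hand-derives the gradients from the running transmittance `T` and the suffix terms `accumulated_* - (partial sums)`, it is worth cross-checking it against autograd on a slow reference. A hedged sketch that reuses the `reference_render` helper sketched next to the autograd Function above; the `nerfacc.cuda` import path is an assumption, and tolerances are loose because the kernel stops accumulating once `T < 1e-4`:

```python
import torch

from nerfacc.cuda import VolumeRenderer  # assumed import path, matching the code above


def check_against_reference(packed_info, starts, ends, sigmas, rgbs, n_rays):
    sigmas = sigmas.clone().requires_grad_(True)
    rgbs = rgbs.clone().requires_grad_(True)

    # CUDA custom op: forward + hand-derived backward.
    w, d, c, _ = VolumeRenderer.apply(packed_info, starts, ends, sigmas, rgbs)
    (w.sum() + d.sum() + c.sum()).backward()
    grad_sigmas_cuda = sigmas.grad.clone()
    grad_rgbs_cuda = rgbs.grad.clone()
    sigmas.grad = None
    rgbs.grad = None

    # Slow autograd reference (reference_render is the PyTorch sketch shown earlier).
    w_ref, d_ref, c_ref = reference_render(packed_info, starts, ends, sigmas, rgbs, n_rays)
    (w_ref.sum() + d_ref.sum() + c_ref.sum()).backward()

    assert torch.allclose(c, c_ref, atol=1e-3)
    assert torch.allclose(grad_sigmas_cuda, sigmas.grad, atol=1e-3)
    assert torch.allclose(grad_rgbs_cuda, rgbs.grad, atol=1e-3)
```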
ninja
pybind11
--extra-index-url https://download.pytorch.org/whl/cu116
torch==1.12.1
-e .
from setuptools import find_packages, setup

setup(
    name="nerfacc",
    description="NeRF accelerated rendering",
    version="0.0.2",
    python_requires=">=3.9",
    packages=find_packages(exclude=("tests*",)),
)
conda create -n nerfacc python=3.9 -y
conda activate nerfacc
pip install -r requirements.txt