Unverified Commit 8dcfbad9 authored by Ruilong Li (李瑞龙), committed by GitHub

Reformat (#31)



* seems working

* contraction func in cuda

* Update type

* More type updates

* disable DDA for contraction

* update contraction performance in readme

* 360 data: Garden

* eval at max_steps

* add performance of 360 to readme

* fix contraction scaling

* tiny hot fix

* new volrend

* cleanup ray_marching.cu

* cleanup backend

* tests

* cleaning up Grid

* fix doc for grid base class

* check and fix for contraction

* test grid

* rendering and marching

* transmittance_compress verified

* rendering is indeed faster

* pipeline is working

* lego example

* cleanup

* cuda folder is cleaned up! finally!

* cuda formatting

* contraction verify

* upgrade grid

* test for ray marching

* pipeline

* ngp with contraction

* train_ngp runs but slow

* transmittance separated into two. Now NGP is as fast as before

* verified faster than before

* bug fix for contraction

* ngp contraction fix

* tiny cleanup

* contraction works! yay!

* contraction with tanh seems working

* minor update

* support alpha rendering

* absorb visibility to ray marching

* tiny import update

* get rid of contraction temperature

* doc for ContractionType

* doc for Grid

* doc for grid.py is done

* doc for ray marching

* rendering function

* fix doc for rendering

* doc for vol rend

* autosummary for utils

* fix autosummary line break

* utils docs

* api doc is done

* starting work on examples

* contraction for ngp is in python now

* further clean up examples

* mlp nerf is running

* dnerf is in

* update readme command

* merge

* disable pylint error for now

* reformatting and skip tests without cuda

* fix the type issue for contractiontype

* fix cuda attribute issue

* bump to 0.1.0
Co-authored-by: Matt Tancik <tancik@berkeley.edu>
parent a7611603
#include "include/helpers_cuda.h"
#include "include/helpers_math.h"
#include "include/helpers_contraction.h"
inline __device__ __host__ float calc_dt(
const float t, const float cone_angle,
const float dt_min, const float dt_max)
{
return clamp(t * cone_angle, dt_min, dt_max);
}
inline __device__ __host__ int grid_idx_at(
const float3 xyz_unit, const int3 grid_res)
{
// xyz should be always in [0, 1]^3.
int3 ixyz = make_int3(xyz_unit * make_float3(grid_res));
ixyz = clamp(ixyz, make_int3(0, 0, 0), grid_res - 1);
int3 grid_offset = make_int3(grid_res.y * grid_res.z, grid_res.z, 1);
int idx = dot(ixyz, grid_offset);
return idx;
}
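// Occupancy lookup: the query point is mapped into the unit cube by the selected
// contraction (plain ROI normalization for AABB, a scene-contraction function
// otherwise) and then tested against the binary grid. The AABB early-out below is
// only valid without contraction, where points outside the ROI are empty by definition.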
inline __device__ __host__ bool grid_occupied_at(
const float3 xyz,
const float3 roi_min, const float3 roi_max,
ContractionType type,
const int3 grid_res, const bool *grid_binary)
{
if (type == ContractionType::AABB &&
(xyz.x < roi_min.x || xyz.x > roi_max.x ||
xyz.y < roi_min.y || xyz.y > roi_max.y ||
xyz.z < roi_min.z || xyz.z > roi_max.z))
{
return false;
}
float3 xyz_unit = apply_contraction(
xyz, roi_min, roi_max, type);
int idx = grid_idx_at(xyz_unit, grid_res);
return grid_binary[idx];
}
// DDA-like step: distance (in t) from xyz to the next voxel boundary of the occupancy grid along dir.
inline __device__ __host__ float distance_to_next_voxel(
const float3 xyz, const float3 dir, const float3 inv_dir,
const float3 roi_min, const float3 roi_max, const int3 grid_res)
{
float3 _occ_res = make_float3(grid_res);
float3 _xyz = roi_to_unit(xyz, roi_min, roi_max) * _occ_res;
float3 txyz = ((floorf(_xyz + 0.5f + 0.5f * sign(dir)) - _xyz) * inv_dir) / _occ_res * (roi_max - roi_min);
float t = min(min(txyz.x, txyz.y), txyz.z);
return fmaxf(t, 0.0f);
}
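// Skip empty space: advance t in whole increments of dt_min until the next voxel
// boundary is passed, so skipped regions keep the same step quantization as
// occupied ones. The marching kernel below uses this only for the AABB case,
// since scene contraction warps the voxel boundaries and breaks the DDA step.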
inline __device__ __host__ float advance_to_next_voxel(
const float t, const float dt_min,
const float3 xyz, const float3 dir, const float3 inv_dir,
const float3 roi_min, const float3 roi_max, const int3 grid_res)
{
// Regular stepping (may be slower but matches non-empty space)
float t_target = t + distance_to_next_voxel(
xyz, dir, inv_dir, roi_min, roi_max, grid_res);
float _t = t;
do
{
_t += dt_min;
} while (_t < t_target);
return _t;
}
// -------------------------------------------------------------------------------
// Raymarching
// -------------------------------------------------------------------------------
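// This kernel is launched twice by the host function `ray_marching` below:
// pass 1 (packed_info == nullptr) only counts the samples kept per ray and writes
// num_steps; the host then builds packed_info = (cumsum - steps, steps) and
// allocates t_starts / t_ends; pass 2 repeats the identical traversal and writes
// the per-sample segment boundaries at the reserved offsets.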
__global__ void ray_marching_kernel(
// rays info
const uint32_t n_rays,
const float *rays_o, // shape (n_rays, 3)
const float *rays_d, // shape (n_rays, 3)
const float *t_min, // shape (n_rays,)
const float *t_max, // shape (n_rays,)
// occupancy grid & contraction
const float *roi,
const int3 grid_res,
const bool *grid_binary, // shape (reso_x, reso_y, reso_z)
const ContractionType type,
// sampling
const float step_size,
const float cone_angle,
const int *packed_info,
// first round outputs
int *num_steps,
// second round outputs
float *t_starts,
float *t_ends)
{
CUDA_GET_THREAD_ID(i, n_rays);
bool is_first_round = (packed_info == nullptr);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
if (is_first_round)
{
num_steps += i;
}
else
{
int base = packed_info[i * 2 + 0];
int steps = packed_info[i * 2 + 1];
t_starts += base;
t_ends += base;
}
const float3 origin = make_float3(rays_o[0], rays_o[1], rays_o[2]);
const float3 dir = make_float3(rays_d[0], rays_d[1], rays_d[2]);
const float3 inv_dir = 1.0f / dir;
const float near = t_min[0], far = t_max[0];
const float3 roi_min = make_float3(roi[0], roi[1], roi[2]);
const float3 roi_max = make_float3(roi[3], roi[4], roi[5]);
// TODO: compute dt_max from occ resolution.
float dt_min = step_size;
float dt_max = 1e10f;
int j = 0;
float t0 = near;
float dt = calc_dt(t0, cone_angle, dt_min, dt_max);
float t1 = t0 + dt;
float t_mid = (t0 + t1) * 0.5f;
while (t_mid < far)
{
// current center
const float3 xyz = origin + t_mid * dir;
if (grid_occupied_at(xyz, roi_min, roi_max, type, grid_res, grid_binary))
{
if (!is_first_round)
{
t_starts[j] = t0;
t_ends[j] = t1;
}
++j;
// march to next sample
t0 = t1;
t1 = t0 + calc_dt(t0, cone_angle, dt_min, dt_max);
t_mid = (t0 + t1) * 0.5f;
}
else
{
// march to next sample
switch (type)
{
case ContractionType::AABB:
// no contraction
t_mid = advance_to_next_voxel(
t_mid, dt_min, xyz, dir, inv_dir, roi_min, roi_max, grid_res);
dt = calc_dt(t_mid, cone_angle, dt_min, dt_max);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
break;
default:
// any type of scene contraction does not work with DDA.
t0 = t1;
t1 = t0 + calc_dt(t0, cone_angle, dt_min, dt_max);
t_mid = (t0 + t1) * 0.5f;
break;
}
}
}
if (is_first_round)
{
*num_steps = j;
}
return;
}
std::vector<torch::Tensor> ray_marching(
// rays
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor t_min,
const torch::Tensor t_max,
// occupancy grid & contraction
const torch::Tensor roi,
const torch::Tensor grid_binary,
const ContractionType type,
// sampling
const float step_size,
const float cone_angle)
{
DEVICE_GUARD(rays_o);
CHECK_INPUT(rays_o);
CHECK_INPUT(rays_d);
CHECK_INPUT(t_min);
CHECK_INPUT(t_max);
CHECK_INPUT(roi);
CHECK_INPUT(grid_binary);
TORCH_CHECK(rays_o.ndimension() == 2 & rays_o.size(1) == 3)
TORCH_CHECK(rays_d.ndimension() == 2 & rays_d.size(1) == 3)
TORCH_CHECK(t_min.ndimension() == 1)
TORCH_CHECK(t_max.ndimension() == 1)
TORCH_CHECK(roi.ndimension() == 1 & roi.size(0) == 6)
TORCH_CHECK(grid_binary.ndimension() == 3)
const int n_rays = rays_o.size(0);
const int3 grid_res = make_int3(
grid_binary.size(0), grid_binary.size(1), grid_binary.size(2));
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// helper counter
torch::Tensor num_steps = torch::zeros(
{n_rays}, rays_o.options().dtype(torch::kInt32));
// count number of samples per ray
ray_marching_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// occupancy grid & contraction
roi.data_ptr<float>(),
grid_res,
grid_binary.data_ptr<bool>(),
type,
// sampling
step_size,
cone_angle,
nullptr, /* packed_info */
// outputs
num_steps.data_ptr<int>(),
nullptr, /* t_starts */
nullptr /* t_ends */);
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
// output samples starts and ends
int total_steps = cum_steps[cum_steps.size(0) - 1].item<int>();
torch::Tensor t_starts = torch::zeros({total_steps, 1}, rays_o.options());
torch::Tensor t_ends = torch::zeros({total_steps, 1}, rays_o.options());
ray_marching_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// occupancy grid & contraction
roi.data_ptr<float>(),
grid_res,
grid_binary.data_ptr<bool>(),
type,
// sampling
step_size,
cone_angle,
packed_info.data_ptr<int>(),
// outputs
nullptr, /* num_steps */
t_starts.data_ptr<float>(),
t_ends.data_ptr<float>());
return {packed_info, t_starts, t_ends};
}
// -----------------------------------------------------------------------------
// Ray index for each sample
// -----------------------------------------------------------------------------
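// packed_info stores, for every ray, the offset of its first sample and its sample
// count; this kernel expands that into a flat per-sample ray index so per-sample
// buffers can be scattered back to their rays.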
__global__ void ray_indices_kernel(
// input
const int n_rays,
const int *packed_info,
// output
int *ray_indices)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
ray_indices += base;
for (int j = 0; j < steps; ++j)
{
ray_indices[j] = i;
}
}
torch::Tensor unpack_to_ray_indices(const torch::Tensor packed_info)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
const int n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
int n_samples = packed_info[n_rays - 1].sum(0).item<int>();
torch::Tensor ray_indices = torch::zeros(
{n_samples}, packed_info.options().dtype(torch::kInt32));
ray_indices_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
ray_indices.data_ptr<int>());
return ray_indices;
}
// ----------------------------------------------------------------------------
// Query the occupancy grid
// ----------------------------------------------------------------------------
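// Point-wise occupancy queries: each sample position is tested independently with
// grid_occupied_at (contraction into the unit cube, then a binary-grid lookup).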
__global__ void query_occ_kernel(
// rays info
const uint32_t n_samples,
const float *samples, // shape (n_samples, 3)
// occupancy grid & contraction
const float *roi,
const int3 grid_res,
const bool *grid_binary, // shape (reso_x, reso_y, reso_z)
const ContractionType type,
// outputs
bool *occs)
{
CUDA_GET_THREAD_ID(i, n_samples);
// locate
samples += i * 3;
occs += i;
const float3 roi_min = make_float3(roi[0], roi[1], roi[2]);
const float3 roi_max = make_float3(roi[3], roi[4], roi[5]);
const float3 xyz = make_float3(samples[0], samples[1], samples[2]);
*occs = grid_occupied_at(xyz, roi_min, roi_max, type, grid_res, grid_binary);
return;
}
torch::Tensor query_occ(
const torch::Tensor samples,
// occupancy grid & contraction
const torch::Tensor roi,
const torch::Tensor grid_binary,
const ContractionType type)
{
DEVICE_GUARD(samples);
CHECK_INPUT(samples);
const int n_samples = samples.size(0);
const int3 grid_res = make_int3(
grid_binary.size(0), grid_binary.size(1), grid_binary.size(2));
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_samples, threads);
torch::Tensor occs = torch::zeros(
{n_samples}, samples.options().dtype(torch::kBool));
query_occ_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_samples,
samples.data_ptr<float>(),
// grid
roi.data_ptr<float>(),
grid_res,
grid_binary.data_ptr<bool>(),
type,
// outputs
occs.data_ptr<bool>());
return occs;
}
#include "include/helpers_cuda.h"
template <typename scalar_t>
__global__ void rendering_forward_kernel(
const uint32_t n_rays,
const int *packed_info, // input ray & point indices.
const scalar_t *starts, // input start t
const scalar_t *ends, // input end t
const scalar_t *sigmas, // input density after activation
const scalar_t *alphas, // input alpha (opacity) values.
const scalar_t early_stop_eps, // transmittance threshold for early stop
// outputs: should be all-zero initialized
int *num_steps, // the number of valid steps for each ray
scalar_t *weights, // the rendering weight for each sample
bool *compact_selector // the samples for which we need to compute gradients
)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
if (alphas != nullptr)
{
// rendering with alpha
alphas += base;
}
else
{
// rendering with density
starts += base;
ends += base;
sigmas += base;
}
if (num_steps != nullptr)
{
num_steps += i;
}
if (weights != nullptr)
{
weights += base;
}
if (compact_selector != nullptr)
{
compact_selector += base;
}
// accumulated rendering
scalar_t T = 1.f;
int j = 0;
for (; j < steps; ++j)
{
if (T < early_stop_eps)
{
break;
}
scalar_t alpha;
if (alphas != nullptr)
{
// rendering with alpha
alpha = alphas[j];
}
else
{
// rendering with density
scalar_t delta = ends[j] - starts[j];
alpha = 1.f - __expf(-sigmas[j] * delta);
}
const scalar_t weight = alpha * T;
T *= (1.f - alpha);
if (weights != nullptr)
{
weights[j] = weight;
}
if (compact_selector != nullptr)
{
compact_selector[j] = true;
}
}
if (num_steps != nullptr)
{
*num_steps = j;
}
return;
}
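// Backward pass for w_j = alpha_j * T_j with T_j = prod_{k<j} (1 - alpha_k).
// At step j, `accum` holds sum_{k >= j} grad_w_k * w_k, so
//   (grad_w_j * T - accum) * delta_j
//     = delta_j * (grad_w_j * T_j * (1 - alpha_j) - sum_{k > j} grad_w_k * w_k),
// which is dL/dsigma_j for alpha_j = 1 - exp(-sigma_j * delta_j); dividing by
// (1 - alpha_j) instead of multiplying by delta_j gives dL/dalpha_j.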
template <typename scalar_t>
__global__ void rendering_backward_kernel(
const uint32_t n_rays,
const int *packed_info, // input ray & point indices.
const scalar_t *starts, // input start t
const scalar_t *ends, // input end t
const scalar_t *sigmas, // input density after activation
const scalar_t *alphas, // input alpha (opacity) values.
const scalar_t early_stop_eps, // transmittance threshold for early stop
const scalar_t *weights, // forward output
const scalar_t *grad_weights, // input gradients
// if alphas was given, we compute the gradients for alphas.
// otherwise, we compute the gradients for sigmas.
scalar_t *grad_sigmas, // output gradients
scalar_t *grad_alphas // output gradients
)
{
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0)
return;
if (alphas != nullptr)
{
// rendering with alpha
alphas += base;
grad_alphas += base;
}
else
{
// rendering with density
starts += base;
ends += base;
sigmas += base;
grad_sigmas += base;
}
weights += base;
grad_weights += base;
scalar_t accum = 0;
for (int j = 0; j < steps; ++j)
{
accum += grad_weights[j] * weights[j];
}
// backward of accumulated rendering
scalar_t T = 1.f;
for (int j = 0; j < steps; ++j)
{
if (T < early_stop_eps)
{
break;
}
scalar_t alpha;
if (alphas != nullptr)
{
// rendering with alpha
alpha = alphas[j];
grad_alphas[j] = (grad_weights[j] * T - accum) / fmaxf(1.f - alpha, 1e-10f);
}
else
{
// rendering with density
scalar_t delta = ends[j] - starts[j];
alpha = 1.f - __expf(-sigmas[j] * delta);
grad_sigmas[j] = (grad_weights[j] * T - accum) * delta;
}
accum -= grad_weights[j] * weights[j];
T *= (1.f - alpha);
}
}
std::vector<torch::Tensor> rendering_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
float early_stop_eps,
bool compression)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
if (compression)
{
// compress the samples to get rid of invisible ones.
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor compact_selector = torch::zeros(
{n_samples}, sigmas.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
// outputs
num_steps.data_ptr<int>(),
nullptr,
compact_selector.data_ptr<bool>()); }));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_packed_info, compact_selector};
}
else
{
// just do the forward rendering.
torch::Tensor weights = torch::zeros({n_samples}, sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
// outputs
nullptr,
weights.data_ptr<scalar_t>(),
nullptr); }));
return {weights};
}
}
torch::Tensor rendering_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas,
float early_stop_eps)
{
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_sigmas = torch::zeros(sigmas.sizes(), sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"rendering_backward",
([&]
{ rendering_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
nullptr, // alphas
early_stop_eps,
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
// outputs
grad_sigmas.data_ptr<scalar_t>(),
nullptr // alphas gradients
); }));
return grad_sigmas;
}
// -- rendering with alphas -- //
std::vector<torch::Tensor> rendering_alphas_forward(
torch::Tensor packed_info,
torch::Tensor alphas,
float early_stop_eps,
bool compression)
{
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(alphas);
TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
TORCH_CHECK(alphas.ndimension() == 2 & alphas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = alphas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
if (compression)
{
// compress the samples to get rid of invisible ones.
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor compact_selector = torch::zeros(
{n_samples}, alphas.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_alphas_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
// outputs
num_steps.data_ptr<int>(),
nullptr,
compact_selector.data_ptr<bool>()); }));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_selector, compact_packed_info};
}
else
{
// just do the forward rendering.
torch::Tensor weights = torch::zeros({n_samples}, alphas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_forward",
([&]
{ rendering_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
// outputs
nullptr,
weights.data_ptr<scalar_t>(),
nullptr); }));
return {weights};
}
}
torch::Tensor rendering_alphas_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor alphas,
float early_stop_eps)
{
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = alphas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_alphas = torch::zeros(alphas.sizes(), alphas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
alphas.scalar_type(),
"rendering_alphas_backward",
([&]
{ rendering_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
// inputs
packed_info.data_ptr<int>(),
nullptr, // starts
nullptr, // ends
nullptr, // sigmas
alphas.data_ptr<scalar_t>(),
early_stop_eps,
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
// outputs
nullptr, // sigma gradients
grad_alphas.data_ptr<scalar_t>()); }));
return grad_alphas;
}
#include <pybind11/pybind11.h>
#include "include/helpers_cuda.h"
inline __device__ int cascaded_grid_idx_at(
const float x, const float y, const float z,
const int resx, const int resy, const int resz,
const float* aabb
) {
int ix = (int)(((x - aabb[0]) / (aabb[3] - aabb[0])) * resx);
int iy = (int)(((y - aabb[1]) / (aabb[4] - aabb[1])) * resy);
int iz = (int)(((z - aabb[2]) / (aabb[5] - aabb[2])) * resz);
ix = __clamp(ix, 0, resx-1);
iy = __clamp(iy, 0, resy-1);
iz = __clamp(iz, 0, resz-1);
int idx = ix * resy * resz + iy * resz + iz;
return idx;
}
inline __device__ bool grid_occupied_at(
const float x, const float y, const float z,
const int resx, const int resy, const int resz,
const float* aabb, const bool* occ_binary
) {
if (x <= aabb[0] || x >= aabb[3] || y <= aabb[1] || y >= aabb[4] || z <= aabb[2] || z >= aabb[5]) {
return false;
}
int idx = cascaded_grid_idx_at(x, y, z, resx, resy, resz, aabb);
return occ_binary[idx];
}
inline __device__ float distance_to_next_voxel(
float x, float y, float z,
float dir_x, float dir_y, float dir_z,
float idir_x, float idir_y, float idir_z,
const int resx, const int resy, const int resz,
const float* aabb
) { // dda like step
// TODO: this is ugly -- optimize this.
float _x = ((x - aabb[0]) / (aabb[3] - aabb[0])) * resx;
float _y = ((y - aabb[1]) / (aabb[4] - aabb[1])) * resy;
float _z = ((z - aabb[2]) / (aabb[5] - aabb[2])) * resz;
float tx = ((floorf(_x + 0.5f + 0.5f * __sign(dir_x)) - _x) * idir_x) / resx * (aabb[3] - aabb[0]);
float ty = ((floorf(_y + 0.5f + 0.5f * __sign(dir_y)) - _y) * idir_y) / resy * (aabb[4] - aabb[1]);
float tz = ((floorf(_z + 0.5f + 0.5f * __sign(dir_z)) - _z) * idir_z) / resz * (aabb[5] - aabb[2]);
float t = min(min(tx, ty), tz);
return fmaxf(t, 0.0f);
}
inline __device__ float advance_to_next_voxel(
float t,
float x, float y, float z,
float dir_x, float dir_y, float dir_z,
float idir_x, float idir_y, float idir_z,
const int resx, const int resy, const int resz, const float* aabb,
float dt_min) {
// Regular stepping (may be slower but matches non-empty space)
float t_target = t + distance_to_next_voxel(
x, y, z,
dir_x, dir_y, dir_z,
idir_x, idir_y, idir_z,
resx, resy, resz, aabb
);
do {
t += dt_min;
} while (t < t_target);
return t;
}
__global__ void marching_steps_kernel(
// rays info
const uint32_t n_rays,
const float* rays_o, // shape (n_rays, 3)
const float* rays_d, // shape (n_rays, 3)
const float* t_min, // shape (n_rays,)
const float* t_max, // shape (n_rays,)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int resx,
const int resy,
const int resz,
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// sampling
const float dt,
// outputs
int* num_steps
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
num_steps += i;
const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
const float near = t_min[0], far = t_max[0];
int j = 0;
float t0 = near; // TODO(ruilongli): perturb `near` as in ngp_pl?
float t1 = t0 + dt;
float t_mid = (t0 + t1) * 0.5f;
while (t_mid < far) {
// current center
const float x = ox + t_mid * dx;
const float y = oy + t_mid * dy;
const float z = oz + t_mid * dz;
if (grid_occupied_at(x, y, z, resx, resy, resz, aabb, occ_binary)) {
++j;
// march to next sample
t0 = t1;
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
}
else {
// march to next sample
t_mid = advance_to_next_voxel(
t_mid, x, y, z, dx, dy, dz, rdx, rdy, rdz, resx, resy, resz, aabb, dt
);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
}
}
if (j == 0) return;
num_steps[0] = j;
return;
}
__global__ void marching_forward_kernel(
// rays info
const uint32_t n_rays,
const float* rays_o, // shape (n_rays, 3)
const float* rays_d, // shape (n_rays, 3)
const float* t_min, // shape (n_rays,)
const float* t_max, // shape (n_rays,)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int resx,
const int resy,
const int resz,
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// sampling
const float dt,
const int* packed_info,
// frustum outputs
float* frustum_starts,
float* frustum_ends
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
rays_o += i * 3;
rays_d += i * 3;
t_min += i;
t_max += i;
int base = packed_info[i * 2 + 0];
int steps = packed_info[i * 2 + 1];
const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
const float near = t_min[0], far = t_max[0];
// locate
frustum_starts += base;
frustum_ends += base;
int j = 0;
float t0 = near;
float t1 = t0 + dt;
float t_mid = (t0 + t1) / 2.;
while (t_mid < far) {
// current center
const float x = ox + t_mid * dx;
const float y = oy + t_mid * dy;
const float z = oz + t_mid * dz;
if (grid_occupied_at(x, y, z, resx, resy, resz, aabb, occ_binary)) {
frustum_starts[j] = t0;
frustum_ends[j] = t1;
++j;
// march to next sample
t0 = t1;
t1 = t0 + dt;
t_mid = (t0 + t1) * 0.5f;
}
else {
// march to next sample
t_mid = advance_to_next_voxel(
t_mid, x, y, z, dx, dy, dz, rdx, rdy, rdz, resx, resy, resz, aabb, dt
);
t0 = t_mid - dt * 0.5f;
t1 = t_mid + dt * 0.5f;
}
}
if (j != steps) {
printf("WTF %d v.s. %d\n", j, steps);
}
return;
}
__global__ void ray_indices_kernel(
// input
const int n_rays,
const int* packed_info,
// output
int* ray_indices
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
ray_indices += base;
for (int j = 0; j < steps; ++j) {
ray_indices[j] = i;
}
}
__global__ void occ_query_kernel(
// rays info
const uint32_t n_samples,
const float* samples, // shape (n_samples, 3)
// density grid
const float* aabb, // [min_x, min_y, min_z, max_x, max_y, max_z]
const int resx,
const int resy,
const int resz,
const bool* occ_binary, // shape (reso_x, reso_y, reso_z)
// outputs
bool* occs
) {
CUDA_GET_THREAD_ID(i, n_samples);
// locate
samples += i * 3;
occs += i;
occs[0] = grid_occupied_at(
samples[0], samples[1], samples[2],
resx, resy, resz, aabb, occ_binary
);
return;
}
std::vector<torch::Tensor> volumetric_marching(
// rays
const torch::Tensor rays_o,
const torch::Tensor rays_d,
const torch::Tensor t_min,
const torch::Tensor t_max,
// density grid
const torch::Tensor aabb,
const pybind11::list resolution,
const torch::Tensor occ_binary,
// sampling
const float dt
) {
DEVICE_GUARD(rays_o);
CHECK_INPUT(rays_o);
CHECK_INPUT(rays_d);
CHECK_INPUT(t_min);
CHECK_INPUT(t_max);
CHECK_INPUT(aabb);
CHECK_INPUT(occ_binary);
const int n_rays = rays_o.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// helper counter
torch::Tensor num_steps = torch::zeros(
{n_rays}, rays_o.options().dtype(torch::kInt32));
// count number of samples per ray
marching_steps_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution[0].cast<int>(),
resolution[1].cast<int>(),
resolution[2].cast<int>(),
occ_binary.data_ptr<bool>(),
// sampling
dt,
// outputs
num_steps.data_ptr<int>()
);
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
// std::cout << "num_steps" << num_steps.dtype() << std::endl;
// std::cout << "cum_steps" << cum_steps.dtype() << std::endl;
// std::cout << "packed_info" << packed_info.dtype() << std::endl;
// output frustum samples
int total_steps = cum_steps[cum_steps.size(0) - 1].item<int>();
torch::Tensor frustum_starts = torch::zeros({total_steps, 1}, rays_o.options());
torch::Tensor frustum_ends = torch::zeros({total_steps, 1}, rays_o.options());
marching_forward_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
// rays
n_rays,
rays_o.data_ptr<float>(),
rays_d.data_ptr<float>(),
t_min.data_ptr<float>(),
t_max.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution[0].cast<int>(),
resolution[1].cast<int>(),
resolution[2].cast<int>(),
occ_binary.data_ptr<bool>(),
// sampling
dt,
packed_info.data_ptr<int>(),
// outputs
frustum_starts.data_ptr<float>(),
frustum_ends.data_ptr<float>()
);
return {packed_info, frustum_starts, frustum_ends};
}
torch::Tensor unpack_to_ray_indices(const torch::Tensor packed_info) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
const int n_rays = packed_info.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
int n_samples = packed_info[n_rays - 1].sum(0).item<int>();
torch::Tensor ray_indices = torch::zeros(
{n_samples}, packed_info.options().dtype(torch::kInt32));
ray_indices_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
ray_indices.data_ptr<int>()
);
return ray_indices;
}
torch::Tensor query_occ(
const torch::Tensor samples,
// density grid
const torch::Tensor aabb,
const pybind11::list resolution,
const torch::Tensor occ_binary
) {
DEVICE_GUARD(samples);
CHECK_INPUT(samples);
const int n_samples = samples.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_samples, threads);
torch::Tensor occs = torch::zeros(
{n_samples}, samples.options().dtype(torch::kBool));
occ_query_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_samples,
samples.data_ptr<float>(),
// density grid
aabb.data_ptr<float>(),
resolution[0].cast<int>(),
resolution[1].cast<int>(),
resolution[2].cast<int>(),
occ_binary.data_ptr<bool>(),
// outputs
occs.data_ptr<bool>()
);
return occs;
}
#include "include/helpers_cuda.h"
template <typename scalar_t>
__global__ void volumetric_rendering_steps_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
// output: should be all zero (false) initialized
int* num_steps,
bool* selector
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
starts += base;
ends += base;
sigmas += base;
num_steps += i;
selector += base;
// accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
int j = 0;
for (; j < steps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
const scalar_t weight = alpha * T;
T *= (1.f - alpha);
selector[j] = true;
}
num_steps[0] = j;
return;
}
template <typename scalar_t>
__global__ void volumetric_rendering_weights_forward_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
// should be all-zero initialized
scalar_t* weights // output
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
starts += base;
ends += base;
sigmas += base;
weights += base;
// accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
for (int j = 0; j < steps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
const scalar_t weight = alpha * T;
weights[j] = weight;
T *= (1.f - alpha);
}
}
template <typename scalar_t>
__global__ void volumetric_rendering_weights_backward_kernel(
const uint32_t n_rays,
const int* packed_info, // input ray & point indices.
const scalar_t* starts, // input start t
const scalar_t* ends, // input end t
const scalar_t* sigmas, // input density after activation
const scalar_t* weights, // forward output
const scalar_t* grad_weights, // input
scalar_t* grad_sigmas // output
) {
CUDA_GET_THREAD_ID(i, n_rays);
// locate
const int base = packed_info[i * 2 + 0]; // point idx start.
const int steps = packed_info[i * 2 + 1]; // point idx shift.
if (steps == 0) return;
starts += base;
ends += base;
sigmas += base;
weights += base;
grad_weights += base;
grad_sigmas += base;
scalar_t accum = 0;
for (int j = 0; j < steps; ++j) {
accum += grad_weights[j] * weights[j];
}
// backward of accumulated rendering
scalar_t T = 1.f;
scalar_t EPSILON = 1e-4f;
for (int j = 0; j < steps; ++j) {
if (T < EPSILON) {
break;
}
const scalar_t delta = ends[j] - starts[j];
const scalar_t alpha = 1.f - __expf(-sigmas[j] * delta);
grad_sigmas[j] = delta * (grad_weights[j] * T - accum);
accum -= grad_weights[j] * weights[j];
T *= (1.f - alpha);
}
}
std::vector<torch::Tensor> volumetric_rendering_steps(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas
) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
torch::Tensor num_steps = torch::zeros({n_rays}, packed_info.options());
torch::Tensor selector = torch::zeros({n_samples}, packed_info.options().dtype(torch::kBool));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_marching_steps",
([&]
{ volumetric_rendering_steps_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
num_steps.data_ptr<int>(),
selector.data_ptr<bool>()
);
}));
torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
torch::Tensor compact_packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);
return {compact_packed_info, selector};
}
torch::Tensor volumetric_rendering_weights_forward(
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas
) {
DEVICE_GUARD(packed_info);
CHECK_INPUT(packed_info);
CHECK_INPUT(starts);
CHECK_INPUT(ends);
CHECK_INPUT(sigmas);
TORCH_CHECK(packed_info.ndimension() == 2 & packed_info.size(1) == 2);
TORCH_CHECK(starts.ndimension() == 2 & starts.size(1) == 1);
TORCH_CHECK(ends.ndimension() == 2 & ends.size(1) == 1);
TORCH_CHECK(sigmas.ndimension() == 2 & sigmas.size(1) == 1);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor weights = torch::zeros({n_samples}, sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_rendering_weights_forward",
([&]
{ volumetric_rendering_weights_forward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
weights.data_ptr<scalar_t>()
);
}));
return weights;
}
torch::Tensor volumetric_rendering_weights_backward(
torch::Tensor weights,
torch::Tensor grad_weights,
torch::Tensor packed_info,
torch::Tensor starts,
torch::Tensor ends,
torch::Tensor sigmas
) {
DEVICE_GUARD(packed_info);
const uint32_t n_rays = packed_info.size(0);
const uint32_t n_samples = sigmas.size(0);
const int threads = 256;
const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
// outputs
torch::Tensor grad_sigmas = torch::zeros(sigmas.sizes(), sigmas.options());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
sigmas.scalar_type(),
"volumetric_rendering_weights_backward",
([&]
{ volumetric_rendering_weights_backward_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
n_rays,
packed_info.data_ptr<int>(),
starts.data_ptr<scalar_t>(),
ends.data_ptr<scalar_t>(),
sigmas.data_ptr<scalar_t>(),
weights.data_ptr<scalar_t>(),
grad_weights.data_ptr<scalar_t>(),
grad_sigmas.data_ptr<scalar_t>()
);
}));
return grad_sigmas;
}
from typing import Callable, Optional, Tuple
import torch
from .grid import Grid
from .ray_marching import ray_marching, unpack_to_ray_indices
from .vol_rendering import accumulate_along_rays, render_weight_from_density
def rendering(
# radiance field
rgb_sigma_fn: Callable,
# ray marching results
packed_info: torch.Tensor,
t_starts: torch.Tensor,
t_ends: torch.Tensor,
# rendering options
early_stop_eps: float = 1e-4,
render_bkgd: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Render the rays through the radience field defined by `rgb_sigma_fn`.
This function is differentiable with respect to the outputs of `rgb_sigma_fn`, so it can be used for
gradient-based optimization.
Warning:
This function is not differentiable with respect to `t_starts` and `t_ends`.
Args:
rgb_sigma_fn: A function that takes in samples {t_starts (N, 1), t_ends (N, 1), \
ray indices (N,)} and returns the post-activation rgb (N, 3) and density \
values (N, 1).
packed_info: Packed ray marching info. See :func:`ray_marching` for details.
t_starts: Per-sample start distance. Tensor with shape (n_samples, 1).
t_ends: Per-sample end distance. Tensor with shape (n_samples, 1).
early_stop_eps: Early stop threshold during transmittance accumulation. Default: 1e-4.
render_bkgd: Optional. Background color. Tensor with shape (3,).
Returns:
Ray colors (n_rays, 3), opacities (n_rays, 1) and depths (n_rays, 1).
Examples:
.. code-block:: python
import torch
from nerfacc import OccupancyGrid, ray_marching, rendering
device = "cuda:0"
batch_size = 128
rays_o = torch.rand((batch_size, 3), device=device)
rays_d = torch.randn((batch_size, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
# Ray marching.
packed_info, t_starts, t_ends = ray_marching(
rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
)
# Rendering.
def rgb_sigma_fn(t_starts, t_ends, ray_indices):
# This is a dummy function that returns random values.
rgbs = torch.rand((t_starts.shape[0], 3), device=device)
sigmas = torch.rand((t_starts.shape[0], 1), device=device)
return rgbs, sigmas
colors, opacities, depths = rendering(rgb_sigma_fn, packed_info, t_starts, t_ends)
# torch.Size([128, 3]) torch.Size([128, 1]) torch.Size([128, 1])
print(colors.shape, opacities.shape, depths.shape)
"""
n_rays = packed_info.shape[0]
ray_indices = unpack_to_ray_indices(packed_info)
# Query sigma and color with gradients
rgbs, sigmas = rgb_sigma_fn(t_starts, t_ends, ray_indices)
assert rgbs.shape[-1] == 3, "rgbs must have 3 channels, got {}".format(
rgbs.shape
)
assert (
sigmas.shape == t_starts.shape
), "sigmas must have shape of (N, 1)! Got {}".format(sigmas.shape)
# Rendering: compute weights and ray indices.
weights = render_weight_from_density(
packed_info, t_starts, t_ends, sigmas, early_stop_eps
)
# Rendering: accumulate rgbs, opacities, and depths along the rays.
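    # Summing the weights alone (values=None) yields per-ray opacity; weighting the
    # segment midpoints (t_starts + t_ends) / 2 yields the expected depth.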
colors = accumulate_along_rays(
weights, ray_indices, values=rgbs, n_rays=n_rays
)
opacities = accumulate_along_rays(
weights, ray_indices, values=None, n_rays=n_rays
)
depths = accumulate_along_rays(
weights,
ray_indices,
values=(t_starts + t_ends) / 2.0,
n_rays=n_rays,
)
# Background composition.
if render_bkgd is not None:
colors = colors + render_bkgd * (1.0 - opacities)
return colors, opacities, depths
def volumetric_rendering(
# radiance field
sigma_fn: Callable,
rgb_sigma_fn: Callable,
# rays
rays_o: torch.Tensor,
rays_d: torch.Tensor,
t_min: Optional[torch.Tensor] = None,
t_max: Optional[torch.Tensor] = None,
# bounding box of the scene
scene_aabb: Optional[torch.Tensor] = None,
# grid for skipping samples
grid: Optional[Grid] = None,
# rendering options
near_plane: Optional[float] = None,
far_plane: Optional[float] = None,
render_step_size: float = 1e-3,
stratified: bool = False,
cone_angle: float = 0.0,
early_stop_eps: float = 1e-4,
render_bkgd: Optional[torch.Tensor] = None,
return_extra_info: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, int, int]:
"""Differentiable volumetric rendering pipeline.
This function combines the following individual functions:
- ray_aabb_intersect: ray AABB intersection.
- ray_marching: ray marching with grid-based skipping.
- compute_weights: compute transmittance and compress samples.
- accumulate_along_rays: accumulate samples along rays to get final per-ray RGB etc.
Args:
sigma_fn: A function that takes in samples {t_starts (N, 1), t_ends (N, 1),
ray indices (N,)} and returns the post-activation density values (N, 1).
rgb_sigma_fn: A function that takes in samples {t_starts (N, 1), t_ends (N, 1),
ray indices (N,)} and returns the post-activation rgb (N, 3) and density
values (N, 1).
rays_o: Ray origins. Tensor with shape (n_rays, 3).
rays_d: Normalized ray directions. Tensor with shape (n_rays, 3).
t_min: Optional. Per-ray minimum distance. Tensor with shape (n_rays).
t_max: Optional. Per-ray maximum distance. Tensor with shape (n_rays).
scene_aabb: Optional. Scene bounding box for computing t_min and t_max.
A tensor with shape (6,) {xmin, ymin, zmin, xmax, ymax, zmax}.
scene_aabb will be ignored if both t_min and t_max are provided.
grid: Optional. Grid that indicates where to skip during marching.
See :class:`nerfacc.Grid` for details.
near_plane: Optional. Near plane distance. If provided, it will be used
to clip t_min.
far_plane: Optional. Far plane distance. If provided, it will be used
to clip t_max.
render_step_size: Step size for marching. Default: 1e-3.
stratified: Whether to use stratified sampling. Default: False.
cone_angle: Cone angle for linearly-increased step size. 0. means
constant step size. Default: 0.0.
early_stop_eps: Early stop threshold for marching. Default: 1e-4.
render_bkgd: Optional. Background color. If provided, it will be used
to fill the background. Default: None.
return_extra_info: Whether to return extra info. Default: False.
Returns:
Ray colors (n_rays, 3), opacities (n_rays, 1) and depths (n_rays, 1).
If return_extra_info is True, it will also return a dictionary of extra info,
including:
- "n_marching_samples": Total number of samples kept after marching.
- "n_rendering_samples": Total number of samples used for actual rendering.
"""
assert rays_o.shape == rays_d.shape and rays_o.dim() == 2, "Invalid rays."
n_rays = rays_o.shape[0]
rays_o = rays_o.contiguous()
rays_d = rays_d.contiguous()
extra_info = {}
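    # Sample generation below runs without gradients; gradients flow only through
    # the radiance-field queries inside `rendering` further down.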
with torch.no_grad():
# Ray marching with skipping.
packed_info, t_starts, t_ends = ray_marching(
rays_o,
rays_d,
t_min=t_min,
t_max=t_max,
scene_aabb=scene_aabb,
grid=grid,
sigma_fn=sigma_fn,
early_stop_eps=early_stop_eps,
near_plane=near_plane,
far_plane=far_plane,
render_step_size=render_step_size,
stratified=stratified,
cone_angle=cone_angle,
)
extra_info["n_rendering_samples"] = len(t_starts)
colors, opacities, depths = rendering(
rgb_sigma_fn,
packed_info=packed_info,
t_starts=t_starts,
t_ends=t_ends,
early_stop_eps=early_stop_eps,
render_bkgd=render_bkgd,
)
if return_extra_info:
return colors, opacities, depths, extra_info
else:
return colors, opacities, depths
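A minimal usage sketch of the pipeline above, for orientation only: it assumes `volumetric_rendering` is importable from the `nerfacc` package, uses dummy radiance-field callbacks instead of a real network, and omits the occupancy `grid` argument (assuming the Python `ray_marching` wrapper tolerates `grid=None`, so no space skipping happens). It is not an example shipped with this commit.

import torch

# Import path is assumed; adjust to wherever `volumetric_rendering` is exported.
from nerfacc import volumetric_rendering

device = "cuda:0"
rays_o = torch.rand((128, 3), device=device)
rays_d = torch.randn((128, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
scene_aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device=device)

def sigma_fn(t_starts, t_ends, ray_indices):
    # Dummy densities; a real model would query a network here.
    return torch.rand_like(t_starts)

def rgb_sigma_fn(t_starts, t_ends, ray_indices):
    # Dummy colors and densities with the expected (N, 3) / (N, 1) shapes.
    return torch.rand((t_starts.shape[0], 3), device=device), torch.rand_like(t_starts)

colors, opacities, depths = volumetric_rendering(
    sigma_fn,
    rgb_sigma_fn,
    rays_o,
    rays_d,
    scene_aabb=scene_aabb,
    render_step_size=1e-3,
    render_bkgd=torch.ones(3, device=device),
)
print(colors.shape, opacities.shape, depths.shape)  # (128, 3), (128, 1), (128, 1)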
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "nerfacc"
version = "0.0.9"
version = "0.1.0"
authors = [{name = "Ruilong", email = "ruilongli94@gmail.com"}]
license = { text="MIT" }
requires-python = ">=3.8"
@@ -35,6 +35,11 @@ dev = [
[tool.black]
line-length = 80
[tool.isort]
multi_line_output = 3
line_length = 80
include_trailing_comma = true
# pylint
[tool.pylint.messages_control]
max-line-length = 80
import torch
import tqdm
from nerfacc import volumetric_rendering_pipeline
device = "cuda:0"
def sigma_fn(frustum_starts, frustum_ends, ray_indices):
return torch.rand_like(frustum_ends[:, :1])
def rgb_sigma_fn(frustum_starts, frustum_ends, ray_indices):
return torch.rand(
(frustum_ends.shape[0], 3), device=device
), torch.rand_like(frustum_ends)
def test_rendering():
scene_aabb = torch.tensor([0, 0, 0, 1, 1, 1], device=device).float()
scene_resolution = [128, 128, 128]
scene_occ_binary = torch.ones((128 * 128 * 128), device=device).bool()
rays_o = torch.rand((10000, 3), device=device)
rays_d = torch.randn((10000, 3), device=device)
rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
render_bkgd = torch.ones(3, device=device)
for step in tqdm.tqdm(range(1000)):
volumetric_rendering_pipeline(
sigma_fn,
rgb_sigma_fn,
rays_o,
rays_d,
scene_aabb,
scene_resolution,
scene_occ_binary,
render_bkgd,
render_step_size=1e-3,
near_plane=0.0,
stratified=False,
)
if __name__ == "__main__":
test_rendering()
import torch
import tqdm
from nerfacc import OccupancyField
device = "cuda:0"
def occ_eval_fn(positions: torch.Tensor) -> torch.Tensor:
return torch.rand_like(positions[:, :1])
def test_occ_field():
occ_field = OccupancyField(occ_eval_fn, aabb=[0, 0, 0, 1, 1, 1]).to(device)
for step in tqdm.tqdm(range(50000)):
occ_field.every_n_step(step, occ_thre=0.1)
if __name__ == "__main__":
test_occ_field()