Commit b19fe1de authored by Christoph Lassner's avatar Christoph Lassner Committed by Facebook GitHub Bot
Browse files

pulsar integration.

Summary:
This diff integrates the pulsar renderer source code into PyTorch3D as an alternative backend for the PyTorch3D point renderer. This diff is the first of a series of three diffs to complete that migration and focuses on the packaging and integration of the source code.

For more information about the pulsar backend, see the release notes and the paper (https://arxiv.org/abs/2004.07484). For information on how to use the backend, see the point cloud rendering notebook and the examples in the folder `docs/examples`.

Tasks addressed in the following diffs:
* Add the PyTorch3D interface,
* Add notebook examples and documentation (or adapt the existing ones to feature both interfaces).

Reviewed By: nikhilaravi

Differential Revision: D23947736

fbshipit-source-id: a5e77b53e6750334db22aefa89b4c079cda1b443
parent d5650323
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
"""
Minimal demo of the direct pulsar sphere renderer interface.

Ten spheres with random positions, colors and radii are rendered from a
fixed camera and the result is saved as `basic.png`.
"""
from os import path
import imageio
import torch
from pytorch3d.renderer.points.pulsar import Renderer

n_points = 10
width = 1_000
height = 1_000
device = torch.device("cuda")
renderer = Renderer(width, height, n_points).to(device)
# Random scene content: positions uniform in a 10x10x10 cube, shifted so
# the spheres sit in front of the camera (z >= 25, x/y centered on 0).
vert_pos = torch.rand(n_points, 3, dtype=torch.float32, device=device) * 10.0
vert_pos[:, 2] += 25.0
vert_pos[:, :2] -= 5.0
# Colors and radii uniform in [0, 1).
vert_col = torch.rand(n_points, 3, dtype=torch.float32, device=device)
vert_rad = torch.rand(n_points, dtype=torch.float32, device=device)
# Camera: position (x, y, z), axis-angle rotation (x, y, z), focal length
# and sensor size, both in world units.
cam_params = torch.tensor(
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 2.0],
    dtype=torch.float32,
    device=device,
)
# Render with blending parameter gamma (valid range [1., 1e-5]) and a
# maximum depth of 45 world units.
image = renderer(
    vert_pos,
    vert_col,
    vert_rad,
    cam_params,
    1.0e-1,
    45.0,
)
print("Writing image to `%s`." % (path.abspath("basic.png")))
imageio.imsave("basic.png", (image.cpu().detach() * 255.0).to(torch.uint8).numpy())
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
"""
This example demonstrates camera parameter optimization with the plain
pulsar interface. For this, a reference image has been pre-generated
(you can find it at `../../tests/pulsar/reference/examples_TestRenderer_test_cam.png`).
The same scene parameterization is loaded and the camera parameters
distorted. Gradient-based optimization is used to converge towards the
original camera parameters.
"""
from os import path
import cv2
import imageio
import numpy as np
import torch
from pytorch3d.renderer.points.pulsar import Renderer
from torch import nn, optim
n_points = 20  # Number of spheres in the scene.
width = 1_000  # Rendered image width in pixels.
height = 1_000  # Rendered image height in pixels.
device = torch.device("cuda")  # Model and tensors are placed on the GPU.
class SceneModel(nn.Module):
    """
    Scene description for camera-parameter optimization with pulsar.

    Sphere positions (vert_pos), colors (vert_col) and radii (vert_rad)
    are fixed, non-trainable parameters. The camera position (cam_pos),
    its rotation in the 6D representation (cam_rot) and the sensor
    parameters (cam_sensor) are trainable. `forward` renders the scene
    with the current camera estimate.
    """

    def __init__(self):
        super().__init__()
        self.gamma = 0.1
        # Fixed scene content, reproducible through the fixed seed.
        torch.manual_seed(1)
        positions = torch.rand(n_points, 3, dtype=torch.float32) * 10.0
        positions[:, 2] += 25.0
        positions[:, :2] -= 5.0
        self.vert_pos = nn.Parameter(positions, requires_grad=False)
        self.vert_col = nn.Parameter(
            torch.rand(n_points, 3, dtype=torch.float32), requires_grad=False
        )
        self.vert_rad = nn.Parameter(
            torch.rand(n_points, dtype=torch.float32), requires_grad=False
        )
        # Distorted camera start values; the optimization recovers the
        # original parameters from here.
        self.cam_pos = nn.Parameter(
            torch.tensor([0.1, 0.1, 0.0], dtype=torch.float32), requires_grad=True
        )
        # We're using the 6D rotation representation for better gradients.
        self.cam_rot = nn.Parameter(
            torch.tensor(
                [0.9995, 0.0300445, -0.0098482, -0.0299445, 0.9995, 0.0101482],
                dtype=torch.float32,
            ),
            requires_grad=True,
        )
        self.cam_sensor = nn.Parameter(
            torch.tensor([4.8, 1.8], dtype=torch.float32), requires_grad=True
        )
        self.renderer = Renderer(width, height, n_points)

    def forward(self):
        """Render the scene with the current camera parameters."""
        camera = torch.cat([self.cam_pos, self.cam_rot, self.cam_sensor])
        return self.renderer.forward(
            self.vert_pos,
            self.vert_col,
            self.vert_rad,
            camera,
            self.gamma,
            45.0,
        )
# Load the pre-rendered reference image and normalize it to [0, 1].
ref = (
    torch.from_numpy(
        imageio.imread(
            "../../tests/pulsar/reference/examples_TestRenderer_test_cam.png"
        )
    ).to(torch.float32)
    / 255.0
).to(device)
# Set up model.
model = SceneModel().to(device)
# Plain SGD with per-parameter-group learning rates; only the camera
# parameters are registered for optimization.
optimizer = optim.SGD(
    [
        {"params": [model.cam_pos], "lr": 1e-4},
        {"params": [model.cam_rot], "lr": 5e-6},
        {"params": [model.cam_sensor], "lr": 1e-4},
    ]
)
print("Writing video to `%s`." % (path.abspath("cam.gif")))
writer = imageio.get_writer("cam.gif", format="gif", fps=25)
# Optimization loop: render, visualize, step towards the reference.
for i in range(300):
    optimizer.zero_grad()
    result = model()
    # Visualize the current rendering (OpenCV expects BGR, hence ::-1).
    result_im = (result.cpu().detach().numpy() * 255).astype(np.uint8)
    cv2.imshow("opt", result_im[:, :, ::-1])
    writer.append_data(result_im)
    # 50/50 blend of rendering and reference with the step number overlaid.
    overlay_img = np.ascontiguousarray(
        ((result * 0.5 + ref * 0.5).cpu().detach().numpy() * 255).astype(np.uint8)[
            :, :, ::-1
        ]
    )
    overlay_img = cv2.putText(
        overlay_img,
        "Step %d" % (i),
        (10, 40),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 0, 0),
        2,
        cv2.LINE_AA,
        False,
    )
    cv2.imshow("overlay", overlay_img)
    cv2.waitKey(1)
    # Summed squared error against the reference image.
    loss = ((result - ref) ** 2).sum()
    print("loss {}: {}".format(i, loss.item()))
    loss.backward()
    optimizer.step()
writer.close()
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
"""
This example demonstrates multiview 3D reconstruction using the plain
pulsar interface. For this, reference images have been pre-generated
(you can find them at `../../tests/pulsar/reference/examples_TestRenderer_test_multiview_%d.png`).
The camera parameters are assumed given. The scene is initialized with
random spheres. Gradient-based optimization is used to optimize sphere
parameters and prune spheres to converge to a 3D representation.
"""
from os import path
import cv2
import imageio
import numpy as np
import torch
from pytorch3d.renderer.points.pulsar import Renderer
from torch import nn, optim
n_points = 400_000  # Number of candidate spheres for the reconstruction.
width = 1_000  # Rendered image width in pixels.
height = 1_000  # Rendered image height in pixels.
visualize_ids = [0, 1]  # NOTE(review): appears unused in this script — confirm.
device = torch.device("cuda")  # Model and tensors are placed on the GPU.
class SceneModel(nn.Module):
    """
    A simple scene model to demonstrate use of pulsar in PyTorch modules.
    The scene model is parameterized with sphere locations (vert_pos),
    channel content (vert_col), radiuses (vert_rad), camera position (cam_pos),
    camera rotation (cam_rot) and sensor focal length and width (cam_sensor).
    The forward method of the model renders this scene description. Any
    of these parameters could instead be passed as inputs to the forward
    method and come from a different model. Optionally, camera parameters can
    be provided to the forward method in which case the scene is rendered
    using those parameters.
    """

    def __init__(self):
        super(SceneModel, self).__init__()
        # Blending parameter for the soft sphere rendering.
        self.gamma = 1.0
        # Points: reproducible random initialization with a leading batch
        # dimension of 1 (expanded to n_views in forward).
        torch.manual_seed(1)
        vert_pos = torch.rand((1, n_points, 3), dtype=torch.float32) * 10.0
        vert_pos[:, :, 2] += 25.0
        vert_pos[:, :, :2] -= 5.0
        self.register_parameter("vert_pos", nn.Parameter(vert_pos, requires_grad=True))
        # All spheres start as mid-gray.
        self.register_parameter(
            "vert_col",
            nn.Parameter(
                torch.ones(1, n_points, 3, dtype=torch.float32) * 0.5,
                requires_grad=True,
            ),
        )
        self.register_parameter(
            "vert_rad",
            nn.Parameter(
                torch.ones(1, n_points, dtype=torch.float32) * 0.05, requires_grad=True
            ),
        )
        # NOTE(review): vert_opy (opacity) is registered but never used in
        # forward — presumably reserved for later use; confirm.
        self.register_parameter(
            "vert_opy",
            nn.Parameter(
                torch.ones(1, n_points, dtype=torch.float32), requires_grad=True
            ),
        )
        # Eight fixed cameras on an arc around the scene. Each row is
        # (pos_x, pos_y, pos_z, rot_x, rot_y, rot_z, focal_length,
        # sensor_size) — same layout as the other pulsar examples.
        self.register_buffer(
            "cam_params",
            torch.tensor(
                [
                    [
                        np.sin(angle) * 35.0,
                        0.0,
                        30.0 - np.cos(angle) * 35.0,
                        0.0,
                        -angle,
                        0.0,
                        5.0,
                        2.0,
                    ]
                    for angle in [-1.5, -0.8, -0.4, -0.1, 0.1, 0.4, 0.8, 1.5]
                ],
                dtype=torch.float32,
            ),
        )
        self.renderer = Renderer(width, height, n_points)

    def forward(self, cam=None):
        """Render all 8 stored views, or a single view if `cam` is given."""
        if cam is None:
            cam = self.cam_params
            n_views = 8
        else:
            n_views = 1
        return self.renderer.forward(
            self.vert_pos.expand(n_views, -1, -1),
            self.vert_col.expand(n_views, -1, -1),
            self.vert_rad.expand(n_views, -1),
            cam,
            self.gamma,
            45.0,
        )
# Load the 8 pre-rendered reference views and normalize them to [0, 1].
ref = torch.stack(
    [
        torch.from_numpy(
            imageio.imread(
                "../../tests/pulsar/reference/examples_TestRenderer_test_multiview_%d.png"
                % idx
            )
        ).to(torch.float32)
        / 255.0
        for idx in range(8)
    ]
).to(device)
# Set up model.
model = SceneModel().to(device)
# Optimize sphere colors, radii and positions with per-group learning rates.
optimizer = optim.SGD(
    [
        {"params": [model.vert_col], "lr": 1e-1},
        {"params": [model.vert_rad], "lr": 1e-3},
        {"params": [model.vert_pos], "lr": 1e-3},
    ]
)
# Angle of the rotating visualization camera.
angle = 0.0
# Fixed: the message used to announce `multiview.avi` while the writer
# actually produces `multiview.gif`.
print("Writing video to `%s`." % (path.abspath("multiview.gif")))
writer = imageio.get_writer("multiview.gif", format="gif", fps=25)
# Optimization loop.
for i in range(300):
    optimizer.zero_grad()
    result = model()
    # Show the first view (OpenCV expects BGR, hence ::-1).
    result_im = (result.cpu().detach().numpy() * 255).astype(np.uint8)
    cv2.imshow("opt", result_im[0, :, :, ::-1])
    # 50/50 blend with the reference, annotated with the step number.
    overlay_img = np.ascontiguousarray(
        ((result * 0.5 + ref * 0.5).cpu().detach().numpy() * 255).astype(np.uint8)[
            0, :, :, ::-1
        ]
    )
    overlay_img = cv2.putText(
        overlay_img,
        "Step %d" % (i),
        (10, 40),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 0, 0),
        2,
        cv2.LINE_AA,
        False,
    )
    cv2.imshow("overlay", overlay_img)
    cv2.waitKey(1)
    # Summed squared error over all views.
    loss = ((result - ref) ** 2).sum()
    print("loss {}: {}".format(i, loss.item()))
    loss.backward()
    optimizer.step()
    # Cleanup/pruning outside of autograd.
    with torch.no_grad():
        model.vert_col.data = torch.clamp(model.vert_col.data, 0.0, 1.0)
        # Move vanishing spheres far out of view and reset their radius.
        model.vert_pos.data[model.vert_rad < 0.001, :] = -1000.0
        model.vert_rad.data[model.vert_rad < 0.001] = 0.0001
        # Remove near-white spheres (distance to white <= 0.2).
        vd = (
            (model.vert_col - torch.ones(1, 1, 3, dtype=torch.float32).to(device))
            .abs()
            .sum(dim=2)
        )
        model.vert_pos.data[vd <= 0.2] = -1000.0
    # Render one frame from the rotating visualization camera.
    cam_control = torch.tensor(
        [
            [
                np.sin(angle) * 35.0,
                0.0,
                30.0 - np.cos(angle) * 35.0,
                0.0,
                -angle,
                0.0,
                5.0,
                2.0,
            ]
        ],
        dtype=torch.float32,
    ).to(device)
    with torch.no_grad():
        result = model.forward(cam=cam_control)[0]
        result_im = (result.cpu().detach().numpy() * 255).astype(np.uint8)
        cv2.imshow("vis", result_im[:, :, ::-1])
        writer.append_data(result_im)
    angle += 0.05
writer.close()
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
"""
This example demonstrates scene optimization with the plain
pulsar interface. For this, a reference image has been pre-generated
(you can find it at `../../tests/pulsar/reference/examples_TestRenderer_test_smallopt.png`).
The scene is initialized with random spheres. Gradient-based
optimization is used to converge towards a faithful
scene representation.
"""
import cv2
import imageio
import numpy as np
import torch
from pytorch3d.renderer.points.pulsar import Renderer
from torch import nn, optim
n_points = 10_000  # Number of spheres used to represent the scene.
width = 1_000  # Rendered image width in pixels.
height = 1_000  # Rendered image height in pixels.
device = torch.device("cuda")  # Model and tensors are placed on the GPU.
class SceneModel(nn.Module):
    """
    Scene model for single-view scene optimization with pulsar.

    Sphere positions (vert_pos), colors (vert_col) and radii (vert_rad)
    are trainable parameters; the camera is stored as a fixed buffer.
    `forward` renders the scene and additionally returns the forward
    info structure.
    """

    def __init__(self):
        super().__init__()
        self.gamma = 1.0
        # Reproducible random initialization of the sphere cloud.
        torch.manual_seed(1)
        positions = torch.rand(n_points, 3, dtype=torch.float32) * 10.0
        positions[:, 2] += 25.0
        positions[:, :2] -= 5.0
        self.vert_pos = nn.Parameter(positions, requires_grad=True)
        # Start from mid-gray spheres of radius 0.3.
        self.vert_col = nn.Parameter(
            torch.ones(n_points, 3, dtype=torch.float32) * 0.5, requires_grad=True
        )
        self.vert_rad = nn.Parameter(
            torch.ones(n_points, dtype=torch.float32) * 0.3, requires_grad=True
        )
        # Static camera: position, axis-angle rotation, focal length and
        # sensor size.
        self.register_buffer(
            "cam_params",
            torch.tensor(
                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 2.0], dtype=torch.float32
            ),
        )
        # The volumetric optimization works better with a higher number of
        # tracked intersections per ray.
        self.renderer = Renderer(width, height, n_points, n_track=32)

    def forward(self):
        """Render the scene; returns the image and the forward info."""
        return self.renderer.forward(
            self.vert_pos,
            self.vert_col,
            self.vert_rad,
            self.cam_params,
            self.gamma,
            45.0,
            return_forward_info=True,
        )
# Load the pre-rendered reference image and normalize it to [0, 1].
ref = (
    torch.from_numpy(
        imageio.imread(
            "../../tests/pulsar/reference/examples_TestRenderer_test_smallopt.png"
        )
    ).to(torch.float32)
    / 255.0
).to(device)
# Set up model.
model = SceneModel().to(device)
# Per-parameter-group learning rates for colors, radii and positions.
optimizer = optim.SGD(
    [
        {"params": [model.vert_col], "lr": 1e0},
        {"params": [model.vert_rad], "lr": 5e-3},
        {"params": [model.vert_pos], "lr": 1e-2},
    ]
)
# Optimization loop.
for i in range(500):
    optimizer.zero_grad()
    # The model returns (image, forward_info); result_info is unused here.
    result, result_info = model()
    # Visualize the current rendering (OpenCV expects BGR, hence ::-1).
    result_im = (result.cpu().detach().numpy() * 255).astype(np.uint8)
    cv2.imshow("opt", result_im[:, :, ::-1])
    # 50/50 blend with the reference, annotated with the step number.
    overlay_img = np.ascontiguousarray(
        ((result * 0.5 + ref * 0.5).cpu().detach().numpy() * 255).astype(np.uint8)[
            :, :, ::-1
        ]
    )
    overlay_img = cv2.putText(
        overlay_img,
        "Step %d" % (i),
        (10, 40),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 0, 0),
        2,
        cv2.LINE_AA,
        False,
    )
    cv2.imshow("overlay", overlay_img)
    cv2.waitKey(1)
    # Summed squared error against the reference image.
    loss = ((result - ref) ** 2).sum()
    print("loss {}: {}".format(i, loss.item()))
    loss.backward()
    optimizer.step()
    # Cleanup/pruning outside of autograd.
    with torch.no_grad():
        model.vert_col.data = torch.clamp(model.vert_col.data, 0.0, 1.0)
        # Move vanishing spheres far out of view and reset their radius.
        model.vert_pos.data[model.vert_rad < 0.001, :] = -1000.0
        model.vert_rad.data[model.vert_rad < 0.001] = 0.0001
        # Remove near-white spheres (distance to white <= 0.2).
        vd = (
            (model.vert_col - torch.ones(3, dtype=torch.float32).to(device))
            .abs()
            .sum(dim=1)
        )
        model.vert_pos.data[vd <= 0.2] = -1000.0
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
// clang-format off
#include "./pulsar/global.h" // Include before <torch/extension.h>.
#include <torch/extension.h>
// clang-format on
#include "./pulsar/pytorch/renderer.h"
#include "./pulsar/pytorch/tensor_util.h"
#include "blending/sigmoid_alpha_blend.h"
#include "compositing/alpha_composite.h"
#include "compositing/norm_weighted_sum.h"
......@@ -65,4 +70,90 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("face_point_dist_backward", &FacePointDistanceBackward);
m.def("point_face_array_dist_forward", &PointFaceArrayDistanceForward);
m.def("point_face_array_dist_backward", &PointFaceArrayDistanceBackward);
  // Pulsar.
#ifdef PULSAR_LOGGING_ENABLED
  // Surface log INFO output when pulsar logging was compiled in.
  c10::ShowLogInfoToStderr();
#endif
  // Bind the pulsar renderer; held by shared_ptr so Python and C++ can
  // share ownership of the object.
  py::class_<
      pulsar::pytorch::Renderer,
      std::shared_ptr<pulsar::pytorch::Renderer>>(m, "PulsarRenderer")
      .def(py::init<
           const uint&,
           const uint&,
           const uint&,
           const bool&,
           const bool&,
           const float&,
           const uint&,
           const uint&>())
      .def(
          "__eq__",
          [](const pulsar::pytorch::Renderer& a,
             const pulsar::pytorch::Renderer& b) { return a == b; },
          py::is_operator())
      .def(
          "__ne__",
          [](const pulsar::pytorch::Renderer& a,
             const pulsar::pytorch::Renderer& b) { return !(a == b); },
          py::is_operator())
      .def(
          "__repr__",
          [](const pulsar::pytorch::Renderer& self) {
            std::stringstream ss;
            ss << self;
            return ss.str();
          })
      // Rendering entry point; the trailing arguments carry defaults on
      // the C++ side where noted.
      .def(
          "forward",
          &pulsar::pytorch::Renderer::forward,
          py::arg("vert_pos"),
          py::arg("vert_col"),
          py::arg("vert_radii"),
          py::arg("cam_pos"),
          py::arg("pixel_0_0_center"),
          py::arg("pixel_vec_x"),
          py::arg("pixel_vec_y"),
          py::arg("focal_length"),
          py::arg("principal_point_offsets"),
          py::arg("gamma"),
          py::arg("max_depth"),
          py::arg("min_depth") /* = 0.f*/,
          py::arg(
              "bg_col") /* = at::nullopt not exposed properly in pytorch 1.1. */
          ,
          py::arg("opacity") /* = at::nullopt ... */,
          py::arg("percent_allowed_difference") = 0.01f,
          py::arg("max_n_hits") = MAX_UINT,
          py::arg("mode") = 0)
      .def("backward", &pulsar::pytorch::Renderer::backward)
      // device_tracker is exposed read/write so the Python side can
      // track/update the device placement of the renderer.
      .def_property(
          "device_tracker",
          [](const pulsar::pytorch::Renderer& self) {
            return self.device_tracker;
          },
          [](pulsar::pytorch::Renderer& self, const torch::Tensor& val) {
            self.device_tracker = val;
          })
      // Read-only configuration introspection.
      .def_property_readonly("width", &pulsar::pytorch::Renderer::width)
      .def_property_readonly("height", &pulsar::pytorch::Renderer::height)
      .def_property_readonly(
          "max_num_balls", &pulsar::pytorch::Renderer::max_num_balls)
      .def_property_readonly(
          "orthogonal", &pulsar::pytorch::Renderer::orthogonal)
      .def_property_readonly(
          "right_handed", &pulsar::pytorch::Renderer::right_handed)
      .def_property_readonly("n_track", &pulsar::pytorch::Renderer::n_track);
  m.def(
      "pulsar_sphere_ids_from_result_info_nograd",
      &pulsar::pytorch::sphere_ids_from_result_info_nograd);
  // Constants.
  m.attr("EPS") = py::float_(EPS);
  m.attr("MAX_FLOAT") = py::float_(MAX_FLOAT);
  m.attr("MAX_INT") = py::int_(MAX_INT);
  m.attr("MAX_UINT") = py::int_(MAX_UINT);
  m.attr("MAX_USHORT") = py::int_(MAX_USHORT);
  m.attr("PULSAR_MAX_GRAD_SPHERES") = py::int_(MAX_GRAD_SPHERES);
}
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#ifndef PULSAR_NATIVE_CONSTANTS_H_
#define PULSAR_NATIVE_CONSTANTS_H_
// Numeric epsilon and limit constants shared across the pulsar sources.
#define EPS 1E-6 // Generic (double-typed) epsilon.
#define FEPS 1E-6f // Single-precision epsilon.
#define MAX_FLOAT 3.4E38f // Close to FLT_MAX.
#define MAX_INT 2147483647 // 2^31 - 1.
#define MAX_UINT 4294967295u // 2^32 - 1.
#define MAX_USHORT 65535u // 2^16 - 1.
#endif
# CUDA device compilation units
This folder contains `.cu` files to create compilation units
for device-specific functions. See `../include/README.md` for
more information.
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#ifndef PULSAR_NATIVE_CUDA_COMMANDS_H_
#define PULSAR_NATIVE_CUDA_COMMANDS_H_
// Definitions for GPU commands.
#include <cooperative_groups.h>
#include <cub/cub.cuh>
namespace cg = cooperative_groups;
// DEVICE_RESET resets the device only when the CUDA driver types are
// available; otherwise it expands to nothing.
#ifdef __DRIVER_TYPES_H__
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif
#else
#ifndef DEVICE_RESET
#define DEVICE_RESET
#endif
#endif
// Currently a pass-through wrapper; the commented alternative routes
// every call through handleCudaError for checking.
#define HANDLECUDA(CMD) CMD
// handleCudaError((CMD), __FILE__, __LINE__)
// Report a CUDA error with file/line context and terminate the process.
// NOTE(review): the reporting body is compiled out under __NVCC__, so
// errors are silently ignored in device compilation units — confirm
// this is intended.
inline void
handleCudaError(const cudaError_t err, const char* file, const int line) {
  if (err != cudaSuccess) {
#ifndef __NVCC__
    fprintf(
        stderr,
        "%s(%i) : getLastCudaError() CUDA error :"
        " (%d) %s.\n",
        file,
        line,
        static_cast<int>(err),
        cudaGetErrorString(err));
    DEVICE_RESET
    exit(1);
#endif
  }
}
// Check the sticky CUDA error state; if an error is pending, print the
// caller-supplied tag and delegate to handleCudaError.
inline void
getLastCudaError(const char* errorMessage, const char* file, const int line) {
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "Error: %s.", errorMessage);
    handleCudaError(err, file, line);
  }
}
// Alignment and warp-primitive abstractions (CUDA backend spellings).
#define ALIGN(VAL) __align__(VAL)
// Fixed: previously expanded to `HANDLECUDE(...)` — a typo that would
// fail to compile at any use site of SYNC().
#define SYNC() HANDLECUDA(cudaDeviceSynchronize())
#define THREADFENCE_B() __threadfence_block()
#define SHFL_SYNC(a, b, c) __shfl_sync((a), (b), (c))
#define SHARED __shared__
#define ACTIVEMASK() __activemask()
#define BALLOT(mask, val) __ballot_sync((mask), val)
/**
 * Find the cumulative sum within a warp up to the current
 * thread lane, with each mask thread contributing base.
 */
template <typename T>
DEVICE T
WARP_CUMSUM(const cg::coalesced_group& group, const uint& mask, const T& base) {
  T ret = base;
  T shfl_val;
  // Log-step accumulation: each step pulls the partial sum from the lane
  // `delta` positions higher; the rank guard zeroes contributions that
  // would be shuffled in from out-of-range lanes.
  shfl_val = __shfl_down_sync(mask, ret, 1u); // Deactivate the rightmost lane.
  ret += (group.thread_rank() < 31) * shfl_val;
  shfl_val = __shfl_down_sync(mask, ret, 2u);
  ret += (group.thread_rank() < 30) * shfl_val;
  shfl_val = __shfl_down_sync(mask, ret, 4u); // ...4
  ret += (group.thread_rank() < 28) * shfl_val;
  shfl_val = __shfl_down_sync(mask, ret, 8u); // ...8
  ret += (group.thread_rank() < 24) * shfl_val;
  shfl_val = __shfl_down_sync(mask, ret, 16u); // ...16
  ret += (group.thread_rank() < 16) * shfl_val;
  return ret;
}
// Warp-level max reduction over the lanes selected by `mask` via a
// shfl_down butterfly; the complete result ends up in the lowest lane.
// NOTE(review): `group` is accepted but unused here — confirm.
template <typename T>
DEVICE T
WARP_MAX(const cg::coalesced_group& group, const uint& mask, const T& base) {
  T ret = base;
  ret = max(ret, __shfl_down_sync(mask, ret, 16u));
  ret = max(ret, __shfl_down_sync(mask, ret, 8u));
  ret = max(ret, __shfl_down_sync(mask, ret, 4u));
  ret = max(ret, __shfl_down_sync(mask, ret, 2u));
  ret = max(ret, __shfl_down_sync(mask, ret, 1u));
  return ret;
}
// Warp-level sum reduction over the lanes selected by `mask` via a
// shfl_down butterfly; the complete result ends up in the lowest lane.
// NOTE(review): `group` is accepted but unused here — confirm.
template <typename T>
DEVICE T
WARP_SUM(const cg::coalesced_group& group, const uint& mask, const T& base) {
  T ret = base;
  ret = ret + __shfl_down_sync(mask, ret, 16u);
  ret = ret + __shfl_down_sync(mask, ret, 8u);
  ret = ret + __shfl_down_sync(mask, ret, 4u);
  ret = ret + __shfl_down_sync(mask, ret, 2u);
  ret = ret + __shfl_down_sync(mask, ret, 1u);
  return ret;
}
// Component-wise warp sum for float3, built on WARP_SUM.
INLINE DEVICE float3 WARP_SUM_FLOAT3(
    const cg::coalesced_group& group,
    const uint& mask,
    const float3& base) {
  float3 ret = base;
  ret.x = WARP_SUM(group, mask, base.x);
  ret.y = WARP_SUM(group, mask, base.y);
  ret.z = WARP_SUM(group, mask, base.z);
  return ret;
}
// Floating point.
// Single-precision math wrappers. The commented-out lines show the
// exact-rounding intrinsic alternatives; FEXP/FLN map to the fast
// approximations fasterexp/fasterlog and FPOW to __powf.
// #define FMUL(a, b) __fmul_rn((a), (b))
#define FMUL(a, b) ((a) * (b))
#define FDIV(a, b) __fdiv_rn((a), (b))
// #define FSUB(a, b) __fsub_rn((a), (b))
#define FSUB(a, b) ((a) - (b))
#define FADD(a, b) __fadd_rn((a), (b))
#define FSQRT(a) __fsqrt_rn(a)
#define FEXP(a) fasterexp(a)
#define FLN(a) fasterlog(a)
#define FPOW(a, b) __powf((a), (b))
#define FMAX(a, b) fmax((a), (b))
#define FMIN(a, b) fmin((a), (b))
#define FCEIL(a) ceilf(a)
#define FFLOOR(a) floorf(a)
#define FROUND(x) nearbyintf(x)
#define FSATURATE(x) __saturatef(x)
#define FABS(a) abs(a)
// Bit-level reinterpretation between int and float.
#define IASF(a, loc) (loc) = __int_as_float(a)
#define FASI(a, loc) (loc) = __float_as_int(a)
// True iff the distance between a and b is within c.
// NOTE(review): one branch compares with `<=`, the other with `<` —
// confirm the asymmetry is intended.
#define FABSLEQAS(a, b, c) \
  ((a) <= (b) ? FSUB((b), (a)) <= (c) : FSUB((a), (b)) < (c))
/** Calculates x*y+z. */
#define FMA(x, y, z) __fmaf_rn((x), (y), (z))
#define I2F(a) __int2float_rn(a)
#define FRCP(x) __frcp_rn(x)
// Atomic max for floats, emulated with an atomicCAS loop on the int bit
// pattern. Returns the previous value stored at `address`.
__device__ static float atomicMax(float* address, float val) {
  int* address_as_i = (int*)address;
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    // Retry until no other thread modified the slot between read and CAS.
    old = ::atomicCAS(
        address_as_i,
        assumed,
        __float_as_int(::fmaxf(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
// Atomic min for floats, emulated with an atomicCAS loop on the int bit
// pattern. Returns the previous value stored at `address`.
__device__ static float atomicMin(float* address, float val) {
  int* address_as_i = (int*)address;
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    // Retry until no other thread modified the slot between read and CAS.
    old = ::atomicCAS(
        address_as_i,
        assumed,
        __float_as_int(::fminf(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
// Double-precision wrappers (mapped onto the float variants here).
#define DMAX(a, b) FMAX(a, b)
#define DMIN(a, b) FMIN(a, b)
#define DSQRT(a) sqrt(a)
#define DSATURATE(a) DMIN(1., DMAX(0., (a)))
// half
#define HADD(a, b) __hadd((a), (b))
#define HSUB2(a, b) __hsub2((a), (b))
#define HMUL2(a, b) __hmul2((a), (b))
#define HSQRT(a) hsqrt(a)
// uint.
#define CLZ(VAL) __clz(VAL)
#define POPC(a) __popc(a)
//
//
//
//
//
//
//
//
//
// Atomic adds; the _F3 variant adds a float3 component-wise.
#define ATOMICADD(PTR, VAL) atomicAdd((PTR), (VAL))
#define ATOMICADD_F3(PTR, VAL) \
  ATOMICADD(&((PTR)->x), VAL.x); \
  ATOMICADD(&((PTR)->y), VAL.y); \
  ATOMICADD(&((PTR)->z), VAL.z);
// Block-scoped atomicAdd is only available from CUDA 10 on; fall back to
// the device-wide version on older toolkits.
#if (CUDART_VERSION >= 10000)
#define ATOMICADD_B(PTR, VAL) atomicAdd_block((PTR), (VAL))
#else
#define ATOMICADD_B(PTR, VAL) ATOMICADD(PTR, VAL)
#endif
//
//
//
//
// int.
#define IMIN(a, b) min((a), (b))
#define IMAX(a, b) max((a), (b))
#define IABS(a) abs(a)
// Checks.
// NOTE(review): THCudaCheck/THArgCheck come from the legacy THC API.
#define CHECKOK THCudaCheck
#define ARGCHECK THArgCheck
// Math.
#define NORM3DF(x, y, z) norm3df(x, y, z)
#define RNORM3DF(x, y, z) rnorm3df(x, y, z)
// High level.
// Issue an L1 prefetch for the given global-memory address.
INLINE DEVICE void prefetch_l1(unsigned long addr) {
  asm(" prefetch.global.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
}
#define PREFETCH(PTR) prefetch_l1((unsigned long)(PTR))
// cub workspace-size queries: calling the cub routine with a NULL
// workspace pointer only writes the required temp-storage byte count to
// *(RES_PTR) without performing the operation.
#define GET_SORT_WS_SIZE(RES_PTR, KEY_TYPE, VAL_TYPE, NUM_OBJECTS) \
  cub::DeviceRadixSort::SortPairsDescending( \
      (void*)NULL, \
      *(RES_PTR), \
      reinterpret_cast<KEY_TYPE*>(NULL), \
      reinterpret_cast<KEY_TYPE*>(NULL), \
      reinterpret_cast<VAL_TYPE*>(NULL), \
      reinterpret_cast<VAL_TYPE*>(NULL), \
      (NUM_OBJECTS));
#define GET_REDUCE_WS_SIZE(RES_PTR, TYPE, REDUCE_OP, NUM_OBJECTS) \
  { \
    TYPE init = TYPE(); \
    cub::DeviceReduce::Reduce( \
        (void*)NULL, \
        *(RES_PTR), \
        (TYPE*)NULL, \
        (TYPE*)NULL, \
        (NUM_OBJECTS), \
        (REDUCE_OP), \
        init); \
  }
#define GET_SELECT_WS_SIZE( \
    RES_PTR, TYPE_SELECTOR, TYPE_SELECTION, NUM_OBJECTS) \
  { \
    cub::DeviceSelect::Flagged( \
        (void*)NULL, \
        *(RES_PTR), \
        (TYPE_SELECTION*)NULL, \
        (TYPE_SELECTOR*)NULL, \
        (TYPE_SELECTION*)NULL, \
        (int*)NULL, \
        (NUM_OBJECTS)); \
  }
#define GET_SUM_WS_SIZE(RES_PTR, TYPE_SUM, NUM_OBJECTS) \
  { \
    cub::DeviceReduce::Sum( \
        (void*)NULL, \
        *(RES_PTR), \
        (TYPE_SUM*)NULL, \
        (TYPE_SUM*)NULL, \
        NUM_OBJECTS); \
  }
// NOTE(review): `init` is declared but unused in this macro — confirm.
#define GET_MM_WS_SIZE(RES_PTR, TYPE, NUM_OBJECTS) \
  { \
    TYPE init = TYPE(); \
    cub::DeviceReduce::Max( \
        (void*)NULL, *(RES_PTR), (TYPE*)NULL, (TYPE*)NULL, (NUM_OBJECTS)); \
  }
// Self-contained descending sort-pairs: queries the workspace size,
// allocates, sorts, and frees. TMPN1 names the temporary workspace
// variable declared by the macro at the expansion site.
#define SORT_DESCENDING( \
    TMPN1, SORT_PTR, SORTED_PTR, VAL_PTR, VAL_SORTED_PTR, NUM_OBJECTS) \
  void* TMPN1 = NULL; \
  size_t TMPN1##_bytes = 0; \
  cub::DeviceRadixSort::SortPairsDescending( \
      TMPN1, \
      TMPN1##_bytes, \
      (SORT_PTR), \
      (SORTED_PTR), \
      (VAL_PTR), \
      (VAL_SORTED_PTR), \
      (NUM_OBJECTS)); \
  HANDLECUDA(cudaMalloc(&TMPN1, TMPN1##_bytes)); \
  cub::DeviceRadixSort::SortPairsDescending( \
      TMPN1, \
      TMPN1##_bytes, \
      (SORT_PTR), \
      (SORTED_PTR), \
      (VAL_PTR), \
      (VAL_SORTED_PTR), \
      (NUM_OBJECTS)); \
  HANDLECUDA(cudaFree(TMPN1));
// Descending sort-pairs into a caller-provided workspace.
#define SORT_DESCENDING_WS( \
    TMPN1, \
    SORT_PTR, \
    SORTED_PTR, \
    VAL_PTR, \
    VAL_SORTED_PTR, \
    NUM_OBJECTS, \
    WORKSPACE_PTR, \
    WORKSPACE_BYTES) \
  cub::DeviceRadixSort::SortPairsDescending( \
      (WORKSPACE_PTR), \
      (WORKSPACE_BYTES), \
      (SORT_PTR), \
      (SORTED_PTR), \
      (VAL_PTR), \
      (VAL_SORTED_PTR), \
      (NUM_OBJECTS));
// Ascending sort-pairs into a caller-provided workspace; sorts over the
// full key width (bits 0 .. sizeof(key)*8) on the given stream.
#define SORT_ASCENDING_WS( \
    SORT_PTR, \
    SORTED_PTR, \
    VAL_PTR, \
    VAL_SORTED_PTR, \
    NUM_OBJECTS, \
    WORKSPACE_PTR, \
    WORKSPACE_BYTES, \
    STREAM) \
  cub::DeviceRadixSort::SortPairs( \
      (WORKSPACE_PTR), \
      (WORKSPACE_BYTES), \
      (SORT_PTR), \
      (SORTED_PTR), \
      (VAL_PTR), \
      (VAL_SORTED_PTR), \
      (NUM_OBJECTS), \
      0, \
      sizeof(*(SORT_PTR)) * 8, \
      (STREAM));
// Device-wide sum into a caller-provided workspace.
#define SUM_WS( \
    SUM_PTR, OUT_PTR, NUM_OBJECTS, WORKSPACE_PTR, WORKSPACE_BYTES, STREAM) \
  cub::DeviceReduce::Sum( \
      (WORKSPACE_PTR), \
      (WORKSPACE_BYTES), \
      (SUM_PTR), \
      (OUT_PTR), \
      (NUM_OBJECTS), \
      (STREAM));
// Device-wide minimum into a caller-provided workspace.
#define MIN_WS( \
    MIN_PTR, OUT_PTR, NUM_OBJECTS, WORKSPACE_PTR, WORKSPACE_BYTES, STREAM) \
  cub::DeviceReduce::Min( \
      (WORKSPACE_PTR), \
      (WORKSPACE_BYTES), \
      (MIN_PTR), \
      (OUT_PTR), \
      (NUM_OBJECTS), \
      (STREAM));
// Device-wide maximum into a caller-provided workspace.
// Fixed: this macro previously invoked cub::DeviceReduce::Min (a
// copy-paste of MIN_WS), silently computing a minimum where a maximum
// is named. GET_MM_WS_SIZE already sizes the workspace with Max.
#define MAX_WS( \
    MAX_PTR, OUT_PTR, NUM_OBJECTS, WORKSPACE_PTR, WORKSPACE_BYTES, STREAM) \
  cub::DeviceReduce::Max( \
      (WORKSPACE_PTR), \
      (WORKSPACE_BYTES), \
      (MAX_PTR), \
      (OUT_PTR), \
      (NUM_OBJECTS), \
      (STREAM));
//
//
//
// TODO: rewrite using nested contexts instead of temporary names.
// Self-contained generic reduce (size query, malloc, reduce, free).
// NOTE(review): this macro uses, mallocs and frees TMPN1 without
// declaring it — it appears to rely on a prior declaration (e.g. by
// SORT_DESCENDING) at the expansion site; confirm.
#define REDUCE(REDUCE_PTR, RESULT_PTR, NUM_ITEMS, REDUCE_OP, REDUCE_INIT) \
  cub::DeviceReduce::Reduce( \
      TMPN1, \
      TMPN1##_bytes, \
      (REDUCE_PTR), \
      (RESULT_PTR), \
      (NUM_ITEMS), \
      (REDUCE_OP), \
      (REDUCE_INIT)); \
  HANDLECUDA(cudaMalloc(&TMPN1, TMPN1##_bytes)); \
  cub::DeviceReduce::Reduce( \
      TMPN1, \
      TMPN1##_bytes, \
      (REDUCE_PTR), \
      (RESULT_PTR), \
      (NUM_ITEMS), \
      (REDUCE_OP), \
      (REDUCE_INIT)); \
  HANDLECUDA(cudaFree(TMPN1));
// Generic reduce into a caller-provided workspace.
#define REDUCE_WS( \
    REDUCE_PTR, \
    RESULT_PTR, \
    NUM_ITEMS, \
    REDUCE_OP, \
    REDUCE_INIT, \
    WORKSPACE_PTR, \
    WORSPACE_BYTES, \
    STREAM) \
  cub::DeviceReduce::Reduce( \
      (WORKSPACE_PTR), \
      (WORSPACE_BYTES), \
      (REDUCE_PTR), \
      (RESULT_PTR), \
      (NUM_ITEMS), \
      (REDUCE_OP), \
      (REDUCE_INIT), \
      (STREAM));
// Stream compaction by boolean flags into a caller-provided workspace.
// NOTE(review): the last argument expands to `stream = (STREAM)`, which
// assigns to a variable named `stream` expected at the expansion site
// rather than passing (STREAM) directly — confirm this is intended.
#define SELECT_FLAGS_WS( \
    FLAGS_PTR, \
    ITEM_PTR, \
    OUT_PTR, \
    NUM_SELECTED_PTR, \
    NUM_ITEMS, \
    WORKSPACE_PTR, \
    WORSPACE_BYTES, \
    STREAM) \
  cub::DeviceSelect::Flagged( \
      (WORKSPACE_PTR), \
      (WORSPACE_BYTES), \
      (ITEM_PTR), \
      (FLAGS_PTR), \
      (OUT_PTR), \
      (NUM_SELECTED_PTR), \
      (NUM_ITEMS), \
      stream = (STREAM));
// Synchronous copies, sized in elements of TYPE.
#define COPY_HOST_DEV(PTR_D, PTR_H, TYPE, SIZE) \
  HANDLECUDA(cudaMemcpy( \
      (PTR_D), (PTR_H), sizeof(TYPE) * (SIZE), cudaMemcpyHostToDevice))
#define COPY_DEV_HOST(PTR_H, PTR_D, TYPE, SIZE) \
  HANDLECUDA(cudaMemcpy( \
      (PTR_H), (PTR_D), sizeof(TYPE) * (SIZE), cudaMemcpyDeviceToHost))
#define COPY_DEV_DEV(PTR_T, PTR_S, TYPE, SIZE) \
  HANDLECUDA(cudaMemcpy( \
      (PTR_T), (PTR_S), sizeof(TYPE) * (SIZE), cudaMemcpyDeviceToDevice))
//
// We *must* use cudaMallocManaged for pointers on device that should
// interact with pytorch. However, this comes at a significant speed penalty.
// We're using plain CUDA pointers for the rendering operations and
// explicitly copy results to managed pointers wrapped for pytorch (see
// pytorch/util.h).
#define MALLOC(VAR, TYPE, SIZE) cudaMalloc(&(VAR), sizeof(TYPE) * (SIZE))
#define FREE(PTR) HANDLECUDA(cudaFree(PTR))
#define MEMSET(VAR, VAL, TYPE, SIZE, STREAM) \
  HANDLECUDA(cudaMemsetAsync((VAR), (VAL), sizeof(TYPE) * (SIZE), (STREAM)))
// Launch FUNC over N elements in 1D using the device's maximum block size.
#define LAUNCH_MAX_PARALLEL_1D(FUNC, N, STREAM, ...) \
  { \
    int64_t max_threads = \
        at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; \
    uint num_threads = min((N), max_threads); \
    uint num_blocks = iDivCeil((N), num_threads); \
    FUNC<<<num_blocks, num_threads, 0, (STREAM)>>>(__VA_ARGS__); \
  }
// Launch FUNC over N elements in 1D with at most TN threads per block.
#define LAUNCH_PARALLEL_1D(FUNC, N, TN, STREAM, ...) \
  { \
    uint num_threads = min(static_cast<int>(N), static_cast<int>(TN)); \
    uint num_blocks = iDivCeil((N), num_threads); \
    FUNC<<<num_blocks, num_threads, 0, (STREAM)>>>(__VA_ARGS__); \
  }
// 2D launch with a square block derived from the device thread limit.
#define LAUNCH_MAX_PARALLEL_2D(FUNC, NX, NY, STREAM, ...) \
  { \
    int64_t max_threads = \
        at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; \
    int64_t max_threads_sqrt = static_cast<int64_t>(sqrt(max_threads)); \
    dim3 num_threads, num_blocks; \
    num_threads.x = min((NX), max_threads_sqrt); \
    num_blocks.x = iDivCeil((NX), num_threads.x); \
    num_threads.y = min((NY), max_threads_sqrt); \
    num_blocks.y = iDivCeil((NY), num_threads.y); \
    num_threads.z = 1; \
    num_blocks.z = 1; \
    FUNC<<<num_blocks, num_threads, 0, (STREAM)>>>(__VA_ARGS__); \
  }
// 2D launch with explicit block dimensions TX x TY.
#define LAUNCH_PARALLEL_2D(FUNC, NX, NY, TX, TY, STREAM, ...) \
  { \
    dim3 num_threads, num_blocks; \
    num_threads.x = min((NX), (TX)); \
    num_blocks.x = iDivCeil((NX), num_threads.x); \
    num_threads.y = min((NY), (TY)); \
    num_blocks.y = iDivCeil((NY), num_threads.y); \
    num_threads.z = 1; \
    num_blocks.z = 1; \
    FUNC<<<num_blocks, num_threads, 0, (STREAM)>>>(__VA_ARGS__); \
  }
// Compute the flat 1D thread index and early-out past the range end.
#define GET_PARALLEL_IDX_1D(VARNAME, N) \
  const uint VARNAME = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; \
  if (VARNAME >= (N)) { \
    return; \
  }
// Compute 2D coordinates and early-out outside the width/height range.
#define GET_PARALLEL_IDS_2D(VAR_X, VAR_Y, WIDTH, HEIGHT) \
  const uint VAR_X = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; \
  const uint VAR_Y = __mul24(blockIdx.y, blockDim.y) + threadIdx.y; \
  if (VAR_X >= (WIDTH) || VAR_Y >= (HEIGHT)) \
    return;
// No-ops in this CUDA variant of the abstraction layer.
#define END_PARALLEL()
#define END_PARALLEL_NORET()
#define END_PARALLEL_2D_NORET()
#define END_PARALLEL_2D()
#define RETURN_PARALLEL() return
#define CHECKLAUNCH() THCudaCheck(cudaGetLastError());
#define ISONDEVICE true
#define SYNCDEVICE() HANDLECUDA(cudaDeviceSynchronize())
// CUDA-event-based timing helpers: START_TIME/STOP_TIME bracket a
// region tagged TN; GET_TIME writes the elapsed milliseconds to TOPTR.
#define START_TIME(TN) \
  cudaEvent_t __time_start_##TN, __time_stop_##TN; \
  cudaEventCreate(&__time_start_##TN); \
  cudaEventCreate(&__time_stop_##TN); \
  cudaEventRecord(__time_start_##TN);
#define STOP_TIME(TN) cudaEventRecord(__time_stop_##TN);
#define GET_TIME(TN, TOPTR) \
  cudaEventSynchronize(__time_stop_##TN); \
  cudaEventElapsedTime((TOPTR), __time_start_##TN, __time_stop_##TN);
// Fixed: START_TIME_CU previously expanded to START_TIME(CN), creating
// events named after the literal token `CN` instead of the argument TN,
// which breaks the matching STOP_TIME_CU/GET_TIME_CU pair.
#define START_TIME_CU(TN) START_TIME(TN)
#define STOP_TIME_CU(TN) STOP_TIME(TN)
#define GET_TIME_CU(TN, TOPTR) GET_TIME(TN, TOPTR)
#endif
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.backward.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.backward_dbg.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.calc_gradients.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.calc_signature.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.construct.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.create_selector.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.destruct.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.fill_bg.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.forward.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.norm_cam_gradients.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.norm_sphere_gradients.instantiate.h"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include "../include/renderer.render.instantiate.h"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment