Commit a1ec436b authored by Samuli Laine
Browse files

Add CUDA rasterizer

parent 78528e68
......@@ -6,7 +6,7 @@
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "rasterize.h"
#include "rasterize_gl.h"
#include "glutil.h"
#include <vector>
#define STRINGIFY_SHADER_SOURCE(x) #x
......@@ -210,8 +210,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 1) out vec4 out_db;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
in vec4 gl_FragCoord;
out float gl_FragDepth;
)
void main()
{
......@@ -233,8 +231,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 1) out vec4 out_db;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
in vec4 gl_FragCoord;
out float gl_FragDepth;
)
void main()
{
......@@ -280,8 +276,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 0) out vec4 out_raster;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
in vec4 gl_FragCoord;
out float gl_FragDepth;
)
void main()
{
......@@ -300,8 +294,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 0) out vec4 out_raster;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
in vec4 gl_FragCoord;
out float gl_FragDepth;
)
void main()
{
......@@ -364,9 +356,9 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer));
}
bool rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth)
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth)
{
bool changes = false;
changes = false;
// Resize vertex buffer?
if (posCount > s.posCount)
......@@ -435,8 +427,6 @@ bool rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, in
changes = true;
}
return changes;
}
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx)
......
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
// Do not try to include OpenGL stuff when compiling CUDA kernels for torch.
#if !(defined(NVDR_TORCH) && defined(__CUDACC__))
#include "framework.h"
#include "glutil.h"
//------------------------------------------------------------------------
// OpenGL-related persistent state for forward op.
// Holds all OpenGL objects and CUDA interop handles for one rasterizer
// instance. NOTE: this struct is cleared with memset() elsewhere, so every
// member must be a plain value/handle type requiring no construction.
struct RasterizeGLState // Must be initializable by memset to zero.
{
    int width;    // Allocated frame buffer width.
    int height;   // Allocated frame buffer height.
    int depth;    // Allocated frame buffer depth.
    int posCount; // Allocated position buffer in floats.
    int triCount; // Allocated triangle buffer in ints.
    GLContext glctx;                            // OpenGL context handle for this instance.
    GLuint glFBO;                               // Framebuffer object rendered into.
    GLuint glColorBuffer[2];                    // Color output render targets.
    GLuint glPrevOutBuffer;                     // Texture holding previous output (presumably consumed when depth peeling -- TODO confirm).
    GLuint glDepthStencilBuffer;                // Depth/stencil render target.
    GLuint glVAO;                               // Vertex array object.
    GLuint glTriBuffer;                         // Triangle index buffer.
    GLuint glPosBuffer;                         // Vertex position buffer.
    GLuint glProgram;                           // Main shader program.
    GLuint glProgramDP;                         // Shader program variant (presumably for depth peeling -- TODO confirm).
    GLuint glVertexShader;                      // Compiled vertex shader.
    GLuint glGeometryShader;                    // Compiled geometry shader.
    GLuint glFragmentShader;                    // Compiled fragment shader.
    GLuint glFragmentShaderDP;                  // Compiled fragment shader for the DP program variant.
    cudaGraphicsResource_t cudaColorBuffer[2];  // CUDA interop handles for glColorBuffer.
    cudaGraphicsResource_t cudaPrevOutBuffer;   // CUDA interop handle for glPrevOutBuffer.
    cudaGraphicsResource_t cudaPosBuffer;       // CUDA interop handle for glPosBuffer.
    cudaGraphicsResource_t cudaTriBuffer;       // CUDA interop handle for glTriBuffer.
    int enableDB;                               // Nonzero = also output image-space derivatives (second color attachment).
    int enableZModify; // Modify depth in shader, workaround for a rasterization issue on A100.
};
//------------------------------------------------------------------------
// Shared C++ code prototypes.

// Create the GL context for the given CUDA device and build the shader programs into s.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);

// Grow GL buffers / render targets as needed; sets changes=true iff anything was (re)allocated.
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth);

// Upload input positions/triangles and run the GL rasterization pass on the given stream.
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx);

// Copy the rasterized results from GL buffers into the given CUDA output pointers.
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);

// Release the GL buffer objects held by s.
void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s);

//------------------------------------------------------------------------
#endif // !(defined(NVDR_TORCH) && defined(__CUDACC__))
......@@ -56,13 +56,25 @@ verbose = True # Print status messages to stdout.
# Internal helper funcs.
def _find_compiler_bindir():
hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
......
......@@ -21,7 +21,7 @@
#include "../common/common.cpp"
#include "../common/rasterize.h"
#include "../common/rasterize.cpp"
#include "../common/rasterize_gl.cpp"
#include "../common/rasterize.cu"
#include "tf_rasterize.cu"
......
......@@ -74,14 +74,15 @@ struct RasterizeFwdOp : public OpKernel
setGLContext(m_glState.glctx); // (Re-)Activate GL context.
// Resize all buffers.
rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.cpp
// Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround.
if (initCtx)
bool changes = false;
rasterizeResizeBuffers(ctx, m_glState, changes, posCount, triCount, width, height, depth); // In common/rasterize_gl.cpp
if (changes)
{
// On first execution, do a bonus context swap.
#ifdef _WIN32
// Workaround for occasional blank first frame on Windows.
releaseGLContext();
setGLContext(m_glState.glctx);
#endif
}
// Copy input data to GL and render.
......
......@@ -6,5 +6,5 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
from .ops import RasterizeGLContext, get_log_level, set_log_level, rasterize, DepthPeeler, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash
__all__ = ["RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "DepthPeeler", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"]
from .ops import RasterizeCudaContext, RasterizeGLContext, get_log_level, set_log_level, rasterize, DepthPeeler, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash
__all__ = ["RasterizeCudaContext", "RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "DepthPeeler", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"]
......@@ -6,22 +6,23 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import importlib
import logging
import numpy as np
import os
import sys
import torch
import torch.utils.cpp_extension
#----------------------------------------------------------------------------
# C++/Cuda plugin compiler/loader.
_cached_plugin = None
def _get_plugin():
_cached_plugin = {}
def _get_plugin(gl=False):
assert isinstance(gl, bool)
# Return cached plugin if already loaded.
global _cached_plugin
if _cached_plugin is not None:
return _cached_plugin
if _cached_plugin.get(gl, None) is not None:
return _cached_plugin[gl]
# Make sure we can find the necessary compiler and libary binaries.
if os.name == 'nt':
......@@ -29,7 +30,9 @@ def _get_plugin():
def find_cl_path():
import glob
for edition in ['Enterprise', 'Professional', 'BuildTools', 'Community']:
paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True)
vs_relative_path = r"\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition
paths = sorted(glob.glob(r"C:\Program Files" + vs_relative_path), reverse=True)
paths += sorted(glob.glob(r"C:\Program Files (x86)" + vs_relative_path), reverse=True)
if paths:
return paths[0]
......@@ -43,35 +46,52 @@ def _get_plugin():
# Compiler options.
opts = ['-DNVDR_TORCH']
# Linker options.
if os.name == 'posix':
ldflags = ['-lGL', '-lEGL']
elif os.name == 'nt':
libs = ['gdi32', 'opengl32', 'user32', 'setgpu']
ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]
# Linker options for the GL-interfacing plugin.
ldflags = []
if gl:
if os.name == 'posix':
ldflags = ['-lGL', '-lEGL']
elif os.name == 'nt':
libs = ['gdi32', 'opengl32', 'user32', 'setgpu']
ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]
# List of source files.
source_files = [
'../common/common.cpp',
'../common/glutil.cpp',
'../common/rasterize.cu',
'../common/rasterize.cpp',
'../common/interpolate.cu',
'../common/texture.cu',
'../common/texture.cpp',
'../common/antialias.cu',
'torch_bindings.cpp',
'torch_rasterize.cpp',
'torch_interpolate.cpp',
'torch_texture.cpp',
'torch_antialias.cpp',
]
if gl:
source_files = [
'../common/common.cpp',
'../common/glutil.cpp',
'../common/rasterize_gl.cpp',
'torch_bindings_gl.cpp',
'torch_rasterize_gl.cpp',
]
else:
source_files = [
'../common/cudaraster/impl/Buffer.cpp',
'../common/cudaraster/impl/CudaRaster.cpp',
'../common/cudaraster/impl/RasterImpl.cu',
'../common/cudaraster/impl/RasterImpl.cpp',
'../common/common.cpp',
'../common/rasterize.cu',
'../common/interpolate.cu',
'../common/texture.cu',
'../common/texture.cpp',
'../common/antialias.cu',
'torch_bindings.cpp',
'torch_rasterize.cpp',
'torch_interpolate.cpp',
'torch_texture.cpp',
'torch_antialias.cpp',
]
# Some containers set this to contain old architectures that won't compile. We only need the one installed in the machine.
os.environ['TORCH_CUDA_ARCH_LIST'] = ''
# On Linux, show a warning if GLEW is being forcibly loaded when compiling the GL plugin.
if gl and (os.name == 'posix') and ('libGLEW' in os.environ.get('LD_PRELOAD', '')):
logging.getLogger('nvdiffrast').warning("Warning: libGLEW is being loaded via LD_PRELOAD, and will probably conflict with the OpenGL plugin")
# Try to detect if a stray lock file is left in cache directory and show a warning. This sometimes happens on Windows if the build is interrupted at just the right moment.
plugin_name = 'nvdiffrast_plugin'
plugin_name = 'nvdiffrast_plugin' + ('_gl' if gl else '')
try:
lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory(plugin_name, False), 'lock')
if os.path.exists(lock_fn):
......@@ -79,14 +99,27 @@ def _get_plugin():
except:
pass
# Speed up compilation on Windows.
if os.name == 'nt':
# Skip telemetry sending step in vcvarsall.bat
os.environ['VSCMD_SKIP_SENDTELEMETRY'] = '1'
# Opportunistically patch distutils to cache MSVC environments.
try:
import distutils._msvccompiler
import functools
if not hasattr(distutils._msvccompiler._get_vc_env, '__wrapped__'):
distutils._msvccompiler._get_vc_env = functools.lru_cache()(distutils._msvccompiler._get_vc_env)
except:
pass
# Compile and load.
source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files]
torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=opts, extra_cuda_cflags=opts, extra_ldflags=ldflags, with_cuda=True, verbose=False)
torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=opts, extra_cuda_cflags=opts+['-lineinfo'], extra_ldflags=ldflags, with_cuda=True, verbose=False)
# Import, cache, and return the compiled module.
import nvdiffrast_plugin
_cached_plugin = nvdiffrast_plugin
return _cached_plugin
_cached_plugin[gl] = importlib.import_module(plugin_name)
return _cached_plugin[gl]
#----------------------------------------------------------------------------
# Log level.
......@@ -118,7 +151,35 @@ def set_log_level(level):
_get_plugin().set_log_level(level)
#----------------------------------------------------------------------------
# GL State wrapper.
# CudaRaster state wrapper.
#----------------------------------------------------------------------------
class RasterizeCudaContext:
    def __init__(self, device=None):
        '''Create a new Cuda rasterizer context.

        The context is deleted and its internal storage is released when the
        object is destroyed.

        Args:
          device (Optional): Cuda device on which the context is created. Type can be
                             `torch.device`, string (e.g., `'cuda:1'`), or int. If not
                             specified, context will be created on currently active Cuda
                             device.
        Returns:
          The newly created Cuda rasterizer context.
        '''
        # Resolve the requested device to a plain Cuda device index. Temporarily
        # activating `device` lets torch translate any accepted specifier
        # (torch.device, string, int) into the corresponding index.
        if device is not None:
            with torch.cuda.device(device):
                device_idx = torch.cuda.current_device()
        else:
            device_idx = torch.cuda.current_device()
        self.cpp_wrapper = _get_plugin().RasterizeCRStateWrapper(device_idx)
        self.output_db = True            # Image-space derivative output is always enabled for Cuda contexts.
        self.active_depth_peeler = None  # For error checking only.
#----------------------------------------------------------------------------
# GL state wrapper.
#----------------------------------------------------------------------------
class RasterizeGLContext:
......@@ -157,8 +218,8 @@ class RasterizeGLContext:
else:
with torch.cuda.device(device):
cuda_device_idx = torch.cuda.current_device()
self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)
self.active_depth_peeler = None # For error checking only
self.cpp_wrapper = _get_plugin(gl=True).RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)
self.active_depth_peeler = None # For error checking only.
def set_context(self):
'''Set (activate) OpenGL context in the current CPU thread.
......@@ -180,8 +241,11 @@ class RasterizeGLContext:
class _rasterize_func(torch.autograd.Function):
@staticmethod
def forward(ctx, glctx, pos, tri, resolution, ranges, grad_db, peeling_idx):
out, out_db = _get_plugin().rasterize_fwd(glctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx)
def forward(ctx, raster_ctx, pos, tri, resolution, ranges, grad_db, peeling_idx):
if isinstance(raster_ctx, RasterizeGLContext):
out, out_db = _get_plugin(gl=True).rasterize_fwd_gl(raster_ctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx)
else:
out, out_db = _get_plugin().rasterize_fwd_cuda(raster_ctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx)
ctx.save_for_backward(pos, tri, out)
ctx.saved_grad_db = grad_db
return out, out_db
......@@ -204,7 +268,7 @@ def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
output tensors will be contiguous and reside in GPU memory.
Args:
glctx: OpenGL context of type `RasterizeGLContext`.
glctx: Rasterizer context of type `RasterizeGLContext` or `RasterizeCudaContext`.
pos: Vertex position tensor with dtype `torch.float32`. To enable range
mode, this tensor should have a 2D shape [num_vertices, 4]. To enable
instanced mode, use a 3D shape [minibatch_size, num_vertices, 4].
......@@ -214,8 +278,8 @@ def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
`torch.int32`, specifying start indices and counts into `tri`.
Ignored in instanced mode.
grad_db: Propagate gradients of image-space derivatives of barycentrics
into `pos` in backward pass. Ignored if OpenGL context was
not configured to output image-space derivatives.
into `pos` in backward pass. Ignored if using an OpenGL context that
was not configured to output image-space derivatives.
Returns:
A tuple of two tensors. The first output tensor has shape [minibatch_size,
......@@ -226,7 +290,7 @@ def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
(du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape
[minibatch_size, height, width, 0].
'''
assert isinstance(glctx, RasterizeGLContext)
assert isinstance(glctx, (RasterizeGLContext, RasterizeCudaContext))
assert grad_db is True or grad_db is False
grad_db = grad_db and glctx.output_db
......@@ -258,7 +322,7 @@ class DepthPeeler:
Returns:
The newly created depth peeler.
'''
assert isinstance(glctx, RasterizeGLContext)
assert isinstance(glctx, (RasterizeGLContext, RasterizeCudaContext))
assert grad_db is True or grad_db is False
grad_db = grad_db and glctx.output_db
......@@ -271,7 +335,7 @@ class DepthPeeler:
assert isinstance(ranges, torch.Tensor)
# Store all the parameters.
self.glctx = glctx
self.raster_ctx = glctx
self.pos = pos
self.tri = tri
self.resolution = resolution
......@@ -280,18 +344,18 @@ class DepthPeeler:
self.peeling_idx = None
def __enter__(self):
if self.glctx is None:
if self.raster_ctx is None:
raise RuntimeError("Cannot re-enter a terminated depth peeling operation")
if self.glctx.active_depth_peeler is not None:
raise RuntimeError("Cannot have multiple depth peelers active simultaneously in a RasterizeGLContext")
self.glctx.active_depth_peeler = self
if self.raster_ctx.active_depth_peeler is not None:
raise RuntimeError("Cannot have multiple depth peelers active simultaneously in a rasterization context")
self.raster_ctx.active_depth_peeler = self
self.peeling_idx = 0
return self
def __exit__(self, *args):
assert self.glctx.active_depth_peeler is self
self.glctx.active_depth_peeler = None
self.glctx = None # Remove all references to input tensor so they're not left dangling.
assert self.raster_ctx.active_depth_peeler is self
self.raster_ctx.active_depth_peeler = None
self.raster_ctx = None # Remove all references to input tensor so they're not left dangling.
self.pos = None
self.tri = None
self.resolution = None
......@@ -309,9 +373,9 @@ class DepthPeeler:
Returns:
A tuple of two tensors as in `rasterize()`.
'''
assert self.glctx.active_depth_peeler is self
assert self.raster_ctx.active_depth_peeler is self
assert self.peeling_idx >= 0
result = _rasterize_func.apply(self.glctx, self.pos, self.tri, self.resolution, self.ranges, self.grad_db, self.peeling_idx)
result = _rasterize_func.apply(self.raster_ctx, self.pos, self.tri, self.resolution, self.ranges, self.grad_db, self.peeling_idx)
self.peeling_idx += 1
return result
......@@ -604,6 +668,14 @@ def antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0)
All input tensors must be contiguous and reside in GPU memory. The output tensor
will be contiguous and reside in GPU memory.
Note that silhouette edge determination is based on vertex indices in the triangle
tensor. For it to work properly, a vertex belonging to multiple triangles must be
referred to using the same vertex index in each triangle. Otherwise, nvdiffrast will always
classify the adjacent edges as silhouette edges, which leads to bad performance and
potentially incorrect gradients. If you are unsure whether your data is good, check
which pixels are modified by the antialias operation and compare to the example in the
documentation.
Args:
color: Input image to antialias with shape [minibatch_size, height, width, num_channels].
rast: Main output tensor from `rasterize()`.
......
......@@ -20,7 +20,7 @@
#define OP_RETURN_TTV std::tuple<torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >
#define OP_RETURN_TTTTV std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >
OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int depth_idx);
OP_RETURN_TT rasterize_fwd_cuda (RasterizeCRStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx);
OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
OP_RETURN_T rasterize_grad_db (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb);
OP_RETURN_TT interpolate_fwd (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri);
......@@ -42,9 +42,7 @@ OP_RETURN_TT antialias_grad (torch::Tensor color, to
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// State classes.
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
.def("set_context", &RasterizeGLStateWrapper::setContext)
.def("release_context", &RasterizeGLStateWrapper::releaseContext);
pybind11::class_<RasterizeCRStateWrapper>(m, "RasterizeCRStateWrapper").def(pybind11::init<int>());
pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper").def(pybind11::init<>());
pybind11::class_<TopologyHashWrapper>(m, "TopologyHashWrapper");
......@@ -53,7 +51,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("set_log_level", [](int level){ FLAGS_caffe2_log_level = level; }, "set log level");
// Ops.
m.def("rasterize_fwd", &rasterize_fwd, "rasterize forward op");
m.def("rasterize_fwd_cuda", &rasterize_fwd_cuda, "rasterize forward op (cuda)");
m.def("rasterize_grad", &rasterize_grad, "rasterize gradient op ignoring db gradients");
m.def("rasterize_grad_db", &rasterize_grad_db, "rasterize gradient op with db gradients");
m.def("interpolate_fwd", &interpolate_fwd, "interpolate forward op with attribute derivatives");
......
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include <tuple>
//------------------------------------------------------------------------
// Op prototypes.
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_gl(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx);
//------------------------------------------------------------------------
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    // State classes.
    // Expose the GL state wrapper; constructor arguments are
    // (enableDB, automatic, cudaDeviceIdx) matching RasterizeGLStateWrapper's C++ constructor.
    pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
        .def("set_context", &RasterizeGLStateWrapper::setContext)          // Manually activate the GL context in the calling thread.
        .def("release_context", &RasterizeGLStateWrapper::releaseContext); // Manually deactivate the GL context.
    // Ops.
    m.def("rasterize_fwd_gl", &rasterize_fwd_gl, "rasterize forward op (opengl)");
}
//------------------------------------------------------------------------
......@@ -10,55 +10,41 @@
#include "torch_types.h"
#include "../common/common.h"
#include "../common/rasterize.h"
#include "../common/cudaraster/CudaRaster.hpp"
#include "../common/cudaraster/impl/Constants.hpp"
#include <tuple>
//------------------------------------------------------------------------
// Kernel prototypes.
void RasterizeCudaFwdShaderKernel(const RasterizeCudaFwdShaderParams p);
void RasterizeGradKernel(const RasterizeGradParams p);
void RasterizeGradKernelDb(const RasterizeGradParams p);
//------------------------------------------------------------------------
// Python GL state wrapper methods.
// Python CudaRaster state wrapper methods.
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
RasterizeCRStateWrapper::RasterizeCRStateWrapper(int cudaDeviceIdx_)
{
pState = new RasterizeGLState();
automatic = automatic_;
const at::cuda::OptionalCUDAGuard device_guard(cudaDeviceIdx_);
cudaDeviceIdx = cudaDeviceIdx_;
memset(pState, 0, sizeof(RasterizeGLState));
pState->enableDB = enableDB ? 1 : 0;
rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
releaseGLContext();
cr = new CR::CudaRaster();
}
RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void)
RasterizeCRStateWrapper::~RasterizeCRStateWrapper(void)
{
setGLContext(pState->glctx);
rasterizeReleaseBuffers(NVDR_CTX_PARAMS, *pState);
releaseGLContext();
destroyGLContext(pState->glctx);
delete pState;
}
void RasterizeGLStateWrapper::setContext(void)
{
setGLContext(pState->glctx);
}
void RasterizeGLStateWrapper::releaseContext(void)
{
releaseGLContext();
const at::cuda::OptionalCUDAGuard device_guard(cudaDeviceIdx);
delete cr;
}
//------------------------------------------------------------------------
// Forward op.
// Forward op (Cuda).
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx)
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_cuda(RasterizeCRStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGLState& s = *stateWrapper.pState;
CR::CudaRaster* cr = stateWrapper.cr;
// Check inputs.
NVDR_CHECK_DEVICE(pos, tri);
......@@ -67,11 +53,8 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
NVDR_CHECK_F32(pos);
NVDR_CHECK_I32(tri, ranges);
// Check that GL context was created for the correct GPU.
NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must must reside on the same device as input tensors");
// Determine number of outputs
int num_outputs = s.enableDB ? 2 : 1;
// Check that CudaRaster context was created for the correct GPU.
NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "CudaRaster context must must reside on the same device as input tensors");
// Determine instance mode and check input dimensions.
bool instance_mode = pos.sizes().size() > 2;
......@@ -87,49 +70,75 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
// Get output shape.
int height = std::get<0>(resolution);
int width = std::get<1>(resolution);
int depth = instance_mode ? pos.size(0) : ranges.size(0);
int depth = instance_mode ? pos.size(0) : ranges.size(0); // Depth of tensor, not related to depth buffering.
NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]");
// Get position and triangle buffer sizes in int32/float32.
int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1);
int triCount = 3 * tri.size(0);
// Check resolution compatibility with CudaRaster.
TORCH_CHECK(height <= CR_MAXVIEWPORT_SIZE && width <= CR_MAXVIEWPORT_SIZE, "resolution must be [<=", CR_MAXVIEWPORT_SIZE, ", <=", CR_MAXVIEWPORT_SIZE, "]");
TORCH_CHECK(((height | width) & (CR_TILE_SIZE - 1)) == 0, "width and height must be divisible by ", CR_TILE_SIZE);
// Set the GL context unless manual context.
if (stateWrapper.automatic)
setGLContext(s.glctx);
// Get position and triangle buffer sizes in vertices / triangles.
int posCount = instance_mode ? pos.size(1) : pos.size(0);
int triCount = tri.size(0);
// Resize all buffers.
if (rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, posCount, triCount, width, height, depth))
{
#ifdef _WIN32
// Workaround for occasional blank first frame on Windows.
releaseGLContext();
setGLContext(s.glctx);
#endif
}
// Copy input data to GL and render.
// Render.
const float* posPtr = pos.data_ptr<float>();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr<int32_t>(); // This is in CPU memory.
const int32_t* triPtr = tri.data_ptr<int32_t>();
int vtxPerInstance = instance_mode ? pos.size(1) : 0;
rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, peeling_idx);
// Set up CudaRaster.
cr->setViewportSize(width, height, depth);
cr->setVertexBuffer((void*)posPtr, posCount);
cr->setIndexBuffer((void*)triPtr, triCount);
// Enable depth peeling?
bool enablePeel = (peeling_idx > 0);
cr->setRenderModeFlags(enablePeel ? CR::CudaRaster::RenderModeFlag_EnableDepthPeeling : 0); // No backface culling.
if (enablePeel)
cr->swapDepthAndPeel(); // Use previous depth buffer as peeling depth input.
// Run CudaRaster in one large batch. In case of error, the workload could be split into smaller batches - maybe do that in the future.
cr->deferredClear(0u);
bool success = cr->drawTriangles(rangesPtr, stream);
NVDR_CHECK(success, "subtriangle count overflow");
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({depth, height, width, 4}, opts);
torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts);
float* outputPtr[2];
outputPtr[0] = out.data_ptr<float>();
outputPtr[1] = s.enableDB ? out_db.data_ptr<float>() : NULL;
torch::Tensor out_db = torch::empty({depth, height, width, 4}, opts);
// Copy rasterized results into CUDA buffers.
rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth);
// Populate pixel shader kernel parameters.
RasterizeCudaFwdShaderParams p;
p.pos = posPtr;
p.tri = triPtr;
p.in_idx = (const int*)cr->getColorBuffer();
p.out = out.data_ptr<float>();
p.out_db = out_db.data_ptr<float>();
p.numTriangles = triCount;
p.numVertices = posCount;
p.width = width;
p.height = height;
p.depth = depth;
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
p.ys = 2.f / (float)p.height;
p.yo = 1.f / (float)p.height - 1.f;
// Done. Release GL context and return.
if (stateWrapper.automatic)
releaseGLContext();
// Verify that buffers are aligned to allow float2/float4 operations.
NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.out_db & 15), "out_db output tensor not aligned to float4");
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH, RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
// Launch CUDA kernel.
void* args[] = {&p};
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)RasterizeCudaFwdShaderKernel, gridSize, blockSize, args, 0, stream));
// Return.
return std::tuple<torch::Tensor, torch::Tensor>(out, out_db);
}
......
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include "../common/common.h"
#include "../common/rasterize_gl.h"
#include <tuple>
//------------------------------------------------------------------------
// Python GL state wrapper methods.
// Construct the Python-facing GL state wrapper.
// enableDB        - whether the rasterizer also produces the secondary (DB) output.
// automatic_      - if true, the GL context is set/released automatically around each op.
// cudaDeviceIdx_  - CUDA device the GL context is created on; inputs must live there.
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
{
    // Allocate and zero the GL state struct before filling in any fields.
    pState = new RasterizeGLState();
    memset(pState, 0, sizeof(RasterizeGLState));
    pState->enableDB = enableDB ? 1 : 0;

    // Remember wrapper-level settings.
    automatic = automatic_;
    cudaDeviceIdx = cudaDeviceIdx_;

    // Create the GL context, then release it so later calls (or manual
    // setContext()) can acquire it on whichever thread needs it.
    rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
    releaseGLContext();
}
// Destructor: tears down GL-side resources. The order below is significant —
// the context must be current while buffers are released, and must be
// released before it is destroyed.
RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void)
{
    // Make this wrapper's GL context current so buffer teardown targets it.
    setGLContext(pState->glctx);
    rasterizeReleaseBuffers(NVDR_CTX_PARAMS, *pState);
    // Detach before destroying the context itself.
    releaseGLContext();
    destroyGLContext(pState->glctx);
    delete pState;
}
// Manually make this wrapper's GL context current on the calling thread.
// Used from Python when the wrapper was created with automatic=false.
void RasterizeGLStateWrapper::setContext(void)
{
    setGLContext(pState->glctx);
}
// Manually release the current GL context (counterpart of setContext()).
void RasterizeGLStateWrapper::releaseContext(void)
{
    releaseGLContext();
}
//------------------------------------------------------------------------
// Forward op (OpenGL).
// Forward rasterization op using the OpenGL backend.
//
// stateWrapper - GL state created on the same CUDA device as the inputs.
// pos          - clip-space positions; [>0, >0, 4] (instance mode) or [>0, 4] (range mode), float32, CUDA.
// tri          - triangle indices, [>0, 3], int32, CUDA.
// resolution   - (height, width) of the output.
// ranges       - [>0, 2] int32 on CPU; used only in range mode.
// peeling_idx  - depth-peeling pass index, forwarded to rasterizeRender().
//
// Returns (out, out_db): out is [depth, height, width, 4]; out_db has 4
// channels when the state was created with enableDB, otherwise 0 channels
// so callers always receive a tensor.
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_gl(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx)
{
    const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    RasterizeGLState& s = *stateWrapper.pState;

    // Check inputs.
    NVDR_CHECK_DEVICE(pos, tri);
    NVDR_CHECK_CPU(ranges);
    NVDR_CHECK_CONTIGUOUS(pos, tri, ranges);
    NVDR_CHECK_F32(pos);
    NVDR_CHECK_I32(tri, ranges);

    // Check that GL context was created for the correct GPU.
    // (Fixed duplicated word "must must" in the original error message.)
    NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must reside on the same device as input tensors");

    // Determine instance mode and check input dimensions.
    bool instance_mode = pos.sizes().size() > 2;
    if (instance_mode)
        NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "instance mode - pos must have shape [>0, >0, 4]");
    else
    {
        NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "range mode - pos must have shape [>0, 4]");
        NVDR_CHECK(ranges.sizes().size() == 2 && ranges.size(0) > 0 && ranges.size(1) == 2, "range mode - ranges must have shape [>0, 2]");
    }
    NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");

    // Get output shape. In instance mode every batch item renders the same
    // mesh; in range mode each output layer comes from one (start, count) range.
    int height = std::get<0>(resolution);
    int width  = std::get<1>(resolution);
    int depth  = instance_mode ? pos.size(0) : ranges.size(0);
    NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]");

    // Get position and triangle buffer sizes in int32/float32.
    int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1);
    int triCount = 3 * tri.size(0);

    // Set the GL context unless manual context.
    if (stateWrapper.automatic)
        setGLContext(s.glctx);

    // Resize all buffers.
    bool changes = false;
    rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, changes, posCount, triCount, width, height, depth);
    if (changes)
    {
#ifdef _WIN32
        // Workaround for occasional blank first frame on Windows.
        releaseGLContext();
        setGLContext(s.glctx);
#endif
    }

    // Copy input data to GL and render.
    const float* posPtr = pos.data_ptr<float>();
    const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr<int32_t>(); // This is in CPU memory.
    const int32_t* triPtr = tri.data_ptr<int32_t>();
    int vtxPerInstance = instance_mode ? pos.size(1) : 0;
    rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, peeling_idx);

    // Allocate output tensors. out_db gets zero channels when the secondary
    // output is disabled, so the second slot is left NULL below.
    torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({depth, height, width, 4}, opts);
    torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts);
    float* outputPtr[2];
    outputPtr[0] = out.data_ptr<float>();
    outputPtr[1] = s.enableDB ? out_db.data_ptr<float>() : NULL;

    // Copy rasterized results into CUDA buffers.
    rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth);

    // Done. Release GL context and return.
    if (stateWrapper.automatic)
        releaseGLContext();
    return std::tuple<torch::Tensor, torch::Tensor>(out, out_db);
}
//------------------------------------------------------------------------
......@@ -26,6 +26,20 @@ public:
int cudaDeviceIdx;
};
//------------------------------------------------------------------------
// Python CudaRaster state wrapper.
// Forward declaration so this header does not need the CudaRaster headers.
namespace CR { class CudaRaster; }

// Python-facing wrapper around a CR::CudaRaster instance.
// NOTE(review): constructor/destructor bodies are defined elsewhere —
// presumably they create/destroy the CudaRaster object on the given device;
// confirm against the implementation file.
class RasterizeCRStateWrapper
{
public:
    RasterizeCRStateWrapper (int cudaDeviceIdx);
    ~RasterizeCRStateWrapper (void);
    CR::CudaRaster* cr;     // opaque rasterizer state
    int cudaDeviceIdx;      // CUDA device index this state is associated with
};
//------------------------------------------------------------------------
// Mipmap wrapper to prevent intrusion from Python side.
......
......@@ -48,7 +48,8 @@ def fit_cube(max_iter = 5000,
out_dir = None,
log_fn = None,
mp4save_interval = None,
mp4save_fn = None):
mp4save_fn = None,
use_opengl = False):
log_file = None
writer = None
......@@ -73,7 +74,8 @@ def fit_cube(max_iter = 5000,
vtx_pos = torch.from_numpy(vtxp.astype(np.float32)).cuda()
vtx_col = torch.from_numpy(vtxc.astype(np.float32)).cuda()
glctx = dr.RasterizeGLContext()
# Rasterizer context
glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()
# Repeats.
for rep in range(repeats):
......@@ -161,7 +163,8 @@ def fit_cube(max_iter = 5000,
def main():
parser = argparse.ArgumentParser(description='Cube fit example')
parser.add_argument('--outdir', help='Specify output directory', default='')
parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
parser.add_argument('--outdir', help='specify output directory', default='')
parser.add_argument('--discontinuous', action='store_true', default=False)
parser.add_argument('--resolution', type=int, default=0, required=True)
parser.add_argument('--display-interval', type=int, default=0)
......@@ -188,7 +191,8 @@ def main():
out_dir=out_dir,
log_fn='log.txt',
mp4save_interval=args.mp4save_interval,
mp4save_fn='progress.mp4'
mp4save_fn='progress.mp4',
use_opengl=args.opengl
)
# Done.
......
......@@ -47,7 +47,7 @@ def fit_earth(max_iter = 20000,
display_res = 1024,
enable_mip = True,
res = 512,
ref_res = 4096,
ref_res = 2048, # Dropped from 4096 to 2048 to allow using the Cuda rasterizer.
lr_base = 1e-2,
lr_ramp = 0.1,
out_dir = None,
......@@ -55,7 +55,8 @@ def fit_earth(max_iter = 20000,
texsave_interval = None,
texsave_fn = None,
imgsave_interval = None,
imgsave_fn = None):
imgsave_fn = None,
use_opengl = False):
log_file = None
if out_dir:
......@@ -64,7 +65,7 @@ def fit_earth(max_iter = 20000,
log_file = open(out_dir + '/' + log_fn, 'wt')
else:
imgsave_interval, texsave_interval = None, None
# Mesh and texture adapted from "3D Earth Photorealistic 2K" model at
# https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125
datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
......@@ -86,7 +87,7 @@ def fit_earth(max_iter = 20000,
tex = torch.from_numpy(tex.astype(np.float32)).cuda()
tex_opt = torch.full(tex.shape, 0.2, device='cuda', requires_grad=True)
glctx = dr.RasterizeGLContext()
glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()
ang = 0.0
......@@ -177,8 +178,9 @@ def fit_earth(max_iter = 20000,
def main():
parser = argparse.ArgumentParser(description='Earth texture fitting example')
parser.add_argument('--outdir', help='Specify output directory', default='')
parser.add_argument('--mip', action='store_true', default=False)
parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
parser.add_argument('--outdir', help='specify output directory', default='')
parser.add_argument('--mip', help='enable mipmapping', action='store_true', default=False)
parser.add_argument('--display-interval', type=int, default=0)
parser.add_argument('--max-iter', type=int, default=10000)
args = parser.parse_args()
......@@ -193,7 +195,7 @@ def main():
print ('No output directory specified, not saving log or images')
# Run.
fit_earth(max_iter=args.max_iter, log_interval=10, display_interval=args.display_interval, enable_mip=args.mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png')
fit_earth(max_iter=args.max_iter, log_interval=10, display_interval=args.display_interval, enable_mip=args.mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png', use_opengl=args.opengl)
# Done.
print("Done.")
......
......@@ -32,7 +32,8 @@ def fit_env_phong(max_iter = 1000,
out_dir = None,
log_fn = None,
mp4save_interval = None,
mp4save_fn = None):
mp4save_fn = None,
use_opengl = False):
log_file = None
writer = None
......@@ -74,7 +75,7 @@ def fit_env_phong(max_iter = 1000,
# Render.
ang = 0.0
imgloss_avg, phong_avg = [], []
glctx = dr.RasterizeGLContext()
glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()
zero_tensor = torch.as_tensor(0.0, dtype=torch.float32, device='cuda')
one_tensor = torch.as_tensor(1.0, dtype=torch.float32, device='cuda')
......@@ -162,7 +163,7 @@ def fit_env_phong(max_iter = 1000,
if log_file:
log_file.write(s + '\n')
# Show/save result image.
display_image = display_interval and (it % display_interval == 0)
save_mp4 = mp4save_interval and (it % mp4save_interval == 0)
......@@ -193,7 +194,8 @@ def fit_env_phong(max_iter = 1000,
def main():
parser = argparse.ArgumentParser(description='Environment map fitting example')
parser.add_argument('--outdir', help='Specify output directory', default='')
parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
parser.add_argument('--outdir', help='specify output directory', default='')
parser.add_argument('--display-interval', type=int, default=0)
parser.add_argument('--mp4save-interval', type=int, default=10)
parser.add_argument('--max-iter', type=int, default=5000)
......@@ -214,7 +216,8 @@ def main():
display_interval=args.display_interval,
out_dir=out_dir,
mp4save_interval=args.mp4save_interval,
mp4save_fn='progress.mp4'
mp4save_fn='progress.mp4',
use_opengl=args.opengl
)
# Done.
......
......@@ -132,7 +132,8 @@ def fit_pose(max_iter = 10000,
out_dir = None,
log_fn = None,
mp4save_interval = None,
mp4save_fn = None):
mp4save_fn = None,
use_opengl = False):
log_file = None
writer = None
......@@ -160,7 +161,7 @@ def fit_pose(max_iter = 10000,
col_idx = torch.from_numpy(col_idx.astype(np.int32)).cuda()
vtx_col = torch.from_numpy(col.astype(np.float32)).cuda()
glctx = dr.RasterizeGLContext()
glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()
for rep in range(repeats):
pose_target = torch.tensor(q_rnd(), device='cuda')
......@@ -253,7 +254,8 @@ def fit_pose(max_iter = 10000,
def main():
parser = argparse.ArgumentParser(description='Cube pose fitting example')
parser.add_argument('--outdir', help='Specify output directory', default='')
parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
parser.add_argument('--outdir', help='specify output directory', default='')
parser.add_argument('--display-interval', type=int, default=0)
parser.add_argument('--mp4save-interval', type=int, default=10)
parser.add_argument('--max-iter', type=int, default=1000)
......@@ -277,7 +279,8 @@ def main():
out_dir=out_dir,
log_fn='log.txt',
mp4save_interval=args.mp4save_interval,
mp4save_fn='progress.mp4'
mp4save_fn='progress.mp4',
use_opengl=args.opengl
)
# Done.
......
......@@ -10,15 +10,23 @@ import imageio
import numpy as np
import torch
import nvdiffrast.torch as dr
import sys
def tensor(*args, **kwargs):
    # Convenience wrapper: construct a torch.Tensor directly on the CUDA device.
    return torch.tensor(*args, device='cuda', **kwargs)
# Select the rasterizer backend from the single command-line flag.
mode = sys.argv[1:]
if mode == ['--cuda']:
    glctx = dr.RasterizeCudaContext()
elif mode == ['--opengl']:
    glctx = dr.RasterizeGLContext()
else:
    print("Specify either --cuda or --opengl")
    exit(1)
# One triangle in clip space (x, y, z, w) with per-vertex RGB colors.
pos = tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=torch.float32)
col = tensor([[[1, 0, 0], [0, 1, 0], [0, 0, 1]]], dtype=torch.float32)
tri = tensor([[0, 1, 2]], dtype=torch.int32)
# NOTE(review): this line unconditionally overwrites the backend selected
# above — looks like a leftover removed line from a diff; verify upstream.
glctx = dr.RasterizeGLContext()
# Rasterize at 256x256, then interpolate vertex colors over the coverage.
rast, _ = dr.rasterize(glctx, pos, tri, resolution=[256, 256])
out, _ = dr.interpolate(col, rast, tri)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment