Commit a1ec436b authored by Samuli Laine

Add CUDA rasterizer

parent 78528e68
@@ -6,7 +6,7 @@
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
-#include "rasterize.h"
+#include "rasterize_gl.h"
#include "glutil.h"
#include <vector>
#define STRINGIFY_SHADER_SOURCE(x) #x
@@ -210,8 +210,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 1) out vec4 out_db;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
-in vec4 gl_FragCoord;
-out float gl_FragDepth;
)
void main()
{
@@ -233,8 +231,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 1) out vec4 out_db;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
-in vec4 gl_FragCoord;
-out float gl_FragDepth;
)
void main()
{
@@ -280,8 +276,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 0) out vec4 out_raster;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
-in vec4 gl_FragCoord;
-out float gl_FragDepth;
)
void main()
{
@@ -300,8 +294,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 0) out vec4 out_raster;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
-in vec4 gl_FragCoord;
-out float gl_FragDepth;
)
void main()
{
@@ -364,9 +356,9 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer));
}
-bool rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth)
+void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth)
{
-    bool changes = false;
+    changes = false;
// Resize vertex buffer?
if (posCount > s.posCount)
@@ -435,8 +427,6 @@ bool rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, in
changes = true;
}
-    return changes;
}
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx)
...
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
// Do not try to include OpenGL stuff when compiling CUDA kernels for torch.
#if !(defined(NVDR_TORCH) && defined(__CUDACC__))
#include "framework.h"
#include "glutil.h"
//------------------------------------------------------------------------
// OpenGL-related persistent state for forward op.
struct RasterizeGLState // Must be initializable by memset to zero.
{
int width; // Allocated frame buffer width.
int height; // Allocated frame buffer height.
int depth; // Allocated frame buffer depth.
int posCount; // Allocated position buffer in floats.
int triCount; // Allocated triangle buffer in ints.
GLContext glctx;
GLuint glFBO;
GLuint glColorBuffer[2];
GLuint glPrevOutBuffer;
GLuint glDepthStencilBuffer;
GLuint glVAO;
GLuint glTriBuffer;
GLuint glPosBuffer;
GLuint glProgram;
GLuint glProgramDP;
GLuint glVertexShader;
GLuint glGeometryShader;
GLuint glFragmentShader;
GLuint glFragmentShaderDP;
cudaGraphicsResource_t cudaColorBuffer[2];
cudaGraphicsResource_t cudaPrevOutBuffer;
cudaGraphicsResource_t cudaPosBuffer;
cudaGraphicsResource_t cudaTriBuffer;
int enableDB;
int enableZModify; // Modify depth in shader, workaround for a rasterization issue on A100.
};
//------------------------------------------------------------------------
// Shared C++ code prototypes.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth);
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx);
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s);
//------------------------------------------------------------------------
#endif // !(defined(NVDR_TORCH) && defined(__CUDACC__))
@@ -56,13 +56,25 @@ verbose = True # Print status messages to stdout.
# Internal helper funcs.

def _find_compiler_bindir():
+    hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+    if hostx64_paths != []:
+        return hostx64_paths[0]
    hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
    if hostx64_paths != []:
        return hostx64_paths[0]
+    hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+    if hostx64_paths != []:
+        return hostx64_paths[0]
    hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
    if hostx64_paths != []:
        return hostx64_paths[0]
+    hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+    if hostx64_paths != []:
+        return hostx64_paths[0]
    hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
    if hostx64_paths != []:
        return hostx64_paths[0]
+    hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+    if hostx64_paths != []:
+        return hostx64_paths[0]
    hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
...
@@ -21,7 +21,7 @@
#include "../common/common.cpp"
#include "../common/rasterize.h"
-#include "../common/rasterize.cpp"
+#include "../common/rasterize_gl.cpp"
#include "../common/rasterize.cu"
#include "tf_rasterize.cu"
...
@@ -74,14 +74,15 @@ struct RasterizeFwdOp : public OpKernel
setGLContext(m_glState.glctx); // (Re-)Activate GL context.

// Resize all buffers.
-rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.cpp
-// Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround.
-if (initCtx)
+bool changes = false;
+rasterizeResizeBuffers(ctx, m_glState, changes, posCount, triCount, width, height, depth); // In common/rasterize_gl.cpp
+if (changes)
{
-    // On first execution, do a bonus context swap.
+#ifdef _WIN32
+    // Workaround for occasional blank first frame on Windows.
    releaseGLContext();
    setGLContext(m_glState.glctx);
+#endif
}

// Copy input data to GL and render.
...
@@ -6,5 +6,5 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

-from .ops import RasterizeGLContext, get_log_level, set_log_level, rasterize, DepthPeeler, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash
+from .ops import RasterizeCudaContext, RasterizeGLContext, get_log_level, set_log_level, rasterize, DepthPeeler, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash
-__all__ = ["RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "DepthPeeler", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"]
+__all__ = ["RasterizeCudaContext", "RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "DepthPeeler", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"]
@@ -6,22 +6,23 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

+import importlib
import logging
import numpy as np
import os
-import sys
import torch
import torch.utils.cpp_extension

#----------------------------------------------------------------------------
# C++/Cuda plugin compiler/loader.

-_cached_plugin = None
-def _get_plugin():
+_cached_plugin = {}
+def _get_plugin(gl=False):
+    assert isinstance(gl, bool)

    # Return cached plugin if already loaded.
-    global _cached_plugin
-    if _cached_plugin is not None:
-        return _cached_plugin
+    if _cached_plugin.get(gl, None) is not None:
+        return _cached_plugin[gl]

    # Make sure we can find the necessary compiler and libary binaries.
    if os.name == 'nt':
@@ -29,7 +30,9 @@ def _get_plugin():
        def find_cl_path():
            import glob
            for edition in ['Enterprise', 'Professional', 'BuildTools', 'Community']:
-                paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True)
+                vs_relative_path = r"\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition
+                paths = sorted(glob.glob(r"C:\Program Files" + vs_relative_path), reverse=True)
+                paths += sorted(glob.glob(r"C:\Program Files (x86)" + vs_relative_path), reverse=True)
                if paths:
                    return paths[0]
@@ -43,35 +46,52 @@ def _get_plugin():
    # Compiler options.
    opts = ['-DNVDR_TORCH']

-    # Linker options.
-    if os.name == 'posix':
-        ldflags = ['-lGL', '-lEGL']
-    elif os.name == 'nt':
-        libs = ['gdi32', 'opengl32', 'user32', 'setgpu']
-        ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]
+    # Linker options for the GL-interfacing plugin.
+    ldflags = []
+    if gl:
+        if os.name == 'posix':
+            ldflags = ['-lGL', '-lEGL']
+        elif os.name == 'nt':
+            libs = ['gdi32', 'opengl32', 'user32', 'setgpu']
+            ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]

    # List of source files.
-    source_files = [
-        '../common/common.cpp',
-        '../common/glutil.cpp',
-        '../common/rasterize.cu',
-        '../common/rasterize.cpp',
-        '../common/interpolate.cu',
-        '../common/texture.cu',
-        '../common/texture.cpp',
-        '../common/antialias.cu',
-        'torch_bindings.cpp',
-        'torch_rasterize.cpp',
-        'torch_interpolate.cpp',
-        'torch_texture.cpp',
-        'torch_antialias.cpp',
-    ]
+    if gl:
+        source_files = [
+            '../common/common.cpp',
+            '../common/glutil.cpp',
+            '../common/rasterize_gl.cpp',
+            'torch_bindings_gl.cpp',
+            'torch_rasterize_gl.cpp',
+        ]
+    else:
+        source_files = [
+            '../common/cudaraster/impl/Buffer.cpp',
+            '../common/cudaraster/impl/CudaRaster.cpp',
+            '../common/cudaraster/impl/RasterImpl.cu',
+            '../common/cudaraster/impl/RasterImpl.cpp',
+            '../common/common.cpp',
+            '../common/rasterize.cu',
+            '../common/interpolate.cu',
+            '../common/texture.cu',
+            '../common/texture.cpp',
+            '../common/antialias.cu',
+            'torch_bindings.cpp',
+            'torch_rasterize.cpp',
+            'torch_interpolate.cpp',
+            'torch_texture.cpp',
+            'torch_antialias.cpp',
+        ]

    # Some containers set this to contain old architectures that won't compile. We only need the one installed in the machine.
    os.environ['TORCH_CUDA_ARCH_LIST'] = ''

+    # On Linux, show a warning if GLEW is being forcibly loaded when compiling the GL plugin.
+    if gl and (os.name == 'posix') and ('libGLEW' in os.environ.get('LD_PRELOAD', '')):
+        logging.getLogger('nvdiffrast').warning("Warning: libGLEW is being loaded via LD_PRELOAD, and will probably conflict with the OpenGL plugin")

    # Try to detect if a stray lock file is left in cache directory and show a warning. This sometimes happens on Windows if the build is interrupted at just the right moment.
-    plugin_name = 'nvdiffrast_plugin'
+    plugin_name = 'nvdiffrast_plugin' + ('_gl' if gl else '')
    try:
        lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory(plugin_name, False), 'lock')
        if os.path.exists(lock_fn):
@@ -79,14 +99,27 @@ def _get_plugin():
    except:
        pass

+    # Speed up compilation on Windows.
+    if os.name == 'nt':
+        # Skip telemetry sending step in vcvarsall.bat
+        os.environ['VSCMD_SKIP_SENDTELEMETRY'] = '1'

+        # Opportunistically patch distutils to cache MSVC environments.
+        try:
+            import distutils._msvccompiler
+            import functools
+            if not hasattr(distutils._msvccompiler._get_vc_env, '__wrapped__'):
+                distutils._msvccompiler._get_vc_env = functools.lru_cache()(distutils._msvccompiler._get_vc_env)
+        except:
+            pass

    # Compile and load.
    source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files]
-    torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=opts, extra_cuda_cflags=opts, extra_ldflags=ldflags, with_cuda=True, verbose=False)
+    torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=opts, extra_cuda_cflags=opts+['-lineinfo'], extra_ldflags=ldflags, with_cuda=True, verbose=False)

    # Import, cache, and return the compiled module.
-    import nvdiffrast_plugin
-    _cached_plugin = nvdiffrast_plugin
-    return _cached_plugin
+    _cached_plugin[gl] = importlib.import_module(plugin_name)
+    return _cached_plugin[gl]

#----------------------------------------------------------------------------
# Log level.
@@ -118,7 +151,35 @@ def set_log_level(level):
    _get_plugin().set_log_level(level)

#----------------------------------------------------------------------------
-# GL State wrapper.
+# CudaRaster state wrapper.
#----------------------------------------------------------------------------
class RasterizeCudaContext:
def __init__(self, device=None):
'''Create a new Cuda rasterizer context.
The context is deleted and internal storage is released when the object is
destroyed.
Args:
device (Optional): Cuda device on which the context is created. Type can be
`torch.device`, string (e.g., `'cuda:1'`), or int. If not
specified, context will be created on currently active Cuda
device.
Returns:
The newly created Cuda rasterizer context.
'''
if device is None:
cuda_device_idx = torch.cuda.current_device()
else:
with torch.cuda.device(device):
cuda_device_idx = torch.cuda.current_device()
self.cpp_wrapper = _get_plugin().RasterizeCRStateWrapper(cuda_device_idx)
self.output_db = True
self.active_depth_peeler = None
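A minimal usage sketch for the new context, following the docstring above (an illustration, not part of the commit; assumes at least one CUDA device is visible):

import nvdiffrast.torch as dr

# The device argument is optional; omit it to use the currently active CUDA device.
ctx = dr.RasterizeCudaContext(device='cuda:0')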
#----------------------------------------------------------------------------
# GL state wrapper.
#----------------------------------------------------------------------------

class RasterizeGLContext:
@@ -157,8 +218,8 @@ class RasterizeGLContext:
        else:
            with torch.cuda.device(device):
                cuda_device_idx = torch.cuda.current_device()
-        self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)
+        self.cpp_wrapper = _get_plugin(gl=True).RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)
-        self.active_depth_peeler = None # For error checking only
+        self.active_depth_peeler = None # For error checking only.

    def set_context(self):
        '''Set (activate) OpenGL context in the current CPU thread.
@@ -180,8 +241,11 @@ class RasterizeGLContext:

class _rasterize_func(torch.autograd.Function):
    @staticmethod
-    def forward(ctx, glctx, pos, tri, resolution, ranges, grad_db, peeling_idx):
-        out, out_db = _get_plugin().rasterize_fwd(glctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx)
+    def forward(ctx, raster_ctx, pos, tri, resolution, ranges, grad_db, peeling_idx):
+        if isinstance(raster_ctx, RasterizeGLContext):
+            out, out_db = _get_plugin(gl=True).rasterize_fwd_gl(raster_ctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx)
+        else:
+            out, out_db = _get_plugin().rasterize_fwd_cuda(raster_ctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx)
        ctx.save_for_backward(pos, tri, out)
        ctx.saved_grad_db = grad_db
        return out, out_db
@@ -204,7 +268,7 @@ def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
    output tensors will be contiguous and reside in GPU memory.

    Args:
-        glctx: OpenGL context of type `RasterizeGLContext`.
+        glctx: Rasterizer context of type `RasterizeGLContext` or `RasterizeCudaContext`.
        pos: Vertex position tensor with dtype `torch.float32`. To enable range
             mode, this tensor should have a 2D shape [num_vertices, 4]. To enable
             instanced mode, use a 3D shape [minibatch_size, num_vertices, 4].
@@ -214,8 +278,8 @@ def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
                `torch.int32`, specifying start indices and counts into `tri`.
                Ignored in instanced mode.
        grad_db: Propagate gradients of image-space derivatives of barycentrics
-                 into `pos` in backward pass. Ignored if OpenGL context was
-                 not configured to output image-space derivatives.
+                 into `pos` in backward pass. Ignored if using an OpenGL context that
+                 was not configured to output image-space derivatives.

    Returns:
        A tuple of two tensors. The first output tensor has shape [minibatch_size,
@@ -226,7 +290,7 @@ def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
        (du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape
        [minibatch_size, height, width, 0].
    '''
-    assert isinstance(glctx, RasterizeGLContext)
+    assert isinstance(glctx, (RasterizeGLContext, RasterizeCudaContext))
    assert grad_db is True or grad_db is False
    grad_db = grad_db and glctx.output_db
@@ -258,7 +322,7 @@ class DepthPeeler:
        Returns:
            The newly created depth peeler.
        '''
-        assert isinstance(glctx, RasterizeGLContext)
+        assert isinstance(glctx, (RasterizeGLContext, RasterizeCudaContext))
        assert grad_db is True or grad_db is False
        grad_db = grad_db and glctx.output_db
@@ -271,7 +335,7 @@ class DepthPeeler:
        assert isinstance(ranges, torch.Tensor)

        # Store all the parameters.
-        self.glctx = glctx
+        self.raster_ctx = glctx
        self.pos = pos
        self.tri = tri
        self.resolution = resolution
@@ -280,18 +344,18 @@ class DepthPeeler:
        self.peeling_idx = None

    def __enter__(self):
-        if self.glctx is None:
+        if self.raster_ctx is None:
            raise RuntimeError("Cannot re-enter a terminated depth peeling operation")
-        if self.glctx.active_depth_peeler is not None:
-            raise RuntimeError("Cannot have multiple depth peelers active simultaneously in a RasterizeGLContext")
-        self.glctx.active_depth_peeler = self
+        if self.raster_ctx.active_depth_peeler is not None:
+            raise RuntimeError("Cannot have multiple depth peelers active simultaneously in a rasterization context")
+        self.raster_ctx.active_depth_peeler = self
        self.peeling_idx = 0
        return self

    def __exit__(self, *args):
-        assert self.glctx.active_depth_peeler is self
-        self.glctx.active_depth_peeler = None
-        self.glctx = None # Remove all references to input tensor so they're not left dangling.
+        assert self.raster_ctx.active_depth_peeler is self
+        self.raster_ctx.active_depth_peeler = None
+        self.raster_ctx = None # Remove all references to input tensor so they're not left dangling.
        self.pos = None
        self.tri = None
        self.resolution = None
@@ -309,9 +373,9 @@ class DepthPeeler:
        Returns:
            A tuple of two tensors as in `rasterize()`.
        '''
-        assert self.glctx.active_depth_peeler is self
+        assert self.raster_ctx.active_depth_peeler is self
        assert self.peeling_idx >= 0
-        result = _rasterize_func.apply(self.glctx, self.pos, self.tri, self.resolution, self.ranges, self.grad_db, self.peeling_idx)
+        result = _rasterize_func.apply(self.raster_ctx, self.pos, self.tri, self.resolution, self.ranges, self.grad_db, self.peeling_idx)
        self.peeling_idx += 1
        return result
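The depth peeler is used the same way with either backend; a minimal sketch of the context-manager pattern (illustrative only; `pos`, `tri`, and the resolution are assumed to be set up as for `rasterize()`, and `dr` is `nvdiffrast.torch`):

# Peel the two nearest depth layers with the new CUDA context.
ctx = dr.RasterizeCudaContext()
with dr.DepthPeeler(ctx, pos, tri, resolution=[256, 256]) as peeler:
    rast_first,  db_first  = peeler.rasterize_next_layer()  # nearest surfaces
    rast_second, db_second = peeler.rasterize_next_layer()  # next-nearest surfaces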
@@ -604,6 +668,14 @@ def antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0)
All input tensors must be contiguous and reside in GPU memory. The output tensor
will be contiguous and reside in GPU memory.
Note that silhouette edge determination is based on vertex indices in the triangle
tensor. For it to work properly, a vertex belonging to multiple triangles must be
referred to using the same vertex index in each triangle. Otherwise, nvdiffrast will always
classify the adjacent edges as silhouette edges, which leads to bad performance and
potentially incorrect gradients. If you are unsure whether your data is good, check
which pixels are modified by the antialias operation and compare to the example in the
documentation.
Args:
    color: Input image to antialias with shape [minibatch_size, height, width, num_channels].
    rast: Main output tensor from `rasterize()`.
...
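Relating to the silhouette-edge note added to the antialias docstring above: a quick way to see which pixels the operation actually touched is to compare its output against the input color buffer. A rough sketch (variable names are illustrative; `color` is assumed to come from `interpolate()` and `rast`, `pos`, `tri` from `rasterize()`):

# Visualize antialias coverage to sanity-check vertex index sharing.
color_aa = dr.antialias(color, rast, pos, tri)
modified = (color_aa != color).any(dim=-1)          # [minibatch, height, width] boolean mask
print('antialiased pixels:', int(modified.sum()))   # should hug silhouette edges only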
@@ -20,7 +20,7 @@
#define OP_RETURN_TTV std::tuple<torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >
#define OP_RETURN_TTTTV std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >

-OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int depth_idx);
+OP_RETURN_TT rasterize_fwd_cuda (RasterizeCRStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx);
OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
OP_RETURN_T rasterize_grad_db (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb);
OP_RETURN_TT interpolate_fwd (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri);
@@ -42,9 +42,7 @@ OP_RETURN_TT antialias_grad (torch::Tensor color, to
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// State classes.
-pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
-    .def("set_context", &RasterizeGLStateWrapper::setContext)
-    .def("release_context", &RasterizeGLStateWrapper::releaseContext);
+pybind11::class_<RasterizeCRStateWrapper>(m, "RasterizeCRStateWrapper").def(pybind11::init<int>());
pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper").def(pybind11::init<>());
pybind11::class_<TopologyHashWrapper>(m, "TopologyHashWrapper");
@@ -53,7 +51,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("set_log_level", [](int level){ FLAGS_caffe2_log_level = level; }, "set log level");

// Ops.
-m.def("rasterize_fwd", &rasterize_fwd, "rasterize forward op");
+m.def("rasterize_fwd_cuda", &rasterize_fwd_cuda, "rasterize forward op (cuda)");
m.def("rasterize_grad", &rasterize_grad, "rasterize gradient op ignoring db gradients");
m.def("rasterize_grad_db", &rasterize_grad_db, "rasterize gradient op with db gradients");
m.def("interpolate_fwd", &interpolate_fwd, "interpolate forward op with attribute derivatives");
...
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include <tuple>
//------------------------------------------------------------------------
// Op prototypes.
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_gl(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx);
//------------------------------------------------------------------------
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// State classes.
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
.def("set_context", &RasterizeGLStateWrapper::setContext)
.def("release_context", &RasterizeGLStateWrapper::releaseContext);
// Ops.
m.def("rasterize_fwd_gl", &rasterize_fwd_gl, "rasterize forward op (opengl)");
}
//------------------------------------------------------------------------
@@ -10,55 +10,41 @@
#include "torch_types.h"
#include "../common/common.h"
#include "../common/rasterize.h"
+#include "../common/cudaraster/CudaRaster.hpp"
+#include "../common/cudaraster/impl/Constants.hpp"
#include <tuple>

//------------------------------------------------------------------------
// Kernel prototypes.

+void RasterizeCudaFwdShaderKernel(const RasterizeCudaFwdShaderParams p);
void RasterizeGradKernel(const RasterizeGradParams p);
void RasterizeGradKernelDb(const RasterizeGradParams p);

//------------------------------------------------------------------------
-// Python GL state wrapper methods.
+// Python CudaRaster state wrapper methods.

-RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
+RasterizeCRStateWrapper::RasterizeCRStateWrapper(int cudaDeviceIdx_)
{
-    pState = new RasterizeGLState();
-    automatic = automatic_;
+    const at::cuda::OptionalCUDAGuard device_guard(cudaDeviceIdx_);
    cudaDeviceIdx = cudaDeviceIdx_;
-    memset(pState, 0, sizeof(RasterizeGLState));
-    pState->enableDB = enableDB ? 1 : 0;
-    rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
-    releaseGLContext();
+    cr = new CR::CudaRaster();
}

-RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void)
+RasterizeCRStateWrapper::~RasterizeCRStateWrapper(void)
{
-    setGLContext(pState->glctx);
-    rasterizeReleaseBuffers(NVDR_CTX_PARAMS, *pState);
-    releaseGLContext();
-    destroyGLContext(pState->glctx);
-    delete pState;
-}
-
-void RasterizeGLStateWrapper::setContext(void)
-{
-    setGLContext(pState->glctx);
-}
-
-void RasterizeGLStateWrapper::releaseContext(void)
-{
-    releaseGLContext();
+    const at::cuda::OptionalCUDAGuard device_guard(cudaDeviceIdx);
+    delete cr;
}

//------------------------------------------------------------------------
-// Forward op.
+// Forward op (Cuda).

-std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx)
+std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_cuda(RasterizeCRStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx)
{
    const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    RasterizeGLState& s = *stateWrapper.pState;
+    CR::CudaRaster* cr = stateWrapper.cr;

    // Check inputs.
    NVDR_CHECK_DEVICE(pos, tri);
@@ -67,11 +53,8 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
    NVDR_CHECK_F32(pos);
    NVDR_CHECK_I32(tri, ranges);

-    // Check that GL context was created for the correct GPU.
-    NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must must reside on the same device as input tensors");
-
-    // Determine number of outputs
-    int num_outputs = s.enableDB ? 2 : 1;
+    // Check that CudaRaster context was created for the correct GPU.
+    NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "CudaRaster context must must reside on the same device as input tensors");

    // Determine instance mode and check input dimensions.
    bool instance_mode = pos.sizes().size() > 2;
@@ -87,49 +70,75 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
    // Get output shape.
    int height = std::get<0>(resolution);
    int width = std::get<1>(resolution);
-    int depth = instance_mode ? pos.size(0) : ranges.size(0);
+    int depth = instance_mode ? pos.size(0) : ranges.size(0); // Depth of tensor, not related to depth buffering.
    NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]");

-    // Get position and triangle buffer sizes in int32/float32.
-    int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1);
-    int triCount = 3 * tri.size(0);
+    // Check resolution compatibility with CudaRaster.
+    TORCH_CHECK(height <= CR_MAXVIEWPORT_SIZE && width <= CR_MAXVIEWPORT_SIZE, "resolution must be [<=", CR_MAXVIEWPORT_SIZE, ", <=", CR_MAXVIEWPORT_SIZE, "]");
+    TORCH_CHECK(((height | width) & (CR_TILE_SIZE - 1)) == 0, "width and height must be divisible by ", CR_TILE_SIZE);

-    // Set the GL context unless manual context.
-    if (stateWrapper.automatic)
-        setGLContext(s.glctx);
+    // Get position and triangle buffer sizes in vertices / triangles.
+    int posCount = instance_mode ? pos.size(1) : pos.size(0);
+    int triCount = tri.size(0);

-    // Resize all buffers.
-    if (rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, posCount, triCount, width, height, depth))
-    {
-#ifdef _WIN32
-        // Workaround for occasional blank first frame on Windows.
-        releaseGLContext();
-        setGLContext(s.glctx);
-#endif
-    }
-
-    // Copy input data to GL and render.
+    // Render.
    const float* posPtr = pos.data_ptr<float>();
    const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr<int32_t>(); // This is in CPU memory.
    const int32_t* triPtr = tri.data_ptr<int32_t>();
-    int vtxPerInstance = instance_mode ? pos.size(1) : 0;
-    rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, peeling_idx);

+    // Set up CudaRaster.
+    cr->setViewportSize(width, height, depth);
+    cr->setVertexBuffer((void*)posPtr, posCount);
+    cr->setIndexBuffer((void*)triPtr, triCount);

+    // Enable depth peeling?
+    bool enablePeel = (peeling_idx > 0);
+    cr->setRenderModeFlags(enablePeel ? CR::CudaRaster::RenderModeFlag_EnableDepthPeeling : 0); // No backface culling.
+    if (enablePeel)
+        cr->swapDepthAndPeel(); // Use previous depth buffer as peeling depth input.

+    // Run CudaRaster in one large batch. In case of error, the workload could be split into smaller batches - maybe do that in the future.
+    cr->deferredClear(0u);
+    bool success = cr->drawTriangles(rangesPtr, stream);
+    NVDR_CHECK(success, "subtriangle count overflow");

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({depth, height, width, 4}, opts);
-    torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts);
-    float* outputPtr[2];
-    outputPtr[0] = out.data_ptr<float>();
-    outputPtr[1] = s.enableDB ? out_db.data_ptr<float>() : NULL;
+    torch::Tensor out_db = torch::empty({depth, height, width, 4}, opts);

-    // Copy rasterized results into CUDA buffers.
-    rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth);
+    // Populate pixel shader kernel parameters.
+    RasterizeCudaFwdShaderParams p;
+    p.pos = posPtr;
+    p.tri = triPtr;
+    p.in_idx = (const int*)cr->getColorBuffer();
+    p.out = out.data_ptr<float>();
+    p.out_db = out_db.data_ptr<float>();
+    p.numTriangles = triCount;
+    p.numVertices = posCount;
+    p.width = width;
+    p.height = height;
+    p.depth = depth;
+    p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
+    p.xs = 2.f / (float)p.width;
+    p.xo = 1.f / (float)p.width - 1.f;
+    p.ys = 2.f / (float)p.height;
+    p.yo = 1.f / (float)p.height - 1.f;

-    // Done. Release GL context and return.
-    if (stateWrapper.automatic)
-        releaseGLContext();
+    // Verify that buffers are aligned to allow float2/float4 operations.
+    NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4");
+    NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4");
+    NVDR_CHECK(!((uintptr_t)p.out_db & 15), "out_db output tensor not aligned to float4");

+    // Choose launch parameters.
+    dim3 blockSize = getLaunchBlockSize(RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH, RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT, p.width, p.height);
+    dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);

+    // Launch CUDA kernel.
+    void* args[] = {&p};
+    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)RasterizeCudaFwdShaderKernel, gridSize, blockSize, args, 0, stream));

+    // Return.
    return std::tuple<torch::Tensor, torch::Tensor>(out, out_db);
}
...
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include "../common/common.h"
#include "../common/rasterize_gl.h"
#include <tuple>
//------------------------------------------------------------------------
// Python GL state wrapper methods.
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
{
pState = new RasterizeGLState();
automatic = automatic_;
cudaDeviceIdx = cudaDeviceIdx_;
memset(pState, 0, sizeof(RasterizeGLState));
pState->enableDB = enableDB ? 1 : 0;
rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
releaseGLContext();
}
RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void)
{
setGLContext(pState->glctx);
rasterizeReleaseBuffers(NVDR_CTX_PARAMS, *pState);
releaseGLContext();
destroyGLContext(pState->glctx);
delete pState;
}
void RasterizeGLStateWrapper::setContext(void)
{
setGLContext(pState->glctx);
}
void RasterizeGLStateWrapper::releaseContext(void)
{
releaseGLContext();
}
//------------------------------------------------------------------------
// Forward op (OpenGL).
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_gl(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGLState& s = *stateWrapper.pState;
// Check inputs.
NVDR_CHECK_DEVICE(pos, tri);
NVDR_CHECK_CPU(ranges);
NVDR_CHECK_CONTIGUOUS(pos, tri, ranges);
NVDR_CHECK_F32(pos);
NVDR_CHECK_I32(tri, ranges);
// Check that GL context was created for the correct GPU.
NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must must reside on the same device as input tensors");
// Determine number of outputs
int num_outputs = s.enableDB ? 2 : 1;
// Determine instance mode and check input dimensions.
bool instance_mode = pos.sizes().size() > 2;
if (instance_mode)
NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "instance mode - pos must have shape [>0, >0, 4]");
else
{
NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "range mode - pos must have shape [>0, 4]");
NVDR_CHECK(ranges.sizes().size() == 2 && ranges.size(0) > 0 && ranges.size(1) == 2, "range mode - ranges must have shape [>0, 2]");
}
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
// Get output shape.
int height = std::get<0>(resolution);
int width = std::get<1>(resolution);
int depth = instance_mode ? pos.size(0) : ranges.size(0);
NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]");
// Get position and triangle buffer sizes in int32/float32.
int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1);
int triCount = 3 * tri.size(0);
// Set the GL context unless manual context.
if (stateWrapper.automatic)
setGLContext(s.glctx);
// Resize all buffers.
bool changes = false;
rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, changes, posCount, triCount, width, height, depth);
if (changes)
{
#ifdef _WIN32
// Workaround for occasional blank first frame on Windows.
releaseGLContext();
setGLContext(s.glctx);
#endif
}
// Copy input data to GL and render.
const float* posPtr = pos.data_ptr<float>();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr<int32_t>(); // This is in CPU memory.
const int32_t* triPtr = tri.data_ptr<int32_t>();
int vtxPerInstance = instance_mode ? pos.size(1) : 0;
rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, peeling_idx);
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({depth, height, width, 4}, opts);
torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts);
float* outputPtr[2];
outputPtr[0] = out.data_ptr<float>();
outputPtr[1] = s.enableDB ? out_db.data_ptr<float>() : NULL;
// Copy rasterized results into CUDA buffers.
rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth);
// Done. Release GL context and return.
if (stateWrapper.automatic)
releaseGLContext();
return std::tuple<torch::Tensor, torch::Tensor>(out, out_db);
}
//------------------------------------------------------------------------
@@ -26,6 +26,20 @@ public:
int cudaDeviceIdx;
};
//------------------------------------------------------------------------
// Python CudaRaster state wrapper.
namespace CR { class CudaRaster; }
class RasterizeCRStateWrapper
{
public:
RasterizeCRStateWrapper (int cudaDeviceIdx);
~RasterizeCRStateWrapper (void);
CR::CudaRaster* cr;
int cudaDeviceIdx;
};
//------------------------------------------------------------------------
// Mipmap wrapper to prevent intrusion from Python side.
......
@@ -48,7 +48,8 @@ def fit_cube(max_iter = 5000,
             out_dir = None,
             log_fn = None,
             mp4save_interval = None,
-             mp4save_fn = None):
+             mp4save_fn = None,
+             use_opengl = False):

    log_file = None
    writer = None
@@ -73,7 +74,8 @@ def fit_cube(max_iter = 5000,
    vtx_pos = torch.from_numpy(vtxp.astype(np.float32)).cuda()
    vtx_col = torch.from_numpy(vtxc.astype(np.float32)).cuda()

-    glctx = dr.RasterizeGLContext()
+    # Rasterizer context
+    glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()

    # Repeats.
    for rep in range(repeats):
@@ -161,7 +163,8 @@ def fit_cube(max_iter = 5000,
def main():
    parser = argparse.ArgumentParser(description='Cube fit example')
-    parser.add_argument('--outdir', help='Specify output directory', default='')
+    parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
+    parser.add_argument('--outdir', help='specify output directory', default='')
    parser.add_argument('--discontinuous', action='store_true', default=False)
    parser.add_argument('--resolution', type=int, default=0, required=True)
    parser.add_argument('--display-interval', type=int, default=0)
@@ -188,7 +191,8 @@ def main():
        out_dir=out_dir,
        log_fn='log.txt',
        mp4save_interval=args.mp4save_interval,
-        mp4save_fn='progress.mp4'
+        mp4save_fn='progress.mp4',
+        use_opengl=args.opengl
    )

    # Done.
...
@@ -47,7 +47,7 @@ def fit_earth(max_iter = 20000,
              display_res = 1024,
              enable_mip = True,
              res = 512,
-              ref_res = 4096,
+              ref_res = 2048, # Dropped from 4096 to 2048 to allow using the Cuda rasterizer.
              lr_base = 1e-2,
              lr_ramp = 0.1,
              out_dir = None,
@@ -55,7 +55,8 @@ def fit_earth(max_iter = 20000,
              texsave_interval = None,
              texsave_fn = None,
              imgsave_interval = None,
-              imgsave_fn = None):
+              imgsave_fn = None,
+              use_opengl = False):

    log_file = None
    if out_dir:
@@ -64,7 +65,7 @@ def fit_earth(max_iter = 20000,
        log_file = open(out_dir + '/' + log_fn, 'wt')
    else:
        imgsave_interval, texsave_interval = None, None

    # Mesh and texture adapted from "3D Earth Photorealistic 2K" model at
    # https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125
    datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
@@ -86,7 +87,7 @@ def fit_earth(max_iter = 20000,
    tex = torch.from_numpy(tex.astype(np.float32)).cuda()
    tex_opt = torch.full(tex.shape, 0.2, device='cuda', requires_grad=True)
-    glctx = dr.RasterizeGLContext()
+    glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()

    ang = 0.0
@@ -177,8 +178,9 @@ def fit_earth(max_iter = 20000,
def main():
    parser = argparse.ArgumentParser(description='Earth texture fitting example')
-    parser.add_argument('--outdir', help='Specify output directory', default='')
-    parser.add_argument('--mip', action='store_true', default=False)
+    parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
+    parser.add_argument('--outdir', help='specify output directory', default='')
+    parser.add_argument('--mip', help='enable mipmapping', action='store_true', default=False)
    parser.add_argument('--display-interval', type=int, default=0)
    parser.add_argument('--max-iter', type=int, default=10000)
    args = parser.parse_args()
@@ -193,7 +195,7 @@ def main():
        print ('No output directory specified, not saving log or images')

    # Run.
-    fit_earth(max_iter=args.max_iter, log_interval=10, display_interval=args.display_interval, enable_mip=args.mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png')
+    fit_earth(max_iter=args.max_iter, log_interval=10, display_interval=args.display_interval, enable_mip=args.mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png', use_opengl=args.opengl)

    # Done.
    print("Done.")
...
@@ -32,7 +32,8 @@ def fit_env_phong(max_iter = 1000,
                  out_dir = None,
                  log_fn = None,
                  mp4save_interval = None,
-                  mp4save_fn = None):
+                  mp4save_fn = None,
+                  use_opengl = False):

    log_file = None
    writer = None
@@ -74,7 +75,7 @@ def fit_env_phong(max_iter = 1000,
    # Render.
    ang = 0.0
    imgloss_avg, phong_avg = [], []
-    glctx = dr.RasterizeGLContext()
+    glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()
    zero_tensor = torch.as_tensor(0.0, dtype=torch.float32, device='cuda')
    one_tensor = torch.as_tensor(1.0, dtype=torch.float32, device='cuda')
@@ -162,7 +163,7 @@ def fit_env_phong(max_iter = 1000,
        if log_file:
            log_file.write(s + '\n')

        # Show/save result image.
        display_image = display_interval and (it % display_interval == 0)
        save_mp4 = mp4save_interval and (it % mp4save_interval == 0)
@@ -193,7 +194,8 @@ def main():
def main():
    parser = argparse.ArgumentParser(description='Environment map fitting example')
-    parser.add_argument('--outdir', help='Specify output directory', default='')
+    parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
+    parser.add_argument('--outdir', help='specify output directory', default='')
    parser.add_argument('--display-interval', type=int, default=0)
    parser.add_argument('--mp4save-interval', type=int, default=10)
    parser.add_argument('--max-iter', type=int, default=5000)
@@ -214,7 +216,8 @@ def main():
        display_interval=args.display_interval,
        out_dir=out_dir,
        mp4save_interval=args.mp4save_interval,
-        mp4save_fn='progress.mp4'
+        mp4save_fn='progress.mp4',
+        use_opengl=args.opengl
    )

    # Done.
...
@@ -132,7 +132,8 @@ def fit_pose(max_iter = 10000,
             out_dir = None,
             log_fn = None,
             mp4save_interval = None,
-             mp4save_fn = None):
+             mp4save_fn = None,
+             use_opengl = False):

    log_file = None
    writer = None
@@ -160,7 +161,7 @@ def fit_pose(max_iter = 10000,
    col_idx = torch.from_numpy(col_idx.astype(np.int32)).cuda()
    vtx_col = torch.from_numpy(col.astype(np.float32)).cuda()
-    glctx = dr.RasterizeGLContext()
+    glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()

    for rep in range(repeats):
        pose_target = torch.tensor(q_rnd(), device='cuda')
@@ -253,7 +254,8 @@
def main():
    parser = argparse.ArgumentParser(description='Cube pose fitting example')
-    parser.add_argument('--outdir', help='Specify output directory', default='')
+    parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
+    parser.add_argument('--outdir', help='specify output directory', default='')
    parser.add_argument('--display-interval', type=int, default=0)
    parser.add_argument('--mp4save-interval', type=int, default=10)
    parser.add_argument('--max-iter', type=int, default=1000)
@@ -277,7 +279,8 @@ def main():
        out_dir=out_dir,
        log_fn='log.txt',
        mp4save_interval=args.mp4save_interval,
-        mp4save_fn='progress.mp4'
+        mp4save_fn='progress.mp4',
+        use_opengl=args.opengl
    )

    # Done.
...
@@ -10,15 +10,23 @@ import imageio
import numpy as np
import torch
import nvdiffrast.torch as dr
+import sys

def tensor(*args, **kwargs):
    return torch.tensor(*args, device='cuda', **kwargs)

+if sys.argv[1:] == ['--cuda']:
+    glctx = dr.RasterizeCudaContext()
+elif sys.argv[1:] == ['--opengl']:
+    glctx = dr.RasterizeGLContext()
+else:
+    print("Specify either --cuda or --opengl")
+    exit(1)

pos = tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=torch.float32)
col = tensor([[[1, 0, 0], [0, 1, 0], [0, 0, 1]]], dtype=torch.float32)
tri = tensor([[0, 1, 2]], dtype=torch.int32)

-glctx = dr.RasterizeGLContext()
rast, _ = dr.rasterize(glctx, pos, tri, resolution=[256, 256])
out, _ = dr.interpolate(col, rast, tri)
...