"src/include/functional.hpp" did not exist on "eafdabba771fe3a41843b86c85f6e574b2d176e1"
Commit ce7063f1 authored by Samuli Laine

Support for multiple GPUs, mip bias input for texture op

parent 2468e2a0
This diff is collapsed.
......@@ -6,4 +6,4 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
__version__ = '0.2.0'
__version__ = '0.2.1'
......@@ -185,6 +185,8 @@ template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
//------------------------------------------------------------------------
// Coalesced atomics. These are all done via macros.
#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
#define CA_TEMP _ca_temp
#define CA_TEMP_PARAM float* CA_TEMP
#define CA_DECLARE_TEMP(threads_per_block) \
......@@ -228,5 +230,24 @@ template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
caAtomicAdd((ptr)+(idx), (value)); \
} while(0)
//------------------------------------------------------------------------
// Disable atomic coalescing for compute capability lower than 7.x
#else // __CUDA_ARCH__ >= 700
#define CA_TEMP _ca_temp
#define CA_TEMP_PARAM float CA_TEMP
#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
#define CA_SET_GROUP_MASK(group, thread_mask)
#define CA_SET_GROUP(group)
#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
#define caAtomicAdd3_xyw(ptr, x, y, w) \
do { \
atomicAdd((ptr), (x)); \
atomicAdd((ptr)+1, (y)); \
atomicAdd((ptr)+3, (w)); \
} while(0)
#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
#endif // __CUDA_ARCH__ >= 700
//------------------------------------------------------------------------
#endif // __CUDACC__
......@@ -36,6 +36,7 @@ using namespace tensorflow::shape_inference;
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>
#include <c10/cuda/CUDAGuard.h>
#include <pybind11/numpy.h>
#endif
#define NVDR_CTX_ARGS int _nvdr_ctx_dummy
......
......@@ -37,26 +37,43 @@ struct GLContext
static void setGLContext(GLContext& glctx)
{
if (!glctx.hglrc)
LOG(ERROR) << "setGLContext() called with null gltcx";
LOG(FATAL) << "setGLContext() called with null gltcx";
if (!wglMakeCurrent(glctx.hdc, glctx.hglrc))
LOG(ERROR) << "wglMakeCurrent() failed when setting GL context";
LOG(FATAL) << "wglMakeCurrent() failed when setting GL context";
if (glctx.glewInitialized)
return;
GLenum result = glewInit();
if (result != GLEW_OK)
LOG(ERROR) << "glewInit() failed, return value = " << result;
LOG(FATAL) << "glewInit() failed, return value = " << result;
glctx.glewInitialized = 1;
}
static void releaseGLContext(void)
{
if (!wglMakeCurrent(NULL, NULL))
LOG(ERROR) << "wglMakeCurrent() failed when releasing GL context";
LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context";
}
static GLContext createGLContext(void)
extern "C" int set_gpu(const char*);
static GLContext createGLContext(int cudaDeviceIdx)
{
if (cudaDeviceIdx >= 0)
{
char pciBusId[256] = "";
LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx) != CUDA_SUCCESS)
{
LOG(INFO) << "PCI bus id query failed";
}
else
{
int res = set_gpu(pciBusId);
LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? "failed, expect crash or major slowdown" : "success");
}
}
HINSTANCE hInstance = GetModuleHandle(NULL);
WNDCLASS wc = {};
wc.style = CS_OWNDC;
......@@ -101,7 +118,7 @@ static GLContext createGLContext(void)
static void destroyGLContext(GLContext& glctx)
{
if (!glctx.hglrc)
LOG(ERROR) << "destroyGLContext() called with null gltcx";
LOG(FATAL) << "destroyGLContext() called with null gltcx";
// If this is the current context, release it.
if (wglGetCurrentContext() == glctx.hglrc)
......@@ -109,13 +126,13 @@ static void destroyGLContext(GLContext& glctx)
HWND hwnd = WindowFromDC(glctx.hdc);
if (!hwnd)
LOG(ERROR) << "WindowFromDC() failed";
LOG(FATAL) << "WindowFromDC() failed";
if (!ReleaseDC(hwnd, glctx.hdc))
LOG(ERROR) << "ReleaseDC() failed";
LOG(FATAL) << "ReleaseDC() failed";
if (!wglDeleteContext(glctx.hglrc))
LOG(ERROR) << "wglDeleteContext() failed";
LOG(FATAL) << "wglDeleteContext() failed";
if (!DestroyWindow(hwnd))
LOG(ERROR) << "DestroyWindow() failed";
LOG(FATAL) << "DestroyWindow() failed";
LOG(INFO) << std::hex << std::setfill('0')
<< "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc
......@@ -140,6 +157,7 @@ static void destroyGLContext(GLContext& glctx)
# include <GL/glew.h> // Use system-supplied glew.h
#endif
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include <GL/gl.h>
#include <cuda_gl_interop.h>
......@@ -148,7 +166,6 @@ static void destroyGLContext(GLContext& glctx)
struct GLContext
{
EGLDisplay display;
EGLSurface surface;
EGLContext context;
int glewInitialized;
};
......@@ -158,9 +175,9 @@ struct GLContext
static void setGLContext(GLContext& glctx)
{
if (!glctx.context)
LOG(ERROR) << "setGLContext() called with null gltcx";
LOG(FATAL) << "setGLContext() called with null gltcx";
if (!eglMakeCurrent(glctx.display, glctx.surface, glctx.surface, glctx.context))
if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context))
LOG(ERROR) << "eglMakeCurrent() failed when setting GL context";
if (glctx.glewInitialized)
......@@ -168,7 +185,7 @@ static void setGLContext(GLContext& glctx)
GLenum result = glewInit();
if (result != GLEW_OK)
LOG(ERROR) << "glewInit() failed, return value = " << result;
LOG(FATAL) << "glewInit() failed, return value = " << result;
glctx.glewInitialized = 1;
}
......@@ -178,21 +195,83 @@ static void releaseGLContext(void)
if (display == EGL_NO_DISPLAY)
LOG(WARNING) << "releaseGLContext() called with no active display";
if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT))
LOG(ERROR) << "eglMakeCurrent() failed when releasing GL context";
LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context";
}
static GLContext createGLContext(void)
static EGLDisplay getCudaDisplay(int cudaDeviceIdx)
{
// Initialize.
typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*);
typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*);
typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*);
EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
if (display == EGL_NO_DISPLAY)
LOG(ERROR) << "eglGetDisplay() failed";
eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT");
if (!eglQueryDevicesEXT)
{
LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed";
return 0;
}
eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT");
if (!eglQueryDeviceAttribEXT)
{
LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed";
return 0;
}
eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT");
if (!eglGetPlatformDisplayEXT)
{
LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed";
return 0;
}
int num_devices = 0;
eglQueryDevicesEXT(0, 0, &num_devices);
if (!num_devices)
return 0;
EGLDisplay display = 0;
EGLDeviceEXT* devices = (EGLDeviceEXT*)malloc(num_devices * sizeof(void*));
eglQueryDevicesEXT(num_devices, devices, &num_devices);
for (int i=0; i < num_devices; i++)
{
EGLDeviceEXT device = devices[i];
intptr_t value = -1;
if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx)
{
display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0);
break;
}
}
free(devices);
return display;
}
static GLContext createGLContext(int cudaDeviceIdx)
{
EGLDisplay display = 0;
if (cudaDeviceIdx >= 0)
{
char pciBusId[256] = "";
LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
display = getCudaDisplay(cudaDeviceIdx);
if (!display)
LOG(INFO) << "Failed, falling back to default display";
}
if (!display)
{
display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
if (display == EGL_NO_DISPLAY)
LOG(FATAL) << "eglGetDisplay() failed";
}
EGLint major;
EGLint minor;
if (!eglInitialize(display, &major, &minor))
LOG(ERROR) << "eglInitialize() failed";
LOG(FATAL) << "eglInitialize() failed";
// Choose configuration.
......@@ -211,45 +290,32 @@ static GLContext createGLContext(void)
EGLConfig config;
EGLint num_config;
if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config))
LOG(ERROR) << "eglChooseConfig() failed";
// Create dummy pbuffer surface.
const EGLint surface_attribs[] = {
EGL_WIDTH, 1,
EGL_HEIGHT, 1,
EGL_NONE
};
EGLSurface surface = eglCreatePbufferSurface(display, config, surface_attribs);
if (surface == EGL_NO_SURFACE)
LOG(ERROR) << "eglCreatePbufferSurface() failed";
LOG(FATAL) << "eglChooseConfig() failed";
// Create GL context.
if (!eglBindAPI(EGL_OPENGL_API))
LOG(ERROR) << "eglBindAPI() failed";
LOG(FATAL) << "eglBindAPI() failed";
EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL);
if (context == EGL_NO_CONTEXT)
LOG(ERROR) << "eglCreateContext() failed";
LOG(FATAL) << "eglCreateContext() failed";
// Done.
LOG(INFO) << "EGL " << (int)minor << "." << (int)major << " OpenGL context created (disp: 0x"
<< std::hex << std::setfill('0')
<< std::setw(16) << (uintptr_t)display
<< ", surf: 0x" << std::setw(16) << (uintptr_t)surface
<< ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")";
GLContext glctx = {display, surface, context, 0};
GLContext glctx = {display, context, 0};
return glctx;
}
static void destroyGLContext(GLContext& glctx)
{
if (!glctx.context)
LOG(ERROR) << "destroyGLContext() called with null gltcx";
LOG(FATAL) << "destroyGLContext() called with null gltcx";
// If this is the current context, release it.
if (eglGetCurrentContext() == glctx.context)
......@@ -257,13 +323,10 @@ static void destroyGLContext(GLContext& glctx)
if (!eglDestroyContext(glctx.display, glctx.context))
LOG(ERROR) << "eglDestroyContext() failed";
if (!eglDestroySurface(glctx.display, glctx.surface))
LOG(ERROR) << "eglDestroySurface() failed";
LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x"
<< std::hex << std::setfill('0')
<< std::setw(16) << (uintptr_t)glctx.display
<< ", surf: 0x" << std::setw(16) << (uintptr_t)glctx.surface
<< ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")";
memset(&glctx, 0, sizeof(GLContext));
......
......@@ -76,12 +76,12 @@ static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexS
//------------------------------------------------------------------------
// Shared C++ functions.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s)
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx)
{
// Create GL context and set it current.
s.glctx = createGLContext();
s.glctx = createGLContext(cudaDeviceIdx);
setGLContext(s.glctx);
// Version check.
GLint vMajor = 0;
GLint vMinor = 0;
......@@ -90,7 +90,7 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s)
glGetError(); // Clear possible GL_INVALID_ENUM error in version query.
LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor;
NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required");
// Number of output buffers.
int num_outputs = s.enableDB ? 2 : 1;
......@@ -319,7 +319,7 @@ void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, in
s.width = ROUND_UP(s.width, 32);
s.height = ROUND_UP(s.height, 32);
LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")";
// Allocate color buffers.
for (int i=0; i < num_outputs; i++)
{
......
......@@ -83,7 +83,7 @@ struct RasterizeGLState
//------------------------------------------------------------------------
// Shared C++ code prototypes.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s);
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth);
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth);
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
......
This diff is collapsed.
......@@ -40,7 +40,8 @@ struct TextureKernelParams
{
const float* tex; // Incoming texture buffer.
const float* uv; // Incoming texcoord buffer.
const float* uvDA; // Incoming uv pixel diffs. NULL if mips disabled.
const float* uvDA; // Incoming uv pixel diffs or NULL.
const float* mipLevelBias; // Incoming mip level bias or NULL.
const float* dy; // Incoming output gradient.
float* mip; // Mip data buffer.
float* out; // Outgoing texture data.
......@@ -48,7 +49,8 @@ struct TextureKernelParams
float* gradTexMip; // Temporary texture gradients for mip levels > 0.
float* gradUV; // Outgoing texcoord gradient.
float* gradUVDA; // Outgoing texcoord pixel differential gradient.
int enableMip; // If true, we have uv_da input and mip output tensor.
float* gradMipLevelBias; // Outgoing mip level bias gradient.
int enableMip; // If true, we have uv_da and/or mip_level_bias input(s), and a mip tensor.
int filterMode; // One of the TEX_MODE_ constants.
int boundaryMode; // One of the TEX_BOUNDARY_MODE_ contants.
int texConst; // If true, texture is known to be constant.
......
......@@ -19,7 +19,7 @@ from . import plugin_loader
def _get_gl_opts():
libs = {
'posix': ['GL', 'GLEW'],
'nt': ['gdi32', 'glew32s', 'opengl32', 'user32'],
'nt': ['gdi32', 'glew32s', 'opengl32', 'user32', 'setgpu'],
}
return ['-l' + x for x in libs[os.name]]
......
......@@ -12,11 +12,11 @@
struct RasterizeFwdOp : public OpKernel
{
RasterizeGLState m_glState; // OpenGL-related persistent state.
int m_tri_const; // 1 if triangle array is known to be constant.
int m_tri_const; // 1 if triangle array is known to be constant.
RasterizeFwdOp(OpKernelConstruction* ctx):
OpKernel(ctx)
{
{
memset(&m_glState, 0, sizeof(RasterizeGLState));
OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB));
OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const));
......@@ -48,7 +48,7 @@ struct RasterizeFwdOp : public OpKernel
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
}
// Get output shape.
......@@ -65,12 +65,16 @@ struct RasterizeFwdOp : public OpKernel
// Init context and GL?
bool initCtx = !m_glState.glFBO;
if (initCtx)
rasterizeInitGLContext(ctx, m_glState); // In common/rasterize.inl
{
const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info();
int cudaDeviceIdx = g ? g->gpu_id : -1;
rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp
}
else
setGLContext(m_glState.glctx); // (Re-)Activate GL context.
// Resize all buffers.
rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.inl
rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.cpp
// Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround.
if (initCtx)
......@@ -79,7 +83,7 @@ struct RasterizeFwdOp : public OpKernel
releaseGLContext();
setGLContext(m_glState.glctx);
}
// Copy input data to GL and render.
const float* posPtr = pos.flat<float>().data();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat<int32_t>().data(); // This is in CPU memory.
......@@ -178,7 +182,7 @@ struct RasterizeGradOp : public OpKernel
p.out = out.flat<float>().data();
p.dy = dy.flat<float>().data();
p.ddb = ENABLE_DB ? ddb.flat<float>().data() : 0;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
......
......@@ -45,9 +45,9 @@ def _get_plugin():
# Linker options.
if os.name == 'posix':
ldflags = ['-lGL', '-lGLEW']
ldflags = ['-lGL', '-lGLEW', '-lEGL']
elif os.name == 'nt':
libs = ['gdi32', 'glew32s', 'opengl32', 'user32']
libs = ['gdi32', 'glew32s', 'opengl32', 'user32', 'setgpu']
ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]
# List of source files.
......@@ -103,9 +103,9 @@ def set_log_level(level):
'''Set log level.
Log levels follow the convention on the C++ side of Torch:
0 = Info,
1 = Warning,
2 = Error,
0 = Info,
1 = Warning,
2 = Error,
3 = Fatal.
The default log level is 1.
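Usage sketch (editorial note, not part of the commit; assumes the module is imported as nvdiffrast.torch):

import nvdiffrast.torch as dr
dr.set_log_level(0)  # 0 = Info: also print informational messages from the C++ plugin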
......@@ -121,7 +121,7 @@ def set_log_level(level):
#----------------------------------------------------------------------------
class RasterizeGLContext:
def __init__(self, output_db=True, mode='automatic'):
def __init__(self, output_db=True, mode='automatic', device=None):
'''Create a new OpenGL rasterizer context.
Creating an OpenGL context is a slow operation so you should reuse the same
......@@ -131,7 +131,10 @@ class RasterizeGLContext:
Args:
output_db (bool): Compute and output image-space derivates of barycentrics.
mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'.
device (Optional): Cuda device on which the context is created. Type can be
`torch.device`, string (e.g., `'cuda:1'`), or int. If not
specified, context will be created on currently active Cuda
device.
Returns:
The newly created OpenGL rasterizer context.
'''
......@@ -139,11 +142,16 @@ class RasterizeGLContext:
assert mode in ['automatic', 'manual']
self.output_db = output_db
self.mode = mode
self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic')
if device is None:
cuda_device_idx = torch.cuda.current_device()
else:
with torch.cuda.device(device):
cuda_device_idx = torch.cuda.current_device()
self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)
def set_context(self):
'''Set (activate) OpenGL context in the current CPU thread.
Only available if context was created in manual mode.
Only available if context was created in manual mode.
'''
assert self.mode == 'manual'
self.cpp_wrapper.set_context()
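Usage sketch for the new device argument (editorial note, not part of the commit; device names are illustrative):

import torch
import nvdiffrast.torch as dr

# One rasterizer context per GPU; `device` selects the CUDA device the OpenGL
# context is created on. If omitted, the currently active CUDA device is used.
glctx0 = dr.RasterizeGLContext(output_db=True, device='cuda:0')
glctx1 = dr.RasterizeGLContext(output_db=True, device=torch.device('cuda:1'))
# Inputs passed to rasterize() must reside on the same device as the context used.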
......@@ -316,22 +324,26 @@ def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None):
# Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled.
class _texture_func_mip(torch.autograd.Function):
@staticmethod
def forward(ctx, filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum):
out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
ctx.save_for_backward(tex, uv, uv_da)
def forward(ctx, filter_mode, tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum):
if uv_da is None:
uv_da = torch.tensor([])
if mip_level_bias is None:
mip_level_bias = torch.tensor([])
out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
ctx.save_for_backward(tex, uv, uv_da, mip_level_bias)
ctx.saved_misc = filter_mode, mip, filter_mode_enum, boundary_mode_enum
return out
@staticmethod
def backward(ctx, dy):
tex, uv, uv_da = ctx.saved_variables
tex, uv, uv_da, mip_level_bias = ctx.saved_variables
filter_mode, mip, filter_mode_enum, boundary_mode_enum = ctx.saved_misc
if filter_mode == 'linear-mipmap-linear':
g_tex, g_uv, g_uv_da = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, g_uv_da, None, None, None
g_tex, g_uv, g_uv_da, g_mip_level_bias = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, g_uv_da, g_mip_level_bias, None, None, None
else: # linear-mipmap-nearest
g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, None, None, None, None
g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, None, None, None, None, None
# Linear and nearest: Mipmaps disabled.
class _texture_func(torch.autograd.Function):
......@@ -354,7 +366,7 @@ class _texture_func(torch.autograd.Function):
return None, g_tex, None, None, None
# Op wrapper.
def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
"""Perform texture sampling.
All input tensors must be contiguous and reside in GPU memory. The output tensor
......@@ -364,22 +376,24 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
tex: Texture tensor with dtype `torch.float32`. For 2D textures, must have shape
[minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures,
must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where
tex_width and tex_height are equal. Note that `boundary_mode` must also be set
tex_width and tex_height are equal. Note that `boundary_mode` must also be set
to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis.
uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture,
uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture,
must have shape [minibatch_size, height, width, 2]. When sampling a cube map
texture, must have shape [minibatch_size, height, width, 3].
uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates.
Must have same shape as `uv` except for the last dimension that is to be twice
as long.
mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted,
determines mip level directly. Must have shape [minibatch_size, height, width].
mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call. If not
specified, the mipmap stack is constructed internally and discarded afterwards.
filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
selects 'linear' if `uv_da` is not specified, and 'linear-mipmap-linear'
when `uv_da` is specified, these being the highest-quality modes possible
depending on the availability of the image-space derivatives of the texture
coordinates.
selects 'linear' if neither `uv_da` or `mip_level_bias` is specified, and
'linear-mipmap-linear' when at least one of them is specified, these being
the highest-quality modes possible depending on the availability of the
image-space derivatives of the texture coordinates or direct mip level information.
boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a
cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional
part of texture coordinates. Mode 'clamp' clamps texture coordinates to the
......@@ -395,7 +409,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
# Default filter mode.
if filter_mode == 'auto':
filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear'
filter_mode = 'linear-mipmap-linear' if (uv_da is not None or mip_level_bias is not None) else 'linear'
# Sanitize inputs.
if max_mip_level is None:
......@@ -407,7 +421,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
# Check inputs.
assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor)
if 'mipmap' in filter_mode:
assert isinstance(uv_da, torch.Tensor)
assert isinstance(uv_da, torch.Tensor) or isinstance(mip_level_bias, torch.Tensor)
# If mipping disabled via max level=0, we may as well use simpler filtering internally.
if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']:
......@@ -430,10 +444,10 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
# Choose stub.
if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest':
return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
else:
return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum)
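Usage sketch for the new mip_level_bias input (editorial note, not part of the commit; tensor shapes follow the docstring above, contents are illustrative):

import torch
import nvdiffrast.torch as dr

tex  = torch.rand([1, 256, 256, 3], device='cuda')     # [minibatch, tex_height, tex_width, channels]
uv   = torch.rand([1, 128, 128, 2], device='cuda')     # per-pixel texture coordinates
bias = torch.full([1, 128, 128], 2.0, device='cuda')   # per-pixel mip level, shape [minibatch, height, width]

# With uv_da omitted, mip_level_bias determines the mip level directly, and
# filter_mode='auto' resolves to 'linear-mipmap-linear'.
color = dr.texture(tex, uv, mip_level_bias=bias, boundary_mode='wrap', max_mip_level=4)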
# Mipmap precalculation for cases where the texture stays constant.
def texture_construct_mip(tex, max_mip_level=None, cube_mode=False):
"""Construct a mipmap stack for a texture.
......
......@@ -24,6 +24,7 @@ void AntialiasGradKernel (const AntialiasKernelParams p);
TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(tri));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
......@@ -66,6 +67,7 @@ TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(color));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
......@@ -112,10 +114,10 @@ std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torc
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
p.allocTriangles = topology_hash.size(0) / (4 * AA_HASH_ELEMENTS_PER_TRIANGLE);
// Allocate output tensors.
torch::Tensor out = color.detach().clone(); // Use color as base.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor work_buffer = torch::empty({p.n * p.width * p.height * 8 + 4}, opts); // 8 int for a maximum of two work items per pixel.
p.output = out.data_ptr<float>();
p.workBuffer = (int4*)(work_buffer.data_ptr<float>());
......@@ -153,6 +155,7 @@ std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torc
std::tuple<torch::Tensor, torch::Tensor> antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(color));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
......
......@@ -13,9 +13,10 @@
//------------------------------------------------------------------------
// Op prototypes. Return type macros for readability.
#define OP_RETURN_T torch::Tensor
#define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
#define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
#define OP_RETURN_T torch::Tensor
#define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
#define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
#define OP_RETURN_TTTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges);
OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
......@@ -26,11 +27,11 @@ OP_RETURN_TT interpolate_grad (torch::Tensor attr, tor
OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode);
OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode);
OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TTTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri);
OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash);
OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer);
......@@ -39,7 +40,7 @@ OP_RETURN_TT antialias_grad (torch::Tensor color, to
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// State classes.
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool>())
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
.def("set_context", &RasterizeGLStateWrapper::setContext)
.def("release_context", &RasterizeGLStateWrapper::releaseContext);
pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper");
......@@ -58,8 +59,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op with attribute derivatives");
m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op without attribute derivatives");
m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction");
m.def("texture_fwd", &texture_fwd, "texture forward op with mipmapping and texcoord derivatives");
m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op without mipmapping and texcoord derivatives");
m.def("texture_fwd", &texture_fwd, "texture forward op without mipmapping");
m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op with mipmapping");
m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode");
m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode");
m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode");
......
......@@ -17,7 +17,7 @@
#define __func__ __FUNCTION__
#endif
#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on current GPU device") } while(0)
#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on the same GPU device") } while(0)
#define NVDR_CHECK_CPU(...) do { nvdr_check_cpu({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must reside on CPU"); } while(0)
#define NVDR_CHECK_CONTIGUOUS(...) do { nvdr_check_contiguous({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be contiguous tensors"); } while(0)
#define NVDR_CHECK_F32(...) do { nvdr_check_f32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be float32 tensors"); } while(0)
......
......@@ -41,6 +41,7 @@ static void set_diff_attrs(InterpolateKernelParams& p, bool diff_attrs_all, std:
std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(attr));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
InterpolateKernelParams p = {}; // Initialize all fields to zero.
bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
......@@ -86,6 +87,8 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr,
// Set attribute pixel differential info if enabled, otherwise leave as zero.
if (enable_da)
set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
else
p.numDiffAttr = 0;
// Get input pointers.
p.attr = attr.data_ptr<float>();
......@@ -95,7 +98,7 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr,
p.attrBC = (p.instance_mode && attr.size(0) == 1) ? 1 : 0;
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({p.depth, p.height, p.width, p.numAttr}, opts);
torch::Tensor out_da = torch::empty({p.depth, p.height, p.width, p.numDiffAttr * 2}, opts);
......@@ -133,6 +136,7 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd(torch::Tensor attr, tor
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(attr));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
InterpolateKernelParams p = {}; // Initialize all fields to zero.
bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
......@@ -190,6 +194,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torc
// Set attribute pixel differential info if enabled, otherwise leave as zero.
if (enable_da)
set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
else
p.numDiffAttr = 0;
// Get input pointers.
p.attr = attr.data_ptr<float>();
......@@ -201,7 +207,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torc
p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0;
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor gradAttr = torch::zeros_like(attr);
torch::Tensor gradRaster = torch::empty_like(rast);
torch::Tensor gradRasterDB;
......
......@@ -21,13 +21,14 @@ void RasterizeGradKernelDb(const RasterizeGradParams p);
//------------------------------------------------------------------------
// Python GL state wrapper methods.
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_)
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
{
pState = new RasterizeGLState();
automatic = automatic_;
cudaDeviceIdx = cudaDeviceIdx_;
memset(pState, 0, sizeof(RasterizeGLState));
pState->enableDB = enableDB ? 1 : 0;
rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState);
rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
releaseGLContext();
}
......@@ -52,6 +53,7 @@ void RasterizeGLStateWrapper::releaseContext(void)
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGLState& s = *stateWrapper.pState;
......@@ -62,6 +64,9 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
NVDR_CHECK_F32(pos);
NVDR_CHECK_I32(tri, ranges);
// Check that GL context was created for the correct GPU.
NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must reside on the same device as input tensors");
// Determine number of outputs
int num_outputs = s.enableDB ? 2 : 1;
......@@ -101,7 +106,7 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth);
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({depth, height, width, 4}, opts);
torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts);
float* outputPtr[2];
......@@ -123,6 +128,7 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGradParams p;
bool enable_db = ddb.defined();
......@@ -178,7 +184,7 @@ torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Ten
p.out = out.data_ptr<float>();
p.dy = dy_.data_ptr<float>();
p.ddb = enable_db ? ddb_.data_ptr<float>() : NULL;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
......@@ -209,7 +215,7 @@ torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Ten
// Version without derivatives.
torch::Tensor rasterize_grad(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy)
{
{
torch::Tensor empty_tensor;
return rasterize_grad_db(pos, tri, out, dy, empty_tensor);
}
......
This diff is collapsed.
......@@ -15,7 +15,7 @@ class RasterizeGLState;
class RasterizeGLStateWrapper
{
public:
RasterizeGLStateWrapper (bool enableDB, bool automatic);
RasterizeGLStateWrapper (bool enableDB, bool automatic, int cudaDeviceIdx);
~RasterizeGLStateWrapper (void);
void setContext (void);
......@@ -23,6 +23,7 @@ public:
RasterizeGLState* pState;
bool automatic;
int cudaDeviceIdx;
};
//------------------------------------------------------------------------
......