Commit ce7063f1 authored by Samuli Laine

Support for multiple GPUs, mip bias input for texture op

parent 2468e2a0
......@@ -6,4 +6,4 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
__version__ = '0.2.0'
__version__ = '0.2.1'
......@@ -185,6 +185,8 @@ template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
//------------------------------------------------------------------------
// Coalesced atomics. These are all done via macros.
#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
#define CA_TEMP _ca_temp
#define CA_TEMP_PARAM float* CA_TEMP
#define CA_DECLARE_TEMP(threads_per_block) \
......@@ -228,5 +230,24 @@ template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
caAtomicAdd((ptr)+(idx), (value)); \
} while(0)
//------------------------------------------------------------------------
// Disable atomic coalescing for compute capability lower than 7.x
#else // __CUDA_ARCH__ >= 700
#define CA_TEMP _ca_temp
#define CA_TEMP_PARAM float CA_TEMP
#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
#define CA_SET_GROUP_MASK(group, thread_mask)
#define CA_SET_GROUP(group)
#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
#define caAtomicAdd3_xyw(ptr, x, y, w) \
do { \
atomicAdd((ptr), (x)); \
atomicAdd((ptr)+1, (y)); \
atomicAdd((ptr)+3, (w)); \
} while(0)
#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
#endif // __CUDA_ARCH__ >= 700
//------------------------------------------------------------------------
#endif // __CUDACC__
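For background, these macros implement warp-aggregated atomics: on compute capability 7.x and higher, __match_any_sync() groups the active lanes of a warp by the address they are about to update, and a single lane then performs one atomicAdd() on behalf of its whole group. A minimal sketch of the same idea, shown here as the classic aggregated-increment pattern rather than the per-address float accumulation the macros perform (the function name is illustrative, not part of the library):
// Sketch: one atomic per warp-group of lanes that target the same counter.
// Requires __CUDA_ARCH__ >= 700 for __match_any_sync(); assumes a 1D thread block.
static __device__ __forceinline__ int atomicAggInc(int* ctr)
{
    unsigned int active = __activemask();                                    // lanes currently executing
    unsigned int group  = __match_any_sync(active, (unsigned long long)ctr); // lanes hitting the same address
    int lane   = threadIdx.x & 31;
    int leader = __ffs(group) - 1;                                           // lowest-numbered lane of the group
    int base = 0;
    if (lane == leader)
        base = atomicAdd(ctr, __popc(group));                                // one atomic for the whole group
    base = __shfl_sync(group, base, leader);                                 // broadcast the group's base value
    return base + __popc(group & ((1u << lane) - 1));                        // this lane's offset within the group
}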
......@@ -36,6 +36,7 @@ using namespace tensorflow::shape_inference;
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>
#include <c10/cuda/CUDAGuard.h>
#include <pybind11/numpy.h>
#endif
#define NVDR_CTX_ARGS int _nvdr_ctx_dummy
......
......@@ -37,26 +37,43 @@ struct GLContext
static void setGLContext(GLContext& glctx)
{
if (!glctx.hglrc)
LOG(ERROR) << "setGLContext() called with null glctx";
LOG(FATAL) << "setGLContext() called with null glctx";
if (!wglMakeCurrent(glctx.hdc, glctx.hglrc))
LOG(ERROR) << "wglMakeCurrent() failed when setting GL context";
LOG(FATAL) << "wglMakeCurrent() failed when setting GL context";
if (glctx.glewInitialized)
return;
GLenum result = glewInit();
if (result != GLEW_OK)
LOG(ERROR) << "glewInit() failed, return value = " << result;
LOG(FATAL) << "glewInit() failed, return value = " << result;
glctx.glewInitialized = 1;
}
static void releaseGLContext(void)
{
if (!wglMakeCurrent(NULL, NULL))
LOG(ERROR) << "wglMakeCurrent() failed when releasing GL context";
LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context";
}
static GLContext createGLContext(void)
extern "C" int set_gpu(const char*);
static GLContext createGLContext(int cudaDeviceIdx)
{
if (cudaDeviceIdx >= 0)
{
char pciBusId[256] = "";
LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx) != CUDA_SUCCESS)
{
LOG(INFO) << "PCI bus id query failed";
}
else
{
int res = set_gpu(pciBusId);
LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? "failed, expect crash or major slowdown" : "success");
}
}
HINSTANCE hInstance = GetModuleHandle(NULL);
WNDCLASS wc = {};
wc.style = CS_OWNDC;
......@@ -101,7 +118,7 @@ static GLContext createGLContext(void)
static void destroyGLContext(GLContext& glctx)
{
if (!glctx.hglrc)
LOG(ERROR) << "destroyGLContext() called with null glctx";
LOG(FATAL) << "destroyGLContext() called with null glctx";
// If this is the current context, release it.
if (wglGetCurrentContext() == glctx.hglrc)
......@@ -109,13 +126,13 @@ static void destroyGLContext(GLContext& glctx)
HWND hwnd = WindowFromDC(glctx.hdc);
if (!hwnd)
LOG(ERROR) << "WindowFromDC() failed";
LOG(FATAL) << "WindowFromDC() failed";
if (!ReleaseDC(hwnd, glctx.hdc))
LOG(ERROR) << "ReleaseDC() failed";
LOG(FATAL) << "ReleaseDC() failed";
if (!wglDeleteContext(glctx.hglrc))
LOG(ERROR) << "wglDeleteContext() failed";
LOG(FATAL) << "wglDeleteContext() failed";
if (!DestroyWindow(hwnd))
LOG(ERROR) << "DestroyWindow() failed";
LOG(FATAL) << "DestroyWindow() failed";
LOG(INFO) << std::hex << std::setfill('0')
<< "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc
......@@ -140,6 +157,7 @@ static void destroyGLContext(GLContext& glctx)
# include <GL/glew.h> // Use system-supplied glew.h
#endif
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include <GL/gl.h>
#include <cuda_gl_interop.h>
......@@ -148,7 +166,6 @@ static void destroyGLContext(GLContext& glctx)
struct GLContext
{
EGLDisplay display;
EGLSurface surface;
EGLContext context;
int glewInitialized;
};
......@@ -158,9 +175,9 @@ struct GLContext
static void setGLContext(GLContext& glctx)
{
if (!glctx.context)
LOG(ERROR) << "setGLContext() called with null glctx";
LOG(FATAL) << "setGLContext() called with null glctx";
if (!eglMakeCurrent(glctx.display, glctx.surface, glctx.surface, glctx.context))
if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context))
LOG(ERROR) << "eglMakeCurrent() failed when setting GL context";
if (glctx.glewInitialized)
......@@ -168,7 +185,7 @@ static void setGLContext(GLContext& glctx)
GLenum result = glewInit();
if (result != GLEW_OK)
LOG(ERROR) << "glewInit() failed, return value = " << result;
LOG(FATAL) << "glewInit() failed, return value = " << result;
glctx.glewInitialized = 1;
}
......@@ -178,21 +195,83 @@ static void releaseGLContext(void)
if (display == EGL_NO_DISPLAY)
LOG(WARNING) << "releaseGLContext() called with no active display";
if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT))
LOG(ERROR) << "eglMakeCurrent() failed when releasing GL context";
LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context";
}
static GLContext createGLContext(void)
static EGLDisplay getCudaDisplay(int cudaDeviceIdx)
{
// Initialize.
typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*);
typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*);
typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*);
EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
if (display == EGL_NO_DISPLAY)
LOG(ERROR) << "eglGetDisplay() failed";
eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT");
if (!eglQueryDevicesEXT)
{
LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed";
return 0;
}
eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT");
if (!eglQueryDeviceAttribEXT)
{
LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed";
return 0;
}
eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT");
if (!eglGetPlatformDisplayEXT)
{
LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed";
return 0;
}
int num_devices = 0;
eglQueryDevicesEXT(0, 0, &num_devices);
if (!num_devices)
return 0;
EGLDisplay display = 0;
EGLDeviceEXT* devices = (EGLDeviceEXT*)malloc(num_devices * sizeof(void*));
eglQueryDevicesEXT(num_devices, devices, &num_devices);
for (int i=0; i < num_devices; i++)
{
EGLDeviceEXT device = devices[i];
intptr_t value = -1;
if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx)
{
display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0);
break;
}
}
free(devices);
return display;
}
static GLContext createGLContext(int cudaDeviceIdx)
{
EGLDisplay display = 0;
if (cudaDeviceIdx >= 0)
{
char pciBusId[256] = "";
LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
display = getCudaDisplay(cudaDeviceIdx);
if (!display)
LOG(INFO) << "Failed, falling back to default display";
}
if (!display)
{
display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
if (display == EGL_NO_DISPLAY)
LOG(FATAL) << "eglGetDisplay() failed";
}
EGLint major;
EGLint minor;
if (!eglInitialize(display, &major, &minor))
LOG(ERROR) << "eglInitialize() failed";
LOG(FATAL) << "eglInitialize() failed";
// Choose configuration.
......@@ -211,45 +290,32 @@ static GLContext createGLContext(void)
EGLConfig config;
EGLint num_config;
if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config))
LOG(ERROR) << "eglChooseConfig() failed";
// Create dummy pbuffer surface.
const EGLint surface_attribs[] = {
EGL_WIDTH, 1,
EGL_HEIGHT, 1,
EGL_NONE
};
EGLSurface surface = eglCreatePbufferSurface(display, config, surface_attribs);
if (surface == EGL_NO_SURFACE)
LOG(ERROR) << "eglCreatePbufferSurface() failed";
LOG(FATAL) << "eglChooseConfig() failed";
// Create GL context.
if (!eglBindAPI(EGL_OPENGL_API))
LOG(ERROR) << "eglBindAPI() failed";
LOG(FATAL) << "eglBindAPI() failed";
EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL);
if (context == EGL_NO_CONTEXT)
LOG(ERROR) << "eglCreateContext() failed";
LOG(FATAL) << "eglCreateContext() failed";
// Done.
LOG(INFO) << "EGL " << (int)major << "." << (int)minor << " OpenGL context created (disp: 0x"
<< std::hex << std::setfill('0')
<< std::setw(16) << (uintptr_t)display
<< ", surf: 0x" << std::setw(16) << (uintptr_t)surface
<< ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")";
GLContext glctx = {display, surface, context, 0};
GLContext glctx = {display, context, 0};
return glctx;
}
static void destroyGLContext(GLContext& glctx)
{
if (!glctx.context)
LOG(ERROR) << "destroyGLContext() called with null glctx";
LOG(FATAL) << "destroyGLContext() called with null glctx";
// If this is the current context, release it.
if (eglGetCurrentContext() == glctx.context)
......@@ -257,13 +323,10 @@ static void destroyGLContext(GLContext& glctx)
if (!eglDestroyContext(glctx.display, glctx.context))
LOG(ERROR) << "eglDestroyContext() failed";
if (!eglDestroySurface(glctx.display, glctx.surface))
LOG(ERROR) << "eglDestroySurface() failed";
LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x"
<< std::hex << std::setfill('0')
<< std::setw(16) << (uintptr_t)glctx.display
<< ", surf: 0x" << std::setw(16) << (uintptr_t)glctx.surface
<< ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")";
memset(&glctx, 0, sizeof(GLContext));
......
......@@ -76,12 +76,12 @@ static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexS
//------------------------------------------------------------------------
// Shared C++ functions.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s)
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx)
{
// Create GL context and set it current.
s.glctx = createGLContext();
s.glctx = createGLContext(cudaDeviceIdx);
setGLContext(s.glctx);
// Version check.
GLint vMajor = 0;
GLint vMinor = 0;
......@@ -90,7 +90,7 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s)
glGetError(); // Clear possible GL_INVALID_ENUM error in version query.
LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor;
NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required");
// Number of output buffers.
int num_outputs = s.enableDB ? 2 : 1;
......@@ -319,7 +319,7 @@ void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, in
s.width = ROUND_UP(s.width, 32);
s.height = ROUND_UP(s.height, 32);
LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")";
// Allocate color buffers.
for (int i=0; i < num_outputs; i++)
{
......
......@@ -83,7 +83,7 @@ struct RasterizeGLState
//------------------------------------------------------------------------
// Shared C++ code prototypes.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s);
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth);
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth);
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
......
......@@ -40,7 +40,8 @@ struct TextureKernelParams
{
const float* tex; // Incoming texture buffer.
const float* uv; // Incoming texcoord buffer.
const float* uvDA; // Incoming uv pixel diffs. NULL if mips disabled.
const float* uvDA; // Incoming uv pixel diffs or NULL.
const float* mipLevelBias; // Incoming mip level bias or NULL.
const float* dy; // Incoming output gradient.
float* mip; // Mip data buffer.
float* out; // Outgoing texture data.
......@@ -48,7 +49,8 @@ struct TextureKernelParams
float* gradTexMip; // Temporary texture gradients for mip levels > 0.
float* gradUV; // Outgoing texcoord gradient.
float* gradUVDA; // Outgoing texcoord pixel differential gradient.
int enableMip; // If true, we have uv_da input and mip output tensor.
float* gradMipLevelBias; // Outgoing mip level bias gradient.
int enableMip; // If true, we have uv_da and/or mip_level_bias input(s), and a mip tensor.
int filterMode; // One of the TEX_MODE_ constants.
int boundaryMode; // One of the TEX_BOUNDARY_MODE_ constants.
int texConst; // If true, texture is known to be constant.
......
......@@ -19,7 +19,7 @@ from . import plugin_loader
def _get_gl_opts():
libs = {
'posix': ['GL', 'GLEW'],
'nt': ['gdi32', 'glew32s', 'opengl32', 'user32'],
'nt': ['gdi32', 'glew32s', 'opengl32', 'user32', 'setgpu'],
}
return ['-l' + x for x in libs[os.name]]
......
......@@ -12,11 +12,11 @@
struct RasterizeFwdOp : public OpKernel
{
RasterizeGLState m_glState; // OpenGL-related persistent state.
int m_tri_const; // 1 if triangle array is known to be constant.
int m_tri_const; // 1 if triangle array is known to be constant.
RasterizeFwdOp(OpKernelConstruction* ctx):
OpKernel(ctx)
{
{
memset(&m_glState, 0, sizeof(RasterizeGLState));
OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB));
OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const));
......@@ -48,7 +48,7 @@ struct RasterizeFwdOp : public OpKernel
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
}
// Get output shape.
......@@ -65,12 +65,16 @@ struct RasterizeFwdOp : public OpKernel
// Init context and GL?
bool initCtx = !m_glState.glFBO;
if (initCtx)
rasterizeInitGLContext(ctx, m_glState); // In common/rasterize.inl
{
const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info();
int cudaDeviceIdx = g ? g->gpu_id : -1;
rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp
}
else
setGLContext(m_glState.glctx); // (Re-)Activate GL context.
// Resize all buffers.
rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.inl
rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.cpp
// Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround.
if (initCtx)
......@@ -79,7 +83,7 @@ struct RasterizeFwdOp : public OpKernel
releaseGLContext();
setGLContext(m_glState.glctx);
}
// Copy input data to GL and render.
const float* posPtr = pos.flat<float>().data();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat<int32_t>().data(); // This is in CPU memory.
......@@ -178,7 +182,7 @@ struct RasterizeGradOp : public OpKernel
p.out = out.flat<float>().data();
p.dy = dy.flat<float>().data();
p.ddb = ENABLE_DB ? ddb.flat<float>().data() : 0;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
......
......@@ -45,9 +45,9 @@ def _get_plugin():
# Linker options.
if os.name == 'posix':
ldflags = ['-lGL', '-lGLEW']
ldflags = ['-lGL', '-lGLEW', '-lEGL']
elif os.name == 'nt':
libs = ['gdi32', 'glew32s', 'opengl32', 'user32']
libs = ['gdi32', 'glew32s', 'opengl32', 'user32', 'setgpu']
ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]
# List of source files.
......@@ -103,9 +103,9 @@ def set_log_level(level):
'''Set log level.
Log levels follow the convention on the C++ side of Torch:
0 = Info,
1 = Warning,
2 = Error,
0 = Info,
1 = Warning,
2 = Error,
3 = Fatal.
The default log level is 1.
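A minimal usage sketch of the log level setting on the PyTorch side (the module alias is illustrative):
import nvdiffrast.torch as dr
dr.set_log_level(2)   # 2 = Error: silence info and warning messages from the C++/CUDA side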
......@@ -121,7 +121,7 @@ def set_log_level(level):
#----------------------------------------------------------------------------
class RasterizeGLContext:
def __init__(self, output_db=True, mode='automatic'):
def __init__(self, output_db=True, mode='automatic', device=None):
'''Create a new OpenGL rasterizer context.
Creating an OpenGL context is a slow operation so you should reuse the same
......@@ -131,7 +131,10 @@ class RasterizeGLContext:
Args:
output_db (bool): Compute and output image-space derivatives of barycentrics.
mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'.
device (Optional): Cuda device on which the context is created. Type can be
`torch.device`, string (e.g., `'cuda:1'`), or int. If not
specified, the context will be created on the currently active Cuda
device.
Returns:
The newly created OpenGL rasterizer context.
'''
......@@ -139,11 +142,16 @@ class RasterizeGLContext:
assert mode in ['automatic', 'manual']
self.output_db = output_db
self.mode = mode
self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic')
if device is None:
cuda_device_idx = torch.cuda.current_device()
else:
with torch.cuda.device(device):
cuda_device_idx = torch.cuda.current_device()
self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)
def set_context(self):
'''Set (activate) OpenGL context in the current CPU thread.
Only available if context was created in manual mode.
Only available if context was created in manual mode.
'''
assert self.mode == 'manual'
self.cpp_wrapper.set_context()
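As a hedged usage sketch of the new device argument (assumes a machine with at least two GPUs; tensor contents are illustrative), the OpenGL context is now tied to one CUDA device and the rasterizer inputs must reside on that same device:
import torch
import nvdiffrast.torch as dr

glctx = dr.RasterizeGLContext(device='cuda:1')   # create the GL context on the second GPU

# Clip-space positions [minibatch, num_vertices, 4] and int32 triangles [num_triangles, 3],
# allocated on the same device as the context.
pos = torch.tensor([[[-0.8, -0.8, 0.0, 1.0],
                     [ 0.8, -0.8, 0.0, 1.0],
                     [ 0.0,  0.8, 0.0, 1.0]]], device='cuda:1')
tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device='cuda:1')

rast, _ = dr.rasterize(glctx, pos, tri, resolution=[256, 256])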
......@@ -316,22 +324,26 @@ def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None):
# Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled.
class _texture_func_mip(torch.autograd.Function):
@staticmethod
def forward(ctx, filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum):
out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
ctx.save_for_backward(tex, uv, uv_da)
def forward(ctx, filter_mode, tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum):
if uv_da is None:
uv_da = torch.tensor([])
if mip_level_bias is None:
mip_level_bias = torch.tensor([])
out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
ctx.save_for_backward(tex, uv, uv_da, mip_level_bias)
ctx.saved_misc = filter_mode, mip, filter_mode_enum, boundary_mode_enum
return out
@staticmethod
def backward(ctx, dy):
tex, uv, uv_da = ctx.saved_variables
tex, uv, uv_da, mip_level_bias = ctx.saved_variables
filter_mode, mip, filter_mode_enum, boundary_mode_enum = ctx.saved_misc
if filter_mode == 'linear-mipmap-linear':
g_tex, g_uv, g_uv_da = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, g_uv_da, None, None, None
g_tex, g_uv, g_uv_da, g_mip_level_bias = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, g_uv_da, g_mip_level_bias, None, None, None
else: # linear-mipmap-nearest
g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, None, None, None, None
g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, None, None, None, None, None
# Linear and nearest: Mipmaps disabled.
class _texture_func(torch.autograd.Function):
......@@ -354,7 +366,7 @@ class _texture_func(torch.autograd.Function):
return None, g_tex, None, None, None
# Op wrapper.
def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
"""Perform texture sampling.
All input tensors must be contiguous and reside in GPU memory. The output tensor
......@@ -364,22 +376,24 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
tex: Texture tensor with dtype `torch.float32`. For 2D textures, must have shape
[minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures,
must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where
tex_width and tex_height are equal. Note that `boundary_mode` must also be set
tex_width and tex_height are equal. Note that `boundary_mode` must also be set
to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis.
uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture,
uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture,
must have shape [minibatch_size, height, width, 2]. When sampling a cube map
texture, must have shape [minibatch_size, height, width, 3].
uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates.
Must have same shape as `uv` except for the last dimension that is to be twice
as long.
mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted,
determines mip level directly. Must have shape [minibatch_size, height, width].
mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call. If not
specified, the mipmap stack is constructed internally and discarded afterwards.
filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
selects 'linear' if `uv_da` is not specified, and 'linear-mipmap-linear'
when `uv_da` is specified, these being the highest-quality modes possible
depending on the availability of the image-space derivatives of the texture
coordinates.
selects 'linear' if neither `uv_da` nor `mip_level_bias` is specified, and
'linear-mipmap-linear' when at least one of them is specified, these being
the highest-quality modes possible depending on the availability of the
image-space derivatives of the texture coordinates or direct mip level information.
boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a
cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional
part of texture coordinates. Mode 'clamp' clamps texture coordinates to the
......@@ -395,7 +409,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
# Default filter mode.
if filter_mode == 'auto':
filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear'
filter_mode = 'linear-mipmap-linear' if (uv_da is not None or mip_level_bias is not None) else 'linear'
# Sanitize inputs.
if max_mip_level is None:
......@@ -407,7 +421,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
# Check inputs.
assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor)
if 'mipmap' in filter_mode:
assert isinstance(uv_da, torch.Tensor)
assert isinstance(uv_da, torch.Tensor) or isinstance(mip_level_bias, torch.Tensor)
# If mipping disabled via max level=0, we may as well use simpler filtering internally.
if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']:
......@@ -430,10 +444,10 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
# Choose stub.
if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest':
return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
else:
return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum)
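A hedged usage sketch of the new mip_level_bias input (shapes follow the docstring above; values are illustrative). With no uv_da, the bias selects the mip level directly, and filter_mode 'auto' resolves to 'linear-mipmap-linear':
import torch
import nvdiffrast.torch as dr

tex  = torch.rand(1, 256, 256, 3, device='cuda')        # [minibatch, tex_height, tex_width, channels]
uv   = torch.rand(1, 128, 128, 2, device='cuda')        # per-pixel texture coordinates
bias = torch.full((1, 128, 128), 2.0, device='cuda')    # [minibatch, height, width]: sample two levels down

color = dr.texture(tex, uv, mip_level_bias=bias, max_mip_level=4)   # [1, 128, 128, 3]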
# Mipmap precalculation for cases where the texture stays constant.
def texture_construct_mip(tex, max_mip_level=None, cube_mode=False):
"""Construct a mipmap stack for a texture.
......
......@@ -24,6 +24,7 @@ void AntialiasGradKernel (const AntialiasKernelParams p);
TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(tri));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
......@@ -66,6 +67,7 @@ TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(color));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
......@@ -112,10 +114,10 @@ std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torc
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
p.allocTriangles = topology_hash.size(0) / (4 * AA_HASH_ELEMENTS_PER_TRIANGLE);
// Allocate output tensors.
torch::Tensor out = color.detach().clone(); // Use color as base.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor work_buffer = torch::empty({p.n * p.width * p.height * 8 + 4}, opts); // 8 int for a maximum of two work items per pixel.
p.output = out.data_ptr<float>();
p.workBuffer = (int4*)(work_buffer.data_ptr<float>());
......@@ -153,6 +155,7 @@ std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torc
std::tuple<torch::Tensor, torch::Tensor> antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(color));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
......
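The pair of lines added at the top of each op above is the standard multi-GPU pattern for PyTorch C++ extensions: a device guard switches the current CUDA device to that of an input tensor, and the kernel is then launched on the stream PyTorch associates with that device. A minimal sketch of the pattern in a hypothetical out-of-tree op (the op and the kernel launch are illustrative):
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

torch::Tensor my_op(torch::Tensor x)
{
    const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); // make x's GPU the current device
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();       // PyTorch's stream for that device
    torch::Tensor y = torch::empty_like(x);                       // allocated on the same device as x
    // ... enqueue the CUDA kernel on `stream`, reading x and writing y ...
    return y;
}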
......@@ -13,9 +13,10 @@
//------------------------------------------------------------------------
// Op prototypes. Return type macros for readability.
#define OP_RETURN_T torch::Tensor
#define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
#define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
#define OP_RETURN_T torch::Tensor
#define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
#define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
#define OP_RETURN_TTTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges);
OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
......@@ -26,11 +27,11 @@ OP_RETURN_TT interpolate_grad (torch::Tensor attr, tor
OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode);
OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode);
OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TTTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri);
OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash);
OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer);
......@@ -39,7 +40,7 @@ OP_RETURN_TT antialias_grad (torch::Tensor color, to
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// State classes.
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool>())
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
.def("set_context", &RasterizeGLStateWrapper::setContext)
.def("release_context", &RasterizeGLStateWrapper::releaseContext);
pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper");
......@@ -58,8 +59,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op without attribute derivatives");
m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op with attribute derivatives");
m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction");
m.def("texture_fwd", &texture_fwd, "texture forward op with mipmapping and texcoord derivatives");
m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op without mipmapping and texcoord derivatives");
m.def("texture_fwd", &texture_fwd, "texture forward op without mipmapping");
m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op with mipmapping");
m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode");
m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode");
m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode");
......
......@@ -17,7 +17,7 @@
#define __func__ __FUNCTION__
#endif
#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on current GPU device") } while(0)
#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on the same GPU device") } while(0)
#define NVDR_CHECK_CPU(...) do { nvdr_check_cpu({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must reside on CPU"); } while(0)
#define NVDR_CHECK_CONTIGUOUS(...) do { nvdr_check_contiguous({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be contiguous tensors"); } while(0)
#define NVDR_CHECK_F32(...) do { nvdr_check_f32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be float32 tensors"); } while(0)
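A hedged example of how these checks are typically invoked at the top of an op; the tensor names are illustrative:
NVDR_CHECK_DEVICE(tex, uv, uv_da, mip_level_bias);     // all inputs on the same GPU device
NVDR_CHECK_CONTIGUOUS(tex, uv, uv_da, mip_level_bias); // all inputs contiguous
NVDR_CHECK_F32(tex, uv, uv_da, mip_level_bias);        // all inputs float32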
......
......@@ -41,6 +41,7 @@ static void set_diff_attrs(InterpolateKernelParams& p, bool diff_attrs_all, std:
std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(attr));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
InterpolateKernelParams p = {}; // Initialize all fields to zero.
bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
......@@ -86,6 +87,8 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr,
// Set attribute pixel differential info if enabled, otherwise leave as zero.
if (enable_da)
set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
else
p.numDiffAttr = 0;
// Get input pointers.
p.attr = attr.data_ptr<float>();
......@@ -95,7 +98,7 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr,
p.attrBC = (p.instance_mode && attr.size(0) == 1) ? 1 : 0;
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({p.depth, p.height, p.width, p.numAttr}, opts);
torch::Tensor out_da = torch::empty({p.depth, p.height, p.width, p.numDiffAttr * 2}, opts);
......@@ -133,6 +136,7 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd(torch::Tensor attr, tor
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(attr));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
InterpolateKernelParams p = {}; // Initialize all fields to zero.
bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
......@@ -190,6 +194,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torc
// Set attribute pixel differential info if enabled, otherwise leave as zero.
if (enable_da)
set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
else
p.numDiffAttr = 0;
// Get input pointers.
p.attr = attr.data_ptr<float>();
......@@ -201,7 +207,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torc
p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0;
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor gradAttr = torch::zeros_like(attr);
torch::Tensor gradRaster = torch::empty_like(rast);
torch::Tensor gradRasterDB;
......
......@@ -21,13 +21,14 @@ void RasterizeGradKernelDb(const RasterizeGradParams p);
//------------------------------------------------------------------------
// Python GL state wrapper methods.
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_)
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
{
pState = new RasterizeGLState();
automatic = automatic_;
cudaDeviceIdx = cudaDeviceIdx_;
memset(pState, 0, sizeof(RasterizeGLState));
pState->enableDB = enableDB ? 1 : 0;
rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState);
rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
releaseGLContext();
}
......@@ -52,6 +53,7 @@ void RasterizeGLStateWrapper::releaseContext(void)
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGLState& s = *stateWrapper.pState;
......@@ -62,6 +64,9 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
NVDR_CHECK_F32(pos);
NVDR_CHECK_I32(tri, ranges);
// Check that GL context was created for the correct GPU.
NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must reside on the same device as input tensors");
// Determine number of outputs
int num_outputs = s.enableDB ? 2 : 1;
......@@ -101,7 +106,7 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth);
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({depth, height, width, 4}, opts);
torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts);
float* outputPtr[2];
......@@ -123,6 +128,7 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGradParams p;
bool enable_db = ddb.defined();
......@@ -178,7 +184,7 @@ torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Ten
p.out = out.data_ptr<float>();
p.dy = dy_.data_ptr<float>();
p.ddb = enable_db ? ddb_.data_ptr<float>() : NULL;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
......@@ -209,7 +215,7 @@ torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Ten
// Version without derivatives.
torch::Tensor rasterize_grad(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy)
{
{
torch::Tensor empty_tensor;
return rasterize_grad_db(pos, tri, out, dy, empty_tensor);
}
......
......@@ -15,7 +15,7 @@ class RasterizeGLState;
class RasterizeGLStateWrapper
{
public:
RasterizeGLStateWrapper (bool enableDB, bool automatic);
RasterizeGLStateWrapper (bool enableDB, bool automatic, int cudaDeviceIdx);
~RasterizeGLStateWrapper (void);
void setContext (void);
......@@ -23,6 +23,7 @@ public:
RasterizeGLState* pState;
bool automatic;
int cudaDeviceIdx;
};
//------------------------------------------------------------------------
......