Commit ce7063f1 authored by Samuli Laine

Support for multiple GPUs, mip bias input for texture op

parent 2468e2a0
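As a quick orientation before the diff, here is a minimal usage sketch of the two additions, assuming the PyTorch bindings (`nvdiffrast.torch`) built from this commit; shapes and device names are illustrative, not from the source:

    import torch
    import nvdiffrast.torch as dr

    # Multiple GPUs: create the OpenGL context on a chosen CUDA device via the new 'device' argument.
    glctx = dr.RasterizeGLContext(output_db=True, device='cuda:1')

    pos = torch.randn(1, 3, 4, device='cuda:1')  # Clip-space vertices, one triangle (illustrative).
    tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device='cuda:1')
    rast, rast_db = dr.rasterize(glctx, pos, tri, resolution=[256, 256])

    # Mip bias: drive mip level selection without uv_da via the new 'mip_level_bias' argument.
    tex = torch.rand(1, 64, 64, 3, device='cuda:1')
    uv = torch.rand(1, 256, 256, 2, device='cuda:1')
    bias = torch.full((1, 256, 256), 1.5, device='cuda:1')  # Per-pixel mip level.
    color = dr.texture(tex, uv, mip_level_bias=bias, filter_mode='linear-mipmap-linear', max_mip_level=4)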
@@ -6,4 +6,4 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
-__version__ = '0.2.0'
+__version__ = '0.2.1'
@@ -185,6 +185,8 @@ template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
 //------------------------------------------------------------------------
 // Coalesced atomics. These are all done via macros.

+#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
+
 #define CA_TEMP _ca_temp
 #define CA_TEMP_PARAM float* CA_TEMP
 #define CA_DECLARE_TEMP(threads_per_block) \
@@ -228,5 +230,24 @@ template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
         caAtomicAdd((ptr)+(idx), (value)); \
     } while(0)

+//------------------------------------------------------------------------
+// Disable atomic coalescing for compute capability lower than 7.x
+
+#else // __CUDA_ARCH__ >= 700
+
+#define CA_TEMP _ca_temp
+#define CA_TEMP_PARAM float CA_TEMP
+#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
+#define CA_SET_GROUP_MASK(group, thread_mask)
+#define CA_SET_GROUP(group)
+#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
+#define caAtomicAdd3_xyw(ptr, x, y, w) \
+    do { \
+        atomicAdd((ptr), (x)); \
+        atomicAdd((ptr)+1, (y)); \
+        atomicAdd((ptr)+3, (w)); \
+    } while(0)
+#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
+
+#endif // __CUDA_ARCH__ >= 700

 //------------------------------------------------------------------------
 #endif // __CUDACC__
@@ -36,6 +36,7 @@ using namespace tensorflow::shape_inference;
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAUtils.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <pybind11/numpy.h>
 #endif

 #define NVDR_CTX_ARGS int _nvdr_ctx_dummy
...
@@ -37,26 +37,43 @@ struct GLContext
 static void setGLContext(GLContext& glctx)
 {
     if (!glctx.hglrc)
-        LOG(ERROR) << "setGLContext() called with null gltcx";
+        LOG(FATAL) << "setGLContext() called with null gltcx";
     if (!wglMakeCurrent(glctx.hdc, glctx.hglrc))
-        LOG(ERROR) << "wglMakeCurrent() failed when setting GL context";
+        LOG(FATAL) << "wglMakeCurrent() failed when setting GL context";
     if (glctx.glewInitialized)
         return;
     GLenum result = glewInit();
     if (result != GLEW_OK)
-        LOG(ERROR) << "glewInit() failed, return value = " << result;
+        LOG(FATAL) << "glewInit() failed, return value = " << result;
     glctx.glewInitialized = 1;
 }

 static void releaseGLContext(void)
 {
     if (!wglMakeCurrent(NULL, NULL))
-        LOG(ERROR) << "wglMakeCurrent() failed when releasing GL context";
+        LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context";
 }

-static GLContext createGLContext(void)
+extern "C" int set_gpu(const char*);
+
+static GLContext createGLContext(int cudaDeviceIdx)
 {
+    if (cudaDeviceIdx >= 0)
+    {
+        char pciBusId[256] = "";
+        LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
+        if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx) != CUDA_SUCCESS)
+        {
+            LOG(INFO) << "PCI bus id query failed";
+        }
+        else
+        {
+            int res = set_gpu(pciBusId);
+            LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? "failed, expect crash or major slowdown" : "success");
+        }
+    }
+
     HINSTANCE hInstance = GetModuleHandle(NULL);
     WNDCLASS wc = {};
     wc.style = CS_OWNDC;
@@ -101,7 +118,7 @@ static GLContext createGLContext(void)
 static void destroyGLContext(GLContext& glctx)
 {
     if (!glctx.hglrc)
-        LOG(ERROR) << "destroyGLContext() called with null gltcx";
+        LOG(FATAL) << "destroyGLContext() called with null gltcx";

     // If this is the current context, release it.
     if (wglGetCurrentContext() == glctx.hglrc)
@@ -109,13 +126,13 @@ static void destroyGLContext(GLContext& glctx)
     HWND hwnd = WindowFromDC(glctx.hdc);
     if (!hwnd)
-        LOG(ERROR) << "WindowFromDC() failed";
+        LOG(FATAL) << "WindowFromDC() failed";
     if (!ReleaseDC(hwnd, glctx.hdc))
-        LOG(ERROR) << "ReleaseDC() failed";
+        LOG(FATAL) << "ReleaseDC() failed";
     if (!wglDeleteContext(glctx.hglrc))
-        LOG(ERROR) << "wglDeleteContext() failed";
+        LOG(FATAL) << "wglDeleteContext() failed";
     if (!DestroyWindow(hwnd))
-        LOG(ERROR) << "DestroyWindow() failed";
+        LOG(FATAL) << "DestroyWindow() failed";

     LOG(INFO) << std::hex << std::setfill('0')
               << "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc
@@ -140,6 +157,7 @@ static void destroyGLContext(GLContext& glctx)
 # include <GL/glew.h> // Use system-supplied glew.h
 #endif
 #include <EGL/egl.h>
+#include <EGL/eglext.h>
 #include <GL/gl.h>
 #include <cuda_gl_interop.h>
@@ -148,7 +166,6 @@ static void destroyGLContext(GLContext& glctx)
 struct GLContext
 {
     EGLDisplay display;
-    EGLSurface surface;
     EGLContext context;
     int glewInitialized;
 };
@@ -158,9 +175,9 @@ struct GLContext
 static void setGLContext(GLContext& glctx)
 {
     if (!glctx.context)
-        LOG(ERROR) << "setGLContext() called with null gltcx";
+        LOG(FATAL) << "setGLContext() called with null gltcx";
-    if (!eglMakeCurrent(glctx.display, glctx.surface, glctx.surface, glctx.context))
+    if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context))
         LOG(ERROR) << "eglMakeCurrent() failed when setting GL context";
     if (glctx.glewInitialized)
@@ -168,7 +185,7 @@ static void setGLContext(GLContext& glctx)
     GLenum result = glewInit();
     if (result != GLEW_OK)
-        LOG(ERROR) << "glewInit() failed, return value = " << result;
+        LOG(FATAL) << "glewInit() failed, return value = " << result;
     glctx.glewInitialized = 1;
 }
@@ -178,21 +195,83 @@ static void releaseGLContext(void)
     if (display == EGL_NO_DISPLAY)
         LOG(WARNING) << "releaseGLContext() called with no active display";
     if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT))
-        LOG(ERROR) << "eglMakeCurrent() failed when releasing GL context";
+        LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context";
 }

-static GLContext createGLContext(void)
+static EGLDisplay getCudaDisplay(int cudaDeviceIdx)
 {
-    // Initialize.
+    typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*);
+    typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*);
+    typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*);

-    EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
+    eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT");
+    if (!eglQueryDevicesEXT)
+    {
+        LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed";
+        return 0;
+    }
+
+    eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT");
+    if (!eglQueryDeviceAttribEXT)
+    {
+        LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed";
+        return 0;
+    }
+
+    eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT");
+    if (!eglGetPlatformDisplayEXT)
+    {
+        LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed";
+        return 0;
+    }
+
+    int num_devices = 0;
+    eglQueryDevicesEXT(0, 0, &num_devices);
+    if (!num_devices)
+        return 0;
+
+    EGLDisplay display = 0;
+    EGLDeviceEXT* devices = (EGLDeviceEXT*)malloc(num_devices * sizeof(void*));
+    eglQueryDevicesEXT(num_devices, devices, &num_devices);
+    for (int i=0; i < num_devices; i++)
+    {
+        EGLDeviceEXT device = devices[i];
+        intptr_t value = -1;
+        if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx)
+        {
+            display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0);
+            break;
+        }
+    }
+
+    free(devices);
+    return display;
+}
+
+static GLContext createGLContext(int cudaDeviceIdx)
+{
+    EGLDisplay display = 0;
+
+    if (cudaDeviceIdx >= 0)
+    {
+        char pciBusId[256] = "";
+        LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
+        display = getCudaDisplay(cudaDeviceIdx);
+        if (!display)
+            LOG(INFO) << "Failed, falling back to default display";
+    }
+
+    if (!display)
+    {
+        display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
         if (display == EGL_NO_DISPLAY)
-            LOG(ERROR) << "eglGetDisplay() failed";
+            LOG(FATAL) << "eglGetDisplay() failed";
+    }

     EGLint major;
     EGLint minor;
     if (!eglInitialize(display, &major, &minor))
-        LOG(ERROR) << "eglInitialize() failed";
+        LOG(FATAL) << "eglInitialize() failed";

     // Choose configuration.
@@ -211,45 +290,32 @@ static GLContext createGLContext(void)
     EGLConfig config;
     EGLint num_config;
     if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config))
-        LOG(ERROR) << "eglChooseConfig() failed";
+        LOG(FATAL) << "eglChooseConfig() failed";

-    // Create dummy pbuffer surface.
-    const EGLint surface_attribs[] = {
-        EGL_WIDTH, 1,
-        EGL_HEIGHT, 1,
-        EGL_NONE
-    };
-    EGLSurface surface = eglCreatePbufferSurface(display, config, surface_attribs);
-    if (surface == EGL_NO_SURFACE)
-        LOG(ERROR) << "eglCreatePbufferSurface() failed";

     // Create GL context.
     if (!eglBindAPI(EGL_OPENGL_API))
-        LOG(ERROR) << "eglBindAPI() failed";
+        LOG(FATAL) << "eglBindAPI() failed";
     EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL);
     if (context == EGL_NO_CONTEXT)
-        LOG(ERROR) << "eglCreateContext() failed";
+        LOG(FATAL) << "eglCreateContext() failed";

     // Done.
     LOG(INFO) << "EGL " << (int)minor << "." << (int)major << " OpenGL context created (disp: 0x"
               << std::hex << std::setfill('0')
               << std::setw(16) << (uintptr_t)display
-              << ", surf: 0x" << std::setw(16) << (uintptr_t)surface
               << ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")";

-    GLContext glctx = {display, surface, context, 0};
+    GLContext glctx = {display, context, 0};
     return glctx;
 }

 static void destroyGLContext(GLContext& glctx)
 {
     if (!glctx.context)
-        LOG(ERROR) << "destroyGLContext() called with null gltcx";
+        LOG(FATAL) << "destroyGLContext() called with null gltcx";

     // If this is the current context, release it.
     if (eglGetCurrentContext() == glctx.context)
@@ -257,13 +323,10 @@ static void destroyGLContext(GLContext& glctx)
     if (!eglDestroyContext(glctx.display, glctx.context))
         LOG(ERROR) << "eglDestroyContext() failed";
-    if (!eglDestroySurface(glctx.display, glctx.surface))
-        LOG(ERROR) << "eglDestroySurface() failed";

     LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x"
               << std::hex << std::setfill('0')
               << std::setw(16) << (uintptr_t)glctx.display
-              << ", surf: 0x" << std::setw(16) << (uintptr_t)glctx.surface
               << ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")";

     memset(&glctx, 0, sizeof(GLContext));
...
@@ -76,10 +76,10 @@ static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexS
 //------------------------------------------------------------------------
 // Shared C++ functions.

-void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s)
+void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx)
 {
     // Create GL context and set it current.
-    s.glctx = createGLContext();
+    s.glctx = createGLContext(cudaDeviceIdx);
     setGLContext(s.glctx);

     // Version check.
...
@@ -83,7 +83,7 @@ struct RasterizeGLState
 //------------------------------------------------------------------------
 // Shared C++ code prototypes.

-void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s);
+void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);
 void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth);
 void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth);
 void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
...
@@ -448,13 +448,16 @@ static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelP
 //------------------------------------------------------------------------
 // Mip level calculation.

-template <bool CUBE_MODE, int FILTER_MODE>
+template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
 static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv)
 {
     // Do nothing if mips not in use.
     if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR)
         return;

+    // Determine mip level based on UV pixel derivatives. If no derivatives are given (mip level bias only), leave as zero.
+    if (!BIAS_ONLY)
+    {
         // Get pixel derivatives of texture coordinates.
         float4 uvDA;
         float3 dvdX, dvdY; // Gradients use these later.
@@ -531,25 +534,24 @@ static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level
             }
         }

-    // Calculate true mip level and clamp.
+        // Finally, calculate mip level.
         flevel = .5f * __log2f(lenMajorSqr);
+    }

+    // Bias the mip level and clamp.
+    if (p.mipLevelBias)
+        flevel += p.mipLevelBias[pidx];
     flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax);

-    if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
-    {
-        // Linear-mipmap-nearest.
-        level0 = __float2int_rn(flevel);
-    }
-    else
-    {
-        // Linear-mipmap-linear.
-        if (flevel > 0.f) // Leave everything at zero if flevel == 0 (magnification)
-        {
+    // Calculate levels depending on filter mode.
     level0 = __float2int_rd(flevel);
+
+    // Leave everything else at zero if flevel == 0 (magnification) or when in linear-mipmap-nearest mode.
+    if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR && flevel > 0.f)
+    {
         level1 = min(level0 + 1, p.mipLevelMax);
         flevel -= level0; // Fractional part. Zero if clamped on last level.
     }
-    }
 }

 //------------------------------------------------------------------------
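Read as scalar math, the updated level selection is: `flevel` from the log2 of the UV-derivative footprint (or zero in bias-only mode), plus the new per-pixel bias, clamped to the valid range, then split into a discrete level and a fraction. A hedged Python sketch of that control flow; function and argument names are illustrative, not from the source:

    import math

    def mip_levels(len_major_sqr, bias, mip_level_max, trilinear):
        # Scalar mirror of the updated calculateMipLevel() logic (sketch).
        flevel = 0.5 * math.log2(len_major_sqr) if len_major_sqr > 0.0 else 0.0  # Zero in bias-only mode.
        flevel += bias                                        # New: additive per-pixel mip level bias.
        flevel = min(max(flevel, 0.0), float(mip_level_max))  # Clamp to [0, mipLevelMax].
        level0 = int(math.floor(flevel))                      # __float2int_rd, now used by both mipmap modes.
        level1, frac = 0, 0.0
        if trilinear and flevel > 0.0:                        # Only linear-mipmap-linear blends two levels.
            level1 = min(level0 + 1, mip_level_max)
            frac = flevel - level0                            # Fractional part; zero if clamped on last level.
        return level0, level1, frac

    # E.g., bias-only mode with bias 1.75 and 4 mip levels:
    # mip_levels(0.0, 1.75, 4, trilinear=True) -> (1, 2, 0.75)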
@@ -672,7 +674,7 @@ __global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTem
 //------------------------------------------------------------------------
 // Forward kernel.

-template <class T, int C, bool CUBE_MODE, int FILTER_MODE>
+template <class T, int C, bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
 static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p)
 {
     // Calculate pixel position.
@@ -714,7 +716,7 @@ static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKer
     float flevel = 0.f; // Fractional level.
     int level0 = 0; // Discrete level 0.
     int level1 = 0; // Discrete level 1.
-    calculateMipLevel<CUBE_MODE, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, 0, 0);
+    calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, 0, 0);

     // Get texel indices and pointer for level 0.
     int4 tc0 = make_int4(0, 0, 0, 0);
@@ -766,30 +768,42 @@ static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKer
 }

 // Template specializations.
-__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }

 //------------------------------------------------------------------------
 // Gradient mip puller kernel.
@@ -856,7 +870,7 @@ __global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTempl
 //------------------------------------------------------------------------
 // Gradient kernel.

-template <bool CUBE_MODE, int FILTER_MODE>
+template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
 static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p)
 {
     // Temporary space for coalesced atomics.
@@ -898,18 +912,28 @@ static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKe
         if (FILTER_MODE != TEX_MODE_NEAREST)
             ((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f);
         if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
+        {
+            if (p.gradUVDA)
             {
                 ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f);
                 ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f);
                 ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f);
             }
+            if (p.gradMipLevelBias)
+                p.gradMipLevelBias[pidx] = 0.f;
+        }
     }
     else
     {
         if (FILTER_MODE != TEX_MODE_NEAREST)
             ((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f);
         if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
+        {
+            if (p.gradUVDA)
                 ((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
+            if (p.gradMipLevelBias)
+                p.gradMipLevelBias[pidx] = 0.f;
+        }
     }
     return;
 }
@@ -944,7 +968,7 @@ static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKe
     float flevel = 0.f; // Fractional level.
     int level0 = 0; // Discrete level 0.
     int level1 = 0; // Discrete level 1.
-    calculateMipLevel<CUBE_MODE, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, &dw, &dfdv);
+    calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, &dw, &dfdv);

     // UV gradient accumulators.
     float gu = 0.f;
@@ -1058,7 +1082,14 @@ static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKe
     else
         ((float2*)p.gradUV)[pidx] = make_float2(gu, gv);

-    // Final UV pixel differential gradients.
+    // Store mip level bias gradient.
+    if (p.gradMipLevelBias)
+        p.gradMipLevelBias[pidx] = df;
+
+    // Store UV pixel differential gradients.
+    if (!BIAS_ONLY)
+    {
+        // Final gradients.
         dw *= df; // dL/(d{s,y}/d{X,Y}) = df/(d{s,y}/d{X,Y}) * dL/df.

         // Store them.
@@ -1073,16 +1104,21 @@ static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKe
         }
         else
             ((float4*)p.gradUVDA)[pidx] = dw;
+    }
 }
 // Template specializations.
-__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_NEAREST>(p); }
+__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR>(p); }
+__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate<false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate<false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate<true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate<true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }

 //------------------------------------------------------------------------
@@ -40,7 +40,8 @@ struct TextureKernelParams
 {
     const float* tex; // Incoming texture buffer.
     const float* uv; // Incoming texcoord buffer.
-    const float* uvDA; // Incoming uv pixel diffs. NULL if mips disabled.
+    const float* uvDA; // Incoming uv pixel diffs or NULL.
+    const float* mipLevelBias; // Incoming mip level bias or NULL.
     const float* dy; // Incoming output gradient.
     float* mip; // Mip data buffer.
     float* out; // Outgoing texture data.
@@ -48,7 +49,8 @@ struct TextureKernelParams
     float* gradTexMip; // Temporary texture gradients for mip levels > 0.
     float* gradUV; // Outgoing texcoord gradient.
     float* gradUVDA; // Outgoing texcoord pixel differential gradient.
-    int enableMip; // If true, we have uv_da input and mip output tensor.
+    float* gradMipLevelBias; // Outgoing mip level bias gradient.
+    int enableMip; // If true, we have uv_da and/or mip_level_bias input(s), and a mip tensor.
     int filterMode; // One of the TEX_MODE_ constants.
     int boundaryMode; // One of the TEX_BOUNDARY_MODE_ contants.
     int texConst; // If true, texture is known to be constant.
...
@@ -19,7 +19,7 @@ from . import plugin_loader
 def _get_gl_opts():
     libs = {
         'posix': ['GL', 'GLEW'],
-        'nt': ['gdi32', 'glew32s', 'opengl32', 'user32'],
+        'nt': ['gdi32', 'glew32s', 'opengl32', 'user32', 'setgpu'],
     }
     return ['-l' + x for x in libs[os.name]]
...
@@ -65,12 +65,16 @@ struct RasterizeFwdOp : public OpKernel
     // Init context and GL?
     bool initCtx = !m_glState.glFBO;
     if (initCtx)
-        rasterizeInitGLContext(ctx, m_glState); // In common/rasterize.inl
+    {
+        const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info();
+        int cudaDeviceIdx = g ? g->gpu_id : -1;
+        rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp
+    }
     else
         setGLContext(m_glState.glctx); // (Re-)Activate GL context.

     // Resize all buffers.
-    rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.inl
+    rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.cpp
     // Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround.
     if (initCtx)
...
@@ -45,9 +45,9 @@ def _get_plugin():
     # Linker options.
     if os.name == 'posix':
-        ldflags = ['-lGL', '-lGLEW']
+        ldflags = ['-lGL', '-lGLEW', '-lEGL']
     elif os.name == 'nt':
-        libs = ['gdi32', 'glew32s', 'opengl32', 'user32']
+        libs = ['gdi32', 'glew32s', 'opengl32', 'user32', 'setgpu']
         ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]

     # List of source files.
@@ -121,7 +121,7 @@ def set_log_level(level):
 #----------------------------------------------------------------------------

 class RasterizeGLContext:
-    def __init__(self, output_db=True, mode='automatic'):
+    def __init__(self, output_db=True, mode='automatic', device=None):
         '''Create a new OpenGL rasterizer context.

         Creating an OpenGL context is a slow operation so you should reuse the same
@@ -131,7 +131,10 @@ class RasterizeGLContext:
         Args:
           output_db (bool): Compute and output image-space derivates of barycentrics.
           mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'.
+          device (Optional): Cuda device on which the context is created. Type can be
+                             `torch.device`, string (e.g., `'cuda:1'`), or int. If not
+                             specified, context will be created on currently active Cuda
+                             device.

         Returns:
           The newly created OpenGL rasterizer context.
         '''
@@ -139,7 +142,12 @@ class RasterizeGLContext:
         assert mode in ['automatic', 'manual']
         self.output_db = output_db
         self.mode = mode
-        self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic')
+        if device is None:
+            cuda_device_idx = torch.cuda.current_device()
+        else:
+            with torch.cuda.device(device):
+                cuda_device_idx = torch.cuda.current_device()
+        self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)

     def set_context(self):
         '''Set (activate) OpenGL context in the current CPU thread.
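The `device` handling above normalizes any accepted device spec to a plain CUDA index by briefly entering that device's scope; a sketch of the same idiom, with a hypothetical helper name:

    import torch

    def _cuda_device_index(device=None):
        # Hypothetical helper mirroring the normalization above: torch.device,
        # 'cuda:N', or int N all resolve to the integer index N; None resolves
        # to the currently active CUDA device.
        if device is None:
            return torch.cuda.current_device()
        with torch.cuda.device(device):
            return torch.cuda.current_device()

    # On a multi-GPU machine, these are all equal:
    # _cuda_device_index(torch.device('cuda:1')) == _cuda_device_index('cuda:1') == _cuda_device_index(1) == 1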
@@ -316,22 +324,26 @@ def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None):
 # Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled.
 class _texture_func_mip(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum):
-        out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
-        ctx.save_for_backward(tex, uv, uv_da)
+    def forward(ctx, filter_mode, tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum):
+        if uv_da is None:
+            uv_da = torch.tensor([])
+        if mip_level_bias is None:
+            mip_level_bias = torch.tensor([])
+        out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
+        ctx.save_for_backward(tex, uv, uv_da, mip_level_bias)
         ctx.saved_misc = filter_mode, mip, filter_mode_enum, boundary_mode_enum
         return out

     @staticmethod
     def backward(ctx, dy):
-        tex, uv, uv_da = ctx.saved_variables
+        tex, uv, uv_da, mip_level_bias = ctx.saved_variables
         filter_mode, mip, filter_mode_enum, boundary_mode_enum = ctx.saved_misc
         if filter_mode == 'linear-mipmap-linear':
-            g_tex, g_uv, g_uv_da = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
-            return None, g_tex, g_uv, g_uv_da, None, None, None
+            g_tex, g_uv, g_uv_da, g_mip_level_bias = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
+            return None, g_tex, g_uv, g_uv_da, g_mip_level_bias, None, None, None
         else: # linear-mipmap-nearest
-            g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
-            return None, g_tex, g_uv, None, None, None, None
+            g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
+            return None, g_tex, g_uv, None, None, None, None, None

 # Linear and nearest: Mipmaps disabled.
 class _texture_func(torch.autograd.Function):
@@ -354,7 +366,7 @@ class _texture_func(torch.autograd.Function):
         return None, g_tex, None, None, None

 # Op wrapper.
-def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
+def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
     """Perform texture sampling.

     All input tensors must be contiguous and reside in GPU memory. The output tensor
@@ -372,14 +384,16 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
         uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates.
                Must have same shape as `uv` except for the last dimension that is to be twice
               as long.
+        mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted,
+                        determines mip level directly. Must have shape [minibatch_size, height, width].
         mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call. If not
              specified, the mipmap stack is constructed internally and discarded afterwards.
         filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
                      'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
-                     selects 'linear' if `uv_da` is not specified, and 'linear-mipmap-linear'
-                     when `uv_da` is specified, these being the highest-quality modes possible
-                     depending on the availability of the image-space derivatives of the texture
-                     coordinates.
+                     selects 'linear' if neither `uv_da` or `mip_level_bias` is specified, and
+                     'linear-mipmap-linear' when at least one of them is specified, these being
+                     the highest-quality modes possible depending on the availability of the
+                     image-space derivatives of the texture coordinates or direct mip level information.
         boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a
                        cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional
                        part of texture coordinates. Mode 'clamp' clamps texture coordinates to the
@@ -395,7 +409,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
     # Default filter mode.
     if filter_mode == 'auto':
-        filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear'
+        filter_mode = 'linear-mipmap-linear' if (uv_da is not None or mip_level_bias is not None) else 'linear'

     # Sanitize inputs.
     if max_mip_level is None:
@@ -407,7 +421,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
     # Check inputs.
     assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor)
     if 'mipmap' in filter_mode:
-        assert isinstance(uv_da, torch.Tensor)
+        assert isinstance(uv_da, torch.Tensor) or isinstance(mip_level_bias, torch.Tensor)

     # If mipping disabled via max level=0, we may as well use simpler filtering internally.
     if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']:
@@ -430,7 +444,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
     # Choose stub.
     if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest':
-        return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
+        return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
     else:
         return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum)
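A hedged usage sketch of the new `mip_level_bias` path, with tensors shaped per the docstring above (all values illustrative); the bias works alone, or combined with `uv_da`, in which case it is added on top of the derivative-based level:

    import torch
    import nvdiffrast.torch as dr

    tex = torch.rand(1, 128, 128, 3, device='cuda')
    uv = torch.rand(1, 256, 256, 2, device='cuda')
    uv_da = torch.zeros(1, 256, 256, 4, device='cuda')
    bias = torch.zeros(1, 256, 256, device='cuda')

    # Bias-only: filter_mode='auto' now resolves to 'linear-mipmap-linear'.
    color = dr.texture(tex, uv, mip_level_bias=bias, max_mip_level=5)

    # Combined: the bias is added on top of the derivative-based mip level.
    color2 = dr.texture(tex, uv, uv_da=uv_da, mip_level_bias=bias, max_mip_level=5)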
...
@@ -24,6 +24,7 @@ void AntialiasGradKernel (const AntialiasKernelParams p);
 TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(tri));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();

     AntialiasKernelParams p = {}; // Initialize all fields to zero.
@@ -66,6 +67,7 @@ TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
 std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(color));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();

     AntialiasKernelParams p = {}; // Initialize all fields to zero.
     p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
@@ -153,6 +155,7 @@ std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torc
 std::tuple<torch::Tensor, torch::Tensor> antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(color));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();

     AntialiasKernelParams p = {}; // Initialize all fields to zero.
     p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
...
...@@ -16,6 +16,7 @@
 #define OP_RETURN_T torch::Tensor
 #define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
 #define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
+#define OP_RETURN_TTTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
 OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges);
 OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
...@@ -26,11 +27,11 @@ OP_RETURN_TT interpolate_grad (torch::Tensor attr, tor
 OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
 TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode);
 OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode);
-OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
+OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
 OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
 OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
-OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
+OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
-OP_RETURN_TTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
+OP_RETURN_TTTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
 TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri);
 OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash);
 OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer);
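These declarations thread the new `mip_level_bias` tensor through every mip-aware texture op; an undefined tensor stands for "not supplied". A hedged caller-side sketch; the shapes are illustrative and the filter/boundary constants (3 for linear-mipmap-linear, 1 for wrap) are assumptions, not taken from this diff:

    #include <torch/torch.h>

    void texture_with_bias_example()
    {
        torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
        torch::Tensor tex  = torch::rand({1, 256, 256, 4}, opts); // [minibatch, H, W, channels]
        torch::Tensor uv   = torch::rand({1, 128, 128, 2}, opts);
        torch::Tensor bias = torch::full({1, 128, 128}, 0.5f, opts); // per-pixel mip level bias
        torch::Tensor none;                                          // undefined == not supplied
        TextureMipWrapper mip = texture_construct_mip(tex, /*max_mip_level=*/-1, /*cube_mode=*/false);
        // Bias-only mip selection: uv_da left undefined, mip_level_bias given.
        torch::Tensor out = texture_fwd_mip(tex, uv, /*uv_da=*/none, /*mip_level_bias=*/bias,
                                            mip, /*filter_mode=*/3, /*boundary_mode=*/1);
    }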
...@@ -39,7 +40,7 @@ OP_RETURN_TT antialias_grad (torch::Tensor color, to
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     // State classes.
-    pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool>())
+    pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
         .def("set_context", &RasterizeGLStateWrapper::setContext)
         .def("release_context", &RasterizeGLStateWrapper::releaseContext);
     pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper");
...@@ -58,8 +59,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op with attribute derivatives");
     m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op without attribute derivatives");
     m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction");
-    m.def("texture_fwd", &texture_fwd, "texture forward op with mipmapping and texcoord derivatives");
+    m.def("texture_fwd", &texture_fwd, "texture forward op without mipmapping");
-    m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op without mipmapping and texcoord derivatives");
+    m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op with mipmapping");
     m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode");
     m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode");
     m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode");
...
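The wrapper constructor now takes a CUDA device index through `pybind11::init<bool, bool, int>()`. A self-contained sketch of the same binding pattern; `DummyState` is a hypothetical stand-in for the real RasterizeGLStateWrapper bound above:

    #include <pybind11/pybind11.h>

    struct DummyState
    {
        DummyState(bool enableDB, bool automatic, int cudaDeviceIdx)
            : enableDB(enableDB), automatic(automatic), cudaDeviceIdx(cudaDeviceIdx) {}
        bool enableDB;
        bool automatic;
        int  cudaDeviceIdx; // which GPU this state object is pinned to
    };

    PYBIND11_MODULE(dummy_module, m)
    {
        // Binding the three-argument constructor exposes the device index
        // to Python, matching the init<bool, bool, int> change above.
        pybind11::class_<DummyState>(m, "DummyState")
            .def(pybind11::init<bool, bool, int>());
    }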
...@@ -17,7 +17,7 @@
 #define __func__ __FUNCTION__
 #endif
-#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on current GPU device") } while(0)
+#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on the same GPU device") } while(0)
 #define NVDR_CHECK_CPU(...) do { nvdr_check_cpu({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must reside on CPU"); } while(0)
 #define NVDR_CHECK_CONTIGUOUS(...) do { nvdr_check_contiguous({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be contiguous tensors"); } while(0)
 #define NVDR_CHECK_F32(...) do { nvdr_check_f32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be float32 tensors"); } while(0)
...
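All of these check macros wrap their body in `do { ... } while(0)` so the expansion behaves as a single statement. A small standalone illustration of why that matters; `CHECK` is a hypothetical stand-in for the NVDR_CHECK_* family:

    #include <cstdio>
    #include <cstdlib>

    // The do/while(0) wrapper makes the macro one statement, so it composes
    // safely with braceless if/else.
    #define CHECK(cond, msg) do { if (!(cond)) { std::fprintf(stderr, "%s\n", msg); std::exit(1); } } while(0)

    int main()
    {
        bool on_same_device = true;
        if (on_same_device)
            CHECK(on_same_device, "inputs must reside on the same GPU device");
        else
            std::printf("skipped\n"); // without the idiom, this else would bind to the macro's if or fail to parse
        return 0;
    }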
...@@ -41,6 +41,7 @@ static void set_diff_attrs(InterpolateKernelParams& p, bool diff_attrs_all, std:
 std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(attr));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     InterpolateKernelParams p = {}; // Initialize all fields to zero.
     bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
...@@ -86,6 +87,8 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr,
     // Set attribute pixel differential info if enabled, otherwise leave as zero.
     if (enable_da)
         set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
+    else
+        p.numDiffAttr = 0;
     // Get input pointers.
     p.attr = attr.data_ptr<float>();
...@@ -133,6 +136,7 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd(torch::Tensor attr, tor
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(attr));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     InterpolateKernelParams p = {}; // Initialize all fields to zero.
     bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
...@@ -190,6 +194,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torc
     // Set attribute pixel differential info if enabled, otherwise leave as zero.
     if (enable_da)
         set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
+    else
+        p.numDiffAttr = 0;
     // Get input pointers.
     p.attr = attr.data_ptr<float>();
...
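Attribute differentials are enabled only when a `rast_db` tensor exists and at least one attribute is selected. A compact sketch of that gating logic; the `Params` struct and the body of the enabled branch are assumptions standing in for InterpolateKernelParams and set_diff_attrs, while the enable expression and the zero fallback come from the diff:

    #include <vector>

    struct Params
    {
        int numDiffAttr = 0; // 0 == attribute differentials disabled
    };

    void configure(Params& p, bool has_rast_db, bool diff_attrs_all, const std::vector<int>& diff_attrs_vec)
    {
        // Differentials need both the rast_db input and a nonempty selection.
        bool enable_da = has_rast_db && (diff_attrs_all || !diff_attrs_vec.empty());
        if (enable_da)
            p.numDiffAttr = (int)diff_attrs_vec.size(); // hypothetical stand-in for set_diff_attrs()
        else
            p.numDiffAttr = 0; // redundant after zero-init, but makes the intent explicit
    }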
...@@ -21,13 +21,14 @@ void RasterizeGradKernelDb(const RasterizeGradParams p);
 //------------------------------------------------------------------------
 // Python GL state wrapper methods.
-RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_)
+RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
 {
     pState = new RasterizeGLState();
     automatic = automatic_;
+    cudaDeviceIdx = cudaDeviceIdx_;
     memset(pState, 0, sizeof(RasterizeGLState));
     pState->enableDB = enableDB ? 1 : 0;
-    rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState);
+    rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
     releaseGLContext();
 }
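Each GL state is now pinned to one CUDA device at construction time. A hedged sketch of the usage this implies, creating one context per visible GPU; the loop and ownership scheme are illustrative, only the constructor signature comes from this commit:

    #include <cuda_runtime.h>
    #include <vector>

    std::vector<RasterizeGLStateWrapper*> create_contexts(bool enableDB)
    {
        int device_count = 0;
        cudaGetDeviceCount(&device_count);
        std::vector<RasterizeGLStateWrapper*> contexts;
        // One GL rasterizer context per GPU; index selects the device the
        // context is created for.
        for (int idx = 0; idx < device_count; ++idx)
            contexts.push_back(new RasterizeGLStateWrapper(enableDB, /*automatic=*/true, idx));
        return contexts;
    }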
...@@ -52,6 +53,7 @@ void RasterizeGLStateWrapper::releaseContext(void)
 std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     RasterizeGLState& s = *stateWrapper.pState;
...@@ -62,6 +64,9 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
     NVDR_CHECK_F32(pos);
     NVDR_CHECK_I32(tri, ranges);
+    // Check that GL context was created for the correct GPU.
+    NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must reside on the same device as input tensors");
     // Determine number of outputs
     int num_outputs = s.enableDB ? 2 : 1;
...@@ -123,6 +128,7 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
 torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     RasterizeGradParams p;
     bool enable_db = ddb.defined();
...
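Because rasterize_fwd now rejects tensors living on a different GPU than its context, multi-GPU callers move inputs first. A minimal sketch; `ctx_device_idx` stands for the wrapper's cudaDeviceIdx member:

    #include <torch/torch.h>

    torch::Tensor to_context_device(torch::Tensor pos, int ctx_device_idx)
    {
        torch::Device target(torch::kCUDA, ctx_device_idx);
        // .to() is a no-op when the tensor is already on the target device,
        // so this is cheap in the common single-GPU case.
        return pos.to(target);
    }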
...@@ -42,6 +42,18 @@ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p);
 void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p);
 void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p);
 void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p);
 void MipGradKernel1 (const TextureKernelParams p);
 void MipGradKernel2 (const TextureKernelParams p);
 void MipGradKernel4 (const TextureKernelParams p);
...@@ -53,6 +65,10 @@ void TextureGradKernelCubeNearest (const TextureKernelParams p);
 void TextureGradKernelCubeLinear (const TextureKernelParams p);
 void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p);
 void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p);
+void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p);
+void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p);
+void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p);
+void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p);
 //------------------------------------------------------------------------
 // Modeselektor.
...@@ -81,6 +97,7 @@ static void set_modes(TextureKernelParams& p, int filter_mode, int boundary_mode
 TextureMipWrapper texture_construct_mip(torch::Tensor tex, int max_mip_level, bool cube_mode)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(tex));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     TextureKernelParams p = {}; // Initialize all fields to zero.
     p.mipLevelLimit = max_mip_level;
...@@ -151,31 +168,46 @@ TextureMipWrapper texture_construct_mip(torch::Tensor tex, int max_mip_level, bo
 //------------------------------------------------------------------------
 // Forward op.
-torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip_wrap, int filter_mode, int boundary_mode)
+torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrap, int filter_mode, int boundary_mode)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(tex));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     TextureKernelParams p = {}; // Initialize all fields to zero.
     torch::Tensor& mip = mip_wrap.mip; // Unwrap.
     int max_mip_level = mip_wrap.max_mip_level;
     set_modes(p, filter_mode, boundary_mode, max_mip_level);
+    // See if we have these tensors or not.
+    bool has_uv_da = uv_da.defined() && uv_da.nbytes();
+    bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes();
     if (p.enableMip)
     {
-        NVDR_CHECK(uv_da.defined(), "mipmapping filter mode requires uv_da input");
+        NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input");
         NVDR_CHECK(mip.defined(), "mipmapping filter mode requires mip tensor input");
     }
     // Check inputs.
+    NVDR_CHECK_DEVICE(tex, uv);
+    NVDR_CHECK_CONTIGUOUS(tex, uv);
+    NVDR_CHECK_F32(tex, uv);
     if (p.enableMip)
     {
-        NVDR_CHECK_DEVICE(tex, uv, uv_da, mip);
-        NVDR_CHECK_CONTIGUOUS(tex, uv, uv_da, mip);
-        NVDR_CHECK_F32(tex, uv, uv_da, mip);
+        NVDR_CHECK_DEVICE(mip);
+        NVDR_CHECK_CONTIGUOUS(mip);
+        NVDR_CHECK_F32(mip);
+        if (has_uv_da)
+        {
+            NVDR_CHECK_DEVICE(uv_da);
+            NVDR_CHECK_CONTIGUOUS(uv_da);
+            NVDR_CHECK_F32(uv_da);
+        }
-    }
-    else
-    {
-        NVDR_CHECK_DEVICE(tex, uv);
-        NVDR_CHECK_CONTIGUOUS(tex, uv);
-        NVDR_CHECK_F32(tex, uv);
+        if (has_mip_level_bias)
+        {
+            NVDR_CHECK_DEVICE(mip_level_bias);
+            NVDR_CHECK_CONTIGUOUS(mip_level_bias);
+            NVDR_CHECK_F32(mip_level_bias);
+        }
     }
     // Sanity checks and state setters.
...@@ -204,17 +236,23 @@ torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor
     p.imgWidth = uv.size(2);
     p.texDepth = tex.size(0);
     if (p.enableMip)
+    {
+        if (has_uv_da)
         {
             if (!cube_mode)
                 NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]");
             else
                 NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode");
         }
+        if (has_mip_level_bias)
+            NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]");
+    }
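The new per-pixel bias input is a [minibatch_size, height, width] float32 tensor, per the shape check above. A hedged sketch constructing a constant-bias tensor of that shape; sizes are illustrative, and the sign convention noted in the comment is the usual mip-bias interpretation rather than something spelled out in this diff:

    #include <torch/torch.h>

    torch::Tensor make_constant_bias(int64_t n, int64_t h, int64_t w, float bias)
    {
        torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
        // A positive bias selects blurrier (higher) mip levels, a negative
        // one sharper (lower) levels.
        return torch::full({n, h, w}, bias, opts);
    }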
     // Get input pointers.
     p.tex = tex.data_ptr<float>();
     p.uv = uv.data_ptr<float>();
-    p.uvDA = p.enableMip ? uv_da.data_ptr<float>() : NULL;
+    p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr<float>() : NULL;
+    p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr<float>() : NULL;
     // Allocate output tensor.
     torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
...@@ -263,8 +301,8 @@ torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor
     dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
     dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
-    // Choose kernel based on filter mode, cube mode, and datatype.
-    void* func_tbl[TEX_MODE_COUNT * 3 * 2] = {
+    // Choose kernel based on filter mode, cube mode, bias-only mode, and datatype.
+    void* func_tbl[TEX_MODE_COUNT * 2 * 2 * 3] = {
         (void*)TextureFwdKernelNearest1,
         (void*)TextureFwdKernelNearest2,
         (void*)TextureFwdKernelNearest4,
...@@ -289,13 +327,39 @@ torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor
         (void*)TextureFwdKernelCubeLinearMipmapLinear1,
         (void*)TextureFwdKernelCubeLinearMipmapLinear2,
         (void*)TextureFwdKernelCubeLinearMipmapLinear4,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        (void*)TextureFwdKernelLinearMipmapNearestBO1,
+        (void*)TextureFwdKernelLinearMipmapNearestBO2,
+        (void*)TextureFwdKernelLinearMipmapNearestBO4,
+        (void*)TextureFwdKernelLinearMipmapLinearBO1,
+        (void*)TextureFwdKernelLinearMipmapLinearBO2,
+        (void*)TextureFwdKernelLinearMipmapLinearBO4,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        (void*)TextureFwdKernelCubeLinearMipmapNearestBO1,
+        (void*)TextureFwdKernelCubeLinearMipmapNearestBO2,
+        (void*)TextureFwdKernelCubeLinearMipmapNearestBO4,
+        (void*)TextureFwdKernelCubeLinearMipmapLinearBO1,
+        (void*)TextureFwdKernelCubeLinearMipmapLinearBO2,
+        (void*)TextureFwdKernelCubeLinearMipmapLinearBO4,
     };
     // Function index.
     int func_idx = p.filterMode;
     if (cube_mode)
-        func_idx += TEX_MODE_COUNT;
+        func_idx += TEX_MODE_COUNT; // Cube variant.
+    if (p.enableMip && !has_uv_da)
+        func_idx += TEX_MODE_COUNT * 2; // Bias-only variant.
-    func_idx = func_idx * 3 + channel_div_idx;
+    func_idx = func_idx * 3 + channel_div_idx; // Choose vector size.
     // Launch kernel.
     NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
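The table is laid out as [bias-only][cube][filter mode][vector width], so the index arithmetic above selects one of TEX_MODE_COUNT * 2 * 2 * 3 slots. A self-contained check of that arithmetic; TEX_MODE_COUNT = 4 is an assumption matching the four filter modes bound earlier (nearest, linear, linear-mipmap-nearest, linear-mipmap-linear):

    #include <cassert>

    constexpr int TEX_MODE_COUNT = 4;

    int func_index(int filter_mode, bool cube_mode, bool bias_only, int channel_div_idx)
    {
        int func_idx = filter_mode;
        if (cube_mode)
            func_idx += TEX_MODE_COUNT;        // Cube variant.
        if (bias_only)
            func_idx += TEX_MODE_COUNT * 2;    // Bias-only variant.
        return func_idx * 3 + channel_div_idx; // 3 vector widths: 1, 2, 4 channels.
    }

    int main()
    {
        // Cube-mapped, bias-only, linear-mipmap-linear, 4-channel variant:
        // (3 + 4 + 8) * 3 + 2 = 47, the last slot of the 48-entry table.
        assert(func_index(3, true, true, 2) == 47);
        return 0;
    }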
...@@ -308,37 +372,52 @@ torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor
 torch::Tensor texture_fwd(torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode)
 {
     torch::Tensor empty_tensor;
-    return texture_fwd_mip(tex, uv, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
+    return texture_fwd_mip(tex, uv, empty_tensor, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
 }
 //------------------------------------------------------------------------
 // Gradient op.
-std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipmap_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip_wrap, int filter_mode, int boundary_mode)
+std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipmap_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrap, int filter_mode, int boundary_mode)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(tex));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     TextureKernelParams p = {}; // Initialize all fields to zero.
     torch::Tensor& mip = mip_wrap.mip; // Unwrap.
     int max_mip_level = mip_wrap.max_mip_level;
     set_modes(p, filter_mode, boundary_mode, max_mip_level);
+    // See if we have these tensors or not.
+    bool has_uv_da = uv_da.defined() && uv_da.nbytes();
+    bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes();
     if (p.enableMip)
     {
-        NVDR_CHECK(uv_da.defined(), "mipmapping filter mode requires uv_da input in gradient");
-        NVDR_CHECK(mip.defined(), "mipmapping filter mode requires mip input in gradient");
+        NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input");
+        NVDR_CHECK(mip.defined(), "mipmapping filter mode requires mip tensor input");
     }
     // Check inputs.
+    NVDR_CHECK_DEVICE(tex, uv);
+    NVDR_CHECK_CONTIGUOUS(tex, uv);
+    NVDR_CHECK_F32(tex, uv);
     if (p.enableMip)
     {
-        NVDR_CHECK_DEVICE(tex, uv, dy, uv_da, mip);
-        NVDR_CHECK_CONTIGUOUS(tex, uv, uv_da, mip);
-        NVDR_CHECK_F32(tex, uv, dy, uv_da, mip);
+        NVDR_CHECK_DEVICE(mip);
+        NVDR_CHECK_CONTIGUOUS(mip);
+        NVDR_CHECK_F32(mip);
+        if (has_uv_da)
+        {
+            NVDR_CHECK_DEVICE(uv_da);
+            NVDR_CHECK_CONTIGUOUS(uv_da);
+            NVDR_CHECK_F32(uv_da);
+        }
-    }
-    else
-    {
-        NVDR_CHECK_DEVICE(tex, uv, dy);
-        NVDR_CHECK_CONTIGUOUS(tex, uv);
-        NVDR_CHECK_F32(tex, uv, dy);
+        if (has_mip_level_bias)
+        {
+            NVDR_CHECK_DEVICE(mip_level_bias);
+            NVDR_CHECK_CONTIGUOUS(mip_level_bias);
+            NVDR_CHECK_F32(mip_level_bias);
+        }
     }
     // Sanity checks and state setters.
...@@ -367,12 +446,17 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
     p.imgWidth = uv.size(2);
     p.texDepth = tex.size(0);
     if (p.enableMip)
+    {
+        if (has_uv_da)
         {
             if (!cube_mode)
                 NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]");
             else
                 NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode");
         }
+        if (has_mip_level_bias)
+            NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]");
+    }
     NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) == p.n && dy.size(1) == p.imgHeight && dy.size(2) == p.imgWidth && dy.size(3) == p.channels, "dy must have shape [minibatch_size, height, width, channels]");
     // Get contiguous version of dy.
...@@ -382,7 +466,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
     p.tex = tex.data_ptr<float>();
     p.uv = uv.data_ptr<float>();
     p.dy = dy_.data_ptr<float>();
-    p.uvDA = p.enableMip ? uv_da.data_ptr<float>() : NULL;
+    p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr<float>() : NULL;
+    p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr<float>() : NULL;
     p.mip = p.enableMip ? (float*)mip.data_ptr<float>() : NULL;
     // Allocate output tensor for tex gradient.
...@@ -392,17 +477,29 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
     // Allocate output tensor for uv gradient.
     torch::Tensor grad_uv;
     torch::Tensor grad_uv_da;
+    torch::Tensor grad_mip_level_bias;
     if (p.filterMode != TEX_MODE_NEAREST)
     {
         grad_uv = torch::empty_like(uv);
         p.gradUV = grad_uv.data_ptr<float>();
-        // Allocate output tensor for uv_da gradient.
+        // Gradients for things affecting mip level.
         if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR)
+        {
+            // Allocate output tensor for uv_da gradient.
+            if (has_uv_da)
             {
                 grad_uv_da = torch::empty_like(uv_da);
                 p.gradUVDA = grad_uv_da.data_ptr<float>();
             }
+            // Allocate output tensor for mip_level_bias gradient.
+            if (has_mip_level_bias)
+            {
+                grad_mip_level_bias = torch::empty_like(mip_level_bias);
+                p.gradMipLevelBias = grad_mip_level_bias.data_ptr<float>();
+            }
+        }
     }
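Gradient outputs that do not apply stay as default-constructed (undefined) tensors, so the returned tuple always has the same arity. A small sketch of how a caller can tell the cases apart:

    #include <torch/torch.h>
    #include <iostream>

    // A default-constructed torch::Tensor is "undefined", which is how the
    // gradient op above signals "no gradient produced for this input".
    void report(const torch::Tensor& grad, const char* name)
    {
        if (grad.defined())
            std::cout << name << ": " << grad.sizes() << "\n";
        else
            std::cout << name << ": not computed\n";
    }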
     // Choose kernel variants based on channel count.
...@@ -457,7 +554,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
     dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
     dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
-    void* func_tbl[TEX_MODE_COUNT * 2] = {
+    void* func_tbl[TEX_MODE_COUNT * 2 * 2] = {
         (void*)TextureGradKernelNearest,
         (void*)TextureGradKernelLinear,
         (void*)TextureGradKernelLinearMipmapNearest,
...@@ -466,12 +563,22 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
         (void*)TextureGradKernelCubeLinear,
         (void*)TextureGradKernelCubeLinearMipmapNearest,
         (void*)TextureGradKernelCubeLinearMipmapLinear,
+        NULL,
+        NULL,
+        (void*)TextureGradKernelLinearMipmapNearestBO,
+        (void*)TextureGradKernelLinearMipmapLinearBO,
+        NULL,
+        NULL,
+        (void*)TextureGradKernelCubeLinearMipmapNearestBO,
+        (void*)TextureGradKernelCubeLinearMipmapLinearBO,
     };
     // Function index.
     int func_idx = p.filterMode;
     if (cube_mode)
-        func_idx += TEX_MODE_COUNT;
+        func_idx += TEX_MODE_COUNT; // Cube variant.
+    if (p.enableMip && !has_uv_da)
+        func_idx += TEX_MODE_COUNT * 2; // Bias-only variant.
     // Launch main gradient kernel.
     NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
...@@ -488,14 +595,14 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
     }
     // Return output tensors.
-    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>(grad_tex, grad_uv, grad_uv_da);
+    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>(grad_tex, grad_uv, grad_uv_da, grad_mip_level_bias);
 }
 // Version for nearest filter mode.
 torch::Tensor texture_grad_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode)
 {
     torch::Tensor empty_tensor;
-    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
+    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
     return std::get<0>(result);
 }
...@@ -503,14 +610,14 @@ torch::Tensor texture_grad_nearest(torch::Tensor tex, torch::Tensor uv, torch::T
 std::tuple<torch::Tensor, torch::Tensor> texture_grad_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode)
 {
     torch::Tensor empty_tensor;
-    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
+    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
     return std::tuple<torch::Tensor, torch::Tensor>(std::get<0>(result), std::get<1>(result));
 }
 // Version for linear-mipmap-nearest mode.
-std::tuple<torch::Tensor, torch::Tensor> texture_grad_linear_mipmap_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode)
+std::tuple<torch::Tensor, torch::Tensor> texture_grad_linear_mipmap_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode)
 {
-    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode, boundary_mode);
+    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode, boundary_mode);
     return std::tuple<torch::Tensor, torch::Tensor>(std::get<0>(result), std::get<1>(result));
 }
...
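The full gradient op now returns four tensors. A hedged sketch of consuming the widened tuple with structured bindings; the function is hypothetical, only the tuple shape comes from the diff:

    #include <torch/torch.h>
    #include <tuple>

    void consume(std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> grads)
    {
        auto [grad_tex, grad_uv, grad_uv_da, grad_mip_level_bias] = std::move(grads);
        // grad_uv_da and grad_mip_level_bias are defined only when the
        // corresponding uv_da / mip_level_bias input was supplied; the
        // unused one stays undefined (see the allocation logic above).
    }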
...@@ -15,7 +15,7 @@ class RasterizeGLState;
 class RasterizeGLStateWrapper
 {
 public:
-    RasterizeGLStateWrapper (bool enableDB, bool automatic);
+    RasterizeGLStateWrapper (bool enableDB, bool automatic, int cudaDeviceIdx);
     ~RasterizeGLStateWrapper (void);
     void setContext (void);
...@@ -23,6 +23,7 @@ public:
     RasterizeGLState* pState;
     bool automatic;
+    int cudaDeviceIdx;
 };
 //------------------------------------------------------------------------
...