Commit 2f498f23 authored by Samuli Laine

Bug and perf fixes

parent 5a2d5d59
@@ -901,12 +901,15 @@ must have shape [minibatch_size, height, width, 2]. When sampling a cube map
 texture, must have shape [minibatch_size, height, width, 3].</td></tr><tr class="arg"><td class="argname">uv_da</td><td class="arg_short">(Optional) Tensor containing image-space derivatives of texture coordinates.
 Must have same shape as <code>uv</code> except for the last dimension that is to be twice
 as long.</td></tr><tr class="arg"><td class="argname">mip_level_bias</td><td class="arg_short">(Optional) Per-pixel bias for mip level selection. If <code>uv_da</code> is omitted,
-determines mip level directly. Must have shape [minibatch_size, height, width].</td></tr><tr class="arg"><td class="argname">mip</td><td class="arg_short">(Optional) Preconstructed mipmap stack from a <code>texture_construct_mip()</code> call or a list
-of tensors specifying a custom mipmap stack. Gradients of a custom mipmap stack
-are not automatically propagated to base texture but the mipmap tensors will
-receive gradients of their own. If a mipmap stack is not specified but the chosen
-filter mode requires it, the mipmap stack is constructed internally and
-discarded afterwards.</td></tr><tr class="arg"><td class="argname">filter_mode</td><td class="arg_short">Texture filtering mode to be used. Valid values are 'auto', 'nearest',
+determines mip level directly. Must have shape [minibatch_size, height, width].</td></tr><tr class="arg"><td class="argname">mip</td><td class="arg_short">(Optional) Preconstructed mipmap stack from a <code>texture_construct_mip()</code> call, or a list
+of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack,
+the tensors in the list must follow the same format as <code>tex</code> except for width and
+height that must follow the usual rules for mipmap sizes. The base level texture
+is still supplied in <code>tex</code> and must not be included in the list. Gradients of a
+custom mipmap stack are not automatically propagated to base texture but the mipmap
+tensors will receive gradients of their own. If a mipmap stack is not specified
+but the chosen filter mode requires it, the mipmap stack is constructed internally
+and discarded afterwards.</td></tr><tr class="arg"><td class="argname">filter_mode</td><td class="arg_short">Texture filtering mode to be used. Valid values are 'auto', 'nearest',
 'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
 selects 'linear' if neither <code>uv_da</code> or <code>mip_level_bias</code> is specified, and
 'linear-mipmap-linear' when at least one of them is specified, these being
...
@@ -42,7 +42,7 @@ using namespace tensorflow::shape_inference;
 #define NVDR_CTX_ARGS int _nvdr_ctx_dummy
 #define NVDR_CTX_PARAMS 0
 #define NVDR_CHECK(COND, ERR) do { TORCH_CHECK(COND, ERR) } while(0)
-#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; AT_CUDA_CHECK(cudaGetLastError()); } while(0)
+#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; TORCH_CHECK(err == CUDA_SUCCESS, "Cuda error: ", cudaGetLastError(), "[", #CUDA_CALL, ";]"); } while(0)
 #define NVDR_CHECK_GL_ERROR(GL_CALL) do { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); } while(0)
 #endif
...
@@ -168,10 +168,9 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
 int layer_id = v_layer[0];
 int prim_id = gl_PrimitiveIDIn + v_offset[0];
-// Flip z before hw depth test because depth is cleared to zero.
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, -gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex();
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, -gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex();
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, -gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex();
 }
 )
 );
@@ -209,10 +208,9 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
 int layer_id = v_layer[0];
 int prim_id = gl_PrimitiveIDIn + v_offset[0];
-// Flip z before hw depth test because depth is cleared to zero.
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, -gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex();
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, -gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex();
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, -gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex();
 }
 )
 );
@@ -262,9 +260,10 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
 NVDR_CHECK_GL_ERROR(glBindFragDataLocation(s.glProgram, 1, "out_db"));
 NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram));
-// Set up rendering mode. Inverted depth so that all buffers can be cleared to zero.
+// Set up depth test.
 NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST));
-NVDR_CHECK_GL_ERROR(glDepthFunc(GL_GEQUAL));
+NVDR_CHECK_GL_ERROR(glDepthFunc(GL_LESS));
+NVDR_CHECK_GL_ERROR(glClearDepth(1.0));
 // Create and bind output buffers. Storage is allocated later.
 NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer));
@@ -375,18 +374,14 @@ void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, co
     NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream));
 }
-// Set viewport, clear color and depth/stencil buffers.
+// Set viewport, clear color buffer(s) and depth/stencil buffer.
 NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height));
-NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glDepthStencilBuffer, 0, 0, 0, 0, width, height, depth, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0));
-NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glColorBuffer[0], 0, 0, 0, 0, width, height, depth, GL_RGBA, GL_FLOAT, 0));
+NVDR_CHECK_GL_ERROR(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT));
-// If outputting bary differentials, clear second output buffer and set resolution uniform
+// If outputting bary differentials, set resolution uniform
 if (s.enableDB)
-{
-    NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glColorBuffer[1], 0, 0, 0, 0, width, height, depth, GL_RGBA, GL_FLOAT, 0));
     NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height));
-}
 // Render the meshes.
 if (depth == 1 && !rangesPtr)
 {
...
@@ -56,6 +56,9 @@ verbose = True # Print status messages to stdout.
 # Internal helper funcs.
 def _find_compiler_bindir():
+    hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+    if hostx64_paths != []:
+        return hostx64_paths[0]
     hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
     if hostx64_paths != []:
         return hostx64_paths[0]
...
@@ -28,7 +28,7 @@ def _get_plugin():
 lib_dir = os.path.dirname(__file__) + r"\..\lib"
 def find_cl_path():
     import glob
-    for edition in ['Professional', 'BuildTools', 'Community']:
+    for edition in ['Enterprise', 'Professional', 'BuildTools', 'Community']:
         paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True)
         if paths:
             return paths[0]
...
@@ -389,12 +389,15 @@ def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='aut
     as long.
 mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted,
     determines mip level directly. Must have shape [minibatch_size, height, width].
-mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call or a list
-    of tensors specifying a custom mipmap stack. Gradients of a custom mipmap stack
-    are not automatically propagated to base texture but the mipmap tensors will
-    receive gradients of their own. If a mipmap stack is not specified but the chosen
-    filter mode requires it, the mipmap stack is constructed internally and
-    discarded afterwards.
+mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call, or a list
+    of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack,
+    the tensors in the list must follow the same format as `tex` except for width and
+    height that must follow the usual rules for mipmap sizes. The base level texture
+    is still supplied in `tex` and must not be included in the list. Gradients of a
+    custom mipmap stack are not automatically propagated to base texture but the mipmap
+    tensors will receive gradients of their own. If a mipmap stack is not specified
+    but the chosen filter mode requires it, the mipmap stack is constructed internally
+    and discarded afterwards.
 filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
     'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
     selects 'linear' if neither `uv_da` or `mip_level_bias` is specified, and
     'linear-mipmap-linear' when at least one of them is specified, these being
@@ -319,14 +319,14 @@ torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor
 NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2");
 if ((p.channels & 3) == 0)
 {
-    for (int i=1; 0 <= p.mipLevelMax; i++)
+    for (int i=0; i <= p.mipLevelMax; i++)
         NVDR_CHECK(!((uintptr_t)p.tex[i] & 15), "tex or mip input tensor not aligned to float4");
     NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4");
     NVDR_CHECK(!((uintptr_t)pmip & 15), "mip input tensor not aligned to float4");
 }
 if ((p.channels & 1) == 0)
 {
-    for (int i=1; 0 <= p.mipLevelMax; i++)
+    for (int i=0; i <= p.mipLevelMax; i++)
         NVDR_CHECK(!((uintptr_t)p.tex[i] & 7), "tex or mip input tensor not aligned to float2");
     NVDR_CHECK(!((uintptr_t)p.out & 7), "out output tensor not aligned to float2");
     NVDR_CHECK(!((uintptr_t)pmip & 7), "mip input tensor not aligned to float2");
...