Commit 2f498f23 authored by Samuli Laine

Bug and perf fixes

parent 5a2d5d59
@@ -901,12 +901,15 @@ must have shape [minibatch_size, height, width, 2]. When sampling a cube map
 texture, must have shape [minibatch_size, height, width, 3].</td></tr><tr class="arg"><td class="argname">uv_da</td><td class="arg_short">(Optional) Tensor containing image-space derivatives of texture coordinates.
 Must have same shape as <code>uv</code> except for the last dimension that is to be twice
 as long.</td></tr><tr class="arg"><td class="argname">mip_level_bias</td><td class="arg_short">(Optional) Per-pixel bias for mip level selection. If <code>uv_da</code> is omitted,
-determines mip level directly. Must have shape [minibatch_size, height, width].</td></tr><tr class="arg"><td class="argname">mip</td><td class="arg_short">(Optional) Preconstructed mipmap stack from a <code>texture_construct_mip()</code> call or a list
-of tensors specifying a custom mipmap stack. Gradients of a custom mipmap stack
-are not automatically propagated to base texture but the mipmap tensors will
-receive gradients of their own. If a mipmap stack is not specified but the chosen
-filter mode requires it, the mipmap stack is constructed internally and
-discarded afterwards.</td></tr><tr class="arg"><td class="argname">filter_mode</td><td class="arg_short">Texture filtering mode to be used. Valid values are 'auto', 'nearest',
+determines mip level directly. Must have shape [minibatch_size, height, width].</td></tr><tr class="arg"><td class="argname">mip</td><td class="arg_short">(Optional) Preconstructed mipmap stack from a <code>texture_construct_mip()</code> call, or a list
+of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack,
+the tensors in the list must follow the same format as <code>tex</code> except for width and
+height that must follow the usual rules for mipmap sizes. The base level texture
+is still supplied in <code>tex</code> and must not be included in the list. Gradients of a
+custom mipmap stack are not automatically propagated to base texture but the mipmap
+tensors will receive gradients of their own. If a mipmap stack is not specified
+but the chosen filter mode requires it, the mipmap stack is constructed internally
+and discarded afterwards.</td></tr><tr class="arg"><td class="argname">filter_mode</td><td class="arg_short">Texture filtering mode to be used. Valid values are 'auto', 'nearest',
 'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
 selects 'linear' if neither <code>uv_da</code> or <code>mip_level_bias</code> is specified, and
 'linear-mipmap-linear' when at least one of them is specified, these being
...
@@ -42,7 +42,7 @@ using namespace tensorflow::shape_inference;
 #define NVDR_CTX_ARGS int _nvdr_ctx_dummy
 #define NVDR_CTX_PARAMS 0
 #define NVDR_CHECK(COND, ERR) do { TORCH_CHECK(COND, ERR) } while(0)
-#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; AT_CUDA_CHECK(cudaGetLastError()); } while(0)
+#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; TORCH_CHECK(err == CUDA_SUCCESS, "Cuda error: ", cudaGetLastError(), "[", #CUDA_CALL, ";]"); } while(0)
 #define NVDR_CHECK_GL_ERROR(GL_CALL) do { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); } while(0)
 #endif
...
@@ -168,10 +168,9 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
 int layer_id = v_layer[0];
 int prim_id = gl_PrimitiveIDIn + v_offset[0];
-// Flip z before hw depth test because depth is cleared to zero.
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, -gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex();
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, -gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex();
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, -gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex();
 }
 )
 );
@@ -209,10 +208,9 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
 int layer_id = v_layer[0];
 int prim_id = gl_PrimitiveIDIn + v_offset[0];
-// Flip z before hw depth test because depth is cleared to zero.
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, -gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex();
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, -gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex();
-gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, -gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex();
+gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex();
 }
 )
 );
@@ -262,9 +260,10 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
 NVDR_CHECK_GL_ERROR(glBindFragDataLocation(s.glProgram, 1, "out_db"));
 NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram));
-// Set up rendering mode. Inverted depth so that all buffers can be cleared to zero.
+// Set up depth test.
 NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST));
-NVDR_CHECK_GL_ERROR(glDepthFunc(GL_GEQUAL));
+NVDR_CHECK_GL_ERROR(glDepthFunc(GL_LESS));
+NVDR_CHECK_GL_ERROR(glClearDepth(1.0));
 // Create and bind output buffers. Storage is allocated later.
 NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer));
@@ -375,18 +374,14 @@ void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, co
     NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream));
 }
-// Set viewport, clear color and depth/stencil buffers.
+// Set viewport, clear color buffer(s) and depth/stencil buffer.
 NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height));
-NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glDepthStencilBuffer, 0, 0, 0, 0, width, height, depth, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0));
-NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glColorBuffer[0], 0, 0, 0, 0, width, height, depth, GL_RGBA, GL_FLOAT, 0));
+NVDR_CHECK_GL_ERROR(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT));
-// If outputting bary differentials, clear second output buffer and set resolution uniform
+// If outputting bary differentials, set resolution uniform
 if (s.enableDB)
-{
-    NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glColorBuffer[1], 0, 0, 0, 0, width, height, depth, GL_RGBA, GL_FLOAT, 0));
     NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height));
-}
 // Render the meshes.
 if (depth == 1 && !rangesPtr)
 {
...
@@ -56,6 +56,9 @@ verbose = True # Print status messages to stdout.
 # Internal helper funcs.
 def _find_compiler_bindir():
+    hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+    if hostx64_paths != []:
+        return hostx64_paths[0]
     hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
     if hostx64_paths != []:
         return hostx64_paths[0]
...
@@ -28,7 +28,7 @@ def _get_plugin():
 lib_dir = os.path.dirname(__file__) + r"\..\lib"
 def find_cl_path():
     import glob
-    for edition in ['Professional', 'BuildTools', 'Community']:
+    for edition in ['Enterprise', 'Professional', 'BuildTools', 'Community']:
         paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True)
         if paths:
             return paths[0]
...
@@ -389,12 +389,15 @@ def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='aut
     as long.
 mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted,
     determines mip level directly. Must have shape [minibatch_size, height, width].
-mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call or a list
-    of tensors specifying a custom mipmap stack. Gradients of a custom mipmap stack
-    are not automatically propagated to base texture but the mipmap tensors will
-    receive gradients of their own. If a mipmap stack is not specified but the chosen
-    filter mode requires it, the mipmap stack is constructed internally and
-    discarded afterwards.
+mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call, or a list
+    of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack,
+    the tensors in the list must follow the same format as `tex` except for width and
+    height that must follow the usual rules for mipmap sizes. The base level texture
+    is still supplied in `tex` and must not be included in the list. Gradients of a
+    custom mipmap stack are not automatically propagated to base texture but the mipmap
+    tensors will receive gradients of their own. If a mipmap stack is not specified
+    but the chosen filter mode requires it, the mipmap stack is constructed internally
+    and discarded afterwards.
 filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
     'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
     selects 'linear' if neither `uv_da` or `mip_level_bias` is specified, and
     'linear-mipmap-linear' when at least one of them is specified, these being
@@ -319,14 +319,14 @@ torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor
 NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2");
 if ((p.channels & 3) == 0)
 {
-    for (int i=1; 0 <= p.mipLevelMax; i++)
+    for (int i=0; i <= p.mipLevelMax; i++)
         NVDR_CHECK(!((uintptr_t)p.tex[i] & 15), "tex or mip input tensor not aligned to float4");
     NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4");
     NVDR_CHECK(!((uintptr_t)pmip & 15), "mip input tensor not aligned to float4");
 }
 if ((p.channels & 1) == 0)
 {
-    for (int i=1; 0 <= p.mipLevelMax; i++)
+    for (int i=0; i <= p.mipLevelMax; i++)
         NVDR_CHECK(!((uintptr_t)p.tex[i] & 7), "tex or mip input tensor not aligned to float2");
     NVDR_CHECK(!((uintptr_t)p.out & 7), "out output tensor not aligned to float2");
     NVDR_CHECK(!((uintptr_t)pmip & 7), "mip input tensor not aligned to float2");
...