Commit 1f95925c authored by Samuli Laine

Initial commit

// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "rasterize.h"
#include "glutil.inl"
#include <vector>
#define STRINGIFY_SHADER_SOURCE(x) #x
//------------------------------------------------------------------------
// Helpers.
#define ROUND_UP(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
static int ROUND_UP_BITS(uint32_t x, uint32_t y)
{
// Round x up so that it has at most y bits of mantissa.
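// For example, with y = 2 the values 65..96 round up to 96 (0b1100000) and 97..128 round
// up to 128, i.e., allocation sizes are kept to numbers with at most two significant bits.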
if (x < (1u << y))
return x;
uint32_t m = 0;
while (x & ~m)
m = (m << 1) | 1u;
m >>= y;
if (!(x & m))
return x;
return (x | m) + 1u;
}
//------------------------------------------------------------------------
// GL helpers.
static void compileGLShader(NVDR_CTX_ARGS, GLuint* pShader, GLenum shaderType, const char* src)
{
const char* srcPtr = src;
GLint srcLength = (GLint)strlen(src);
*pShader = 0;
NVDR_CHECK_GL_ERROR(*pShader = glCreateShader(shaderType));
NVDR_CHECK_GL_ERROR(glShaderSource(*pShader, 1, &srcPtr, &srcLength));
NVDR_CHECK_GL_ERROR(glCompileShader(*pShader));
}
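// Note: compile status is not checked here; a failed shader compile typically surfaces
// at link time in constructGLProgram() below, via the program info log.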
static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexShader, GLuint glGeometryShader, GLuint glFragmentShader)
{
*pProgram = 0;
GLuint glProgram = 0;
NVDR_CHECK_GL_ERROR(glProgram = glCreateProgram());
NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glVertexShader));
NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glGeometryShader));
NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glFragmentShader));
NVDR_CHECK_GL_ERROR(glLinkProgram(glProgram));
GLint linkStatus = 0;
NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_LINK_STATUS, &linkStatus));
if (!linkStatus)
{
GLint infoLen = 0;
NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_INFO_LOG_LENGTH, &infoLen));
if (infoLen)
{
const char* hdr = "glLinkProgram() failed:\n";
std::vector<char> info(strlen(hdr) + infoLen);
strcpy(&info[0], hdr);
NVDR_CHECK_GL_ERROR(glGetProgramInfoLog(glProgram, infoLen, &infoLen, &info[strlen(hdr)]));
NVDR_CHECK(0, &info[0]);
}
NVDR_CHECK(0, "glLinkProgram() failed");
}
*pProgram = glProgram;
}
//------------------------------------------------------------------------
// Shared C++ functions.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s)
{
// Create GL context and set it current.
s.glctx = createGLContext();
setGLContext(s.glctx);
// Version check.
GLint vMajor = 0;
GLint vMinor = 0;
glGetIntegerv(GL_MAJOR_VERSION, &vMajor);
glGetIntegerv(GL_MINOR_VERSION, &vMinor);
glGetError(); // Clear possible GL_INVALID_ENUM error in version query.
LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor;
NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required");
// Number of output buffers.
int num_outputs = s.enableDB ? 2 : 1;
// Set up vertex shader.
compileGLShader(NVDR_CTX_PARAMS, &s.glVertexShader, GL_VERTEX_SHADER,
"#version 330\n"
"#extension GL_ARB_shader_draw_parameters : enable\n"
STRINGIFY_SHADER_SOURCE(
layout(location = 0) in vec4 in_pos;
out int v_layer;
out int v_offset;
void main()
{
int layer = gl_DrawIDARB;
gl_Position = in_pos;
v_layer = layer;
v_offset = gl_BaseInstanceARB; // Sneak in TriID offset here.
}
)
);
// Geometry and fragment shaders depend on whether bary differential output is enabled.
if (s.enableDB)
{
// Set up geometry shader. Calculation of per-pixel bary differentials is based on:
// u = (u/w) / (1/w)
// --> du/dX = d((u/w) / (1/w))/dX
// --> du/dX = [d(u/w)/dX - u*d(1/w)/dX] * w
// and we know both d(u/w)/dX and d(1/w)/dX are constant over the triangle.
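// In more detail, with a = u/w and b = 1/w the quotient rule gives
//   du/dX = d(a/b)/dX = [b*da/dX - a*db/dX] / b^2 = [d(u/w)/dX - u*d(1/w)/dX] * w,
// so the per-pixel differentials follow from the (constant) plane equation coefficients.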
compileGLShader(NVDR_CTX_PARAMS, &s.glGeometryShader, GL_GEOMETRY_SHADER,
"#version 430\n"
STRINGIFY_SHADER_SOURCE(
layout(triangles) in;
layout(triangle_strip, max_vertices=3) out;
layout(location = 0) uniform vec2 vp_scale;
in int v_layer[];
in int v_offset[];
out vec4 var_uvzw;
out vec4 var_db;
void main()
{
// Plane equations for bary differentials.
float w0 = gl_in[0].gl_Position.w;
float w1 = gl_in[1].gl_Position.w;
float w2 = gl_in[2].gl_Position.w;
vec2 p0 = gl_in[0].gl_Position.xy;
vec2 p1 = gl_in[1].gl_Position.xy;
vec2 p2 = gl_in[2].gl_Position.xy;
vec2 e0 = p0*w2 - p2*w0;
vec2 e1 = p1*w2 - p2*w1;
float a = e0.x*e1.y - e0.y*e1.x;
// Clamp area to an epsilon to avoid arbitrarily high bary differentials.
float eps = 1e-6f; // ~1 pixel in 1k x 1k image.
float ca = (abs(a) >= eps) ? a : (a < 0.f) ? -eps : eps; // Clamp with sign.
float ia = 1.f / ca; // Inverse area.
vec2 ascl = ia * vp_scale;
float dudx = e1.y * ascl.x;
float dudy = -e1.x * ascl.y;
float dvdx = -e0.y * ascl.x;
float dvdy = e0.x * ascl.y;
float duwdx = w2 * dudx;
float dvwdx = w2 * dvdx;
float duvdx = w0 * dudx + w1 * dvdx;
float duwdy = w2 * dudy;
float dvwdy = w2 * dvdy;
float duvdy = w0 * dudy + w1 * dvdy;
vec4 db0 = vec4(duvdx - dvwdx, duvdy - dvwdy, dvwdx, dvwdy);
vec4 db1 = vec4(duwdx, duwdy, duvdx - duwdx, duvdy - duwdy);
vec4 db2 = vec4(duwdx, duwdy, dvwdx, dvwdy);
int layer_id = v_layer[0];
int prim_id = gl_PrimitiveIDIn + v_offset[0];
// Flip z before hw depth test because depth is cleared to zero.
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, -gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex();
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, -gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex();
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, -gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex();
}
)
);
// Set up fragment shader.
compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShader, GL_FRAGMENT_SHADER,
"#version 330\n"
STRINGIFY_SHADER_SOURCE(
in vec4 var_uvzw;
in vec4 var_db;
in int gl_PrimitiveID;
layout(location = 0) out vec4 out_raster;
layout(location = 1) out vec4 out_db;
void main()
{
out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, float(gl_PrimitiveID + 1));
out_db = var_db * var_uvzw.w;
}
)
);
}
else
{
// Geometry shader without bary differential output.
compileGLShader(NVDR_CTX_PARAMS, &s.glGeometryShader, GL_GEOMETRY_SHADER,
"#version 330\n"
STRINGIFY_SHADER_SOURCE(
layout(triangles) in;
layout(triangle_strip, max_vertices=3) out;
in int v_layer[];
in int v_offset[];
out vec4 var_uvzw;
void main()
{
int layer_id = v_layer[0];
int prim_id = gl_PrimitiveIDIn + v_offset[0];
// Flip z before hw depth test because depth is cleared to zero.
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, -gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex();
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, -gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex();
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, -gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex();
}
)
);
// Fragment shader without bary differential output.
compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShader, GL_FRAGMENT_SHADER,
"#version 330\n"
STRINGIFY_SHADER_SOURCE(
in vec4 var_uvzw;
in int gl_PrimitiveID;
layout(location = 0) out vec4 out_raster;
void main()
{
out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, float(gl_PrimitiveID + 1));
}
)
);
}
// Finalize program.
constructGLProgram(NVDR_CTX_PARAMS, &s.glProgram, s.glVertexShader, s.glGeometryShader, s.glFragmentShader);
// Construct main fbo and bind permanently.
NVDR_CHECK_GL_ERROR(glGenFramebuffers(1, &s.glFBO));
NVDR_CHECK_GL_ERROR(glBindFramebuffer(GL_FRAMEBUFFER, s.glFBO));
// Enable color attachments: one, or two when bary differentials are output.
GLenum draw_buffers[2] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 };
NVDR_CHECK_GL_ERROR(glDrawBuffers(num_outputs, draw_buffers));
// Construct vertex array object.
NVDR_CHECK_GL_ERROR(glGenVertexArrays(1, &s.glVAO));
NVDR_CHECK_GL_ERROR(glBindVertexArray(s.glVAO));
// Construct position buffer, bind permanently, enable, set ptr.
NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glPosBuffer));
NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ARRAY_BUFFER, s.glPosBuffer));
NVDR_CHECK_GL_ERROR(glEnableVertexAttribArray(0));
NVDR_CHECK_GL_ERROR(glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, 0));
// Construct index buffer and bind permanently.
NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glTriBuffer));
NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, s.glTriBuffer));
// Bind color outputs and activate program.
NVDR_CHECK_GL_ERROR(glBindFragDataLocation(s.glProgram, 0, "out_raster"));
NVDR_CHECK_GL_ERROR(glBindFragDataLocation(s.glProgram, 1, "out_db"));
NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram));
// Set up rendering mode. Inverted depth so that all buffers can be cleared to zero.
NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST));
NVDR_CHECK_GL_ERROR(glDepthFunc(GL_GEQUAL));
// Create and bind output buffers. Storage is allocated later.
NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer));
for (int i=0; i < num_outputs; i++)
{
NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, s.glColorBuffer[i], 0));
}
// Create and bind depth/stencil buffer. Storage is allocated later.
NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glDepthStencilBuffer));
NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, s.glDepthStencilBuffer, 0));
}
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth)
{
// Resize vertex buffer?
if (posCount > s.posCount)
{
if (s.cudaPosBuffer)
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer));
s.posCount = (posCount > 64) ? ROUND_UP_BITS(posCount, 2) : 64;
LOG(INFO) << "Increasing position buffer size to " << s.posCount << " float32";
NVDR_CHECK_GL_ERROR(glBufferData(GL_ARRAY_BUFFER, s.posCount * sizeof(float), NULL, GL_DYNAMIC_DRAW));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaPosBuffer, s.glPosBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
}
// Resize triangle buffer?
if (triCount > s.triCount)
{
if (s.cudaTriBuffer)
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer));
s.triCount = (triCount > 64) ? ROUND_UP_BITS(triCount, 2) : 64;
LOG(INFO) << "Increasing triangle buffer size to " << s.triCount << " int32";
NVDR_CHECK_GL_ERROR(glBufferData(GL_ELEMENT_ARRAY_BUFFER, s.triCount * sizeof(int32_t), NULL, GL_DYNAMIC_DRAW));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaTriBuffer, s.glTriBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
}
// Resize framebuffer?
if (width > s.width || height > s.height || depth > s.depth)
{
int num_outputs = s.enableDB ? 2 : 1;
if (s.cudaColorBuffer[0])
for (int i=0; i < num_outputs; i++)
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i]));
// New framebuffer size.
s.width = (width > s.width) ? width : s.width;
s.height = (height > s.height) ? height : s.height;
s.depth = (depth > s.depth) ? depth : s.depth;
s.width = ROUND_UP(s.width, 32);
s.height = ROUND_UP(s.height, 32);
LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")";
// Allocate color buffers.
for (int i=0; i < num_outputs; i++)
{
NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0));
}
// Allocate depth/stencil buffer.
NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_DEPTH24_STENCIL8, s.width, s.height, s.depth, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0));
// (Re-)register all GL buffers into Cuda.
for (int i=0; i < num_outputs; i++)
NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaColorBuffer[i], s.glColorBuffer[i], GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly));
}
// Resize range arrays?
if ((unsigned int)depth > s.drawCmdBuffer.size())
{
int newSize = (depth > 64) ? ROUND_UP_BITS(depth, 1) : 64;
LOG(INFO) << "Increasing range array size to " << newSize << " elements";
s.drawCmdBuffer.resize(newSize);
}
}
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth)
{
if (triPtr)
{
// Copy both position and triangle buffers.
void* glPosPtr = NULL;
void* glTriPtr = NULL;
size_t posBytes = 0;
size_t triBytes = 0;
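// Note: mapping two resources starting from &s.cudaPosBuffer relies on cudaPosBuffer
// and cudaTriBuffer being declared consecutively in the RasterizeGLState struct.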
NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(2, &s.cudaPosBuffer, stream));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glTriPtr, &triBytes, s.cudaTriBuffer));
NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
NVDR_CHECK(triBytes >= triCount * sizeof(int32_t), "mapped GL triangle buffer size mismatch");
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glTriPtr, triPtr, triCount * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(2, &s.cudaPosBuffer, stream));
}
else
{
// Copy position buffer only. Triangles are already copied and known to be constant.
void* glPosPtr = NULL;
size_t posBytes = 0;
NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(1, &s.cudaPosBuffer, stream));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream));
}
// Set viewport, clear color and depth/stencil buffers.
NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height));
NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glDepthStencilBuffer, 0, 0, 0, 0, width, height, depth, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0));
NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glColorBuffer[0], 0, 0, 0, 0, width, height, depth, GL_RGBA, GL_FLOAT, 0));
// If outputting bary differentials, clear second output buffer and set resolution uniform
if (s.enableDB)
{
NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glColorBuffer[1], 0, 0, 0, 0, width, height, depth, GL_RGBA, GL_FLOAT, 0));
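// Location 0 is the vp_scale uniform of the geometry shader; the values 2/width and
// 2/height convert NDC extents to per-pixel scale for the bary differentials.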
NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height));
}
// Render the meshes.
if (depth == 1 && !rangesPtr)
{
// Trivial case.
NVDR_CHECK_GL_ERROR(glDrawElements(GL_TRIANGLES, triCount, GL_UNSIGNED_INT, 0));
}
else
{
if (!rangesPtr)
{
// Fill in the range array to instantiate the same triangles for each output layer.
// Triangle IDs start at zero for each layer (the shader outputs them offset by one),
// so they correspond to the first dimension when addressing the triangle array.
for (int i=0; i < depth; i++)
{
GLDrawCmd& cmd = s.drawCmdBuffer[i];
cmd.firstIndex = 0;
cmd.count = triCount;
cmd.baseVertex = vtxPerInstance * i;
cmd.baseInstance = 0;
cmd.instanceCount = 1;
}
}
else
{
// Fill in the range array according to the user-given ranges. Triangle IDs point
// into the input triangle array, NOT to the index within the range, so they
// correspond to the first dimension when addressing the triangle array.
for (int i=0, j=0; i < depth; i++)
{
GLDrawCmd& cmd = s.drawCmdBuffer[i];
int first = rangesPtr[j++];
int count = rangesPtr[j++];
NVDR_CHECK(first >= 0 && count >= 0, "range contains negative values");
NVDR_CHECK((first + count) * 3 <= triCount, "range extends beyond end of triangle buffer");
cmd.firstIndex = first * 3;
cmd.count = count * 3;
cmd.baseVertex = 0;
cmd.baseInstance = first;
cmd.instanceCount = 1;
}
}
// Draw!
NVDR_CHECK_GL_ERROR(glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &s.drawCmdBuffer[0], depth, sizeof(GLDrawCmd)));
}
}
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth)
{
// Copy color buffers to output tensors.
cudaArray_t array = 0;
cudaChannelFormatDesc arrayDesc = {}; // For error checking.
cudaExtent arrayExt = {}; // For error checking.
int num_outputs = s.enableDB ? 2 : 1;
NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(num_outputs, s.cudaColorBuffer, stream));
for (int i=0; i < num_outputs; i++)
{
NVDR_CHECK_CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, s.cudaColorBuffer[i], 0, 0));
NVDR_CHECK_CUDA_ERROR(cudaArrayGetInfo(&arrayDesc, &arrayExt, NULL, array));
NVDR_CHECK(arrayDesc.f == cudaChannelFormatKindFloat, "CUDA mapped array data kind mismatch");
NVDR_CHECK(arrayDesc.x == 32 && arrayDesc.y == 32 && arrayDesc.z == 32 && arrayDesc.w == 32, "CUDA mapped array data width mismatch");
NVDR_CHECK(arrayExt.width >= width && arrayExt.height >= height && arrayExt.depth >= depth, "CUDA mapped array extent mismatch");
cudaMemcpy3DParms p = {0};
p.srcArray = array;
p.dstPtr.ptr = outputPtr[i];
p.dstPtr.pitch = width * 4 * sizeof(float);
p.dstPtr.xsize = width;
p.dstPtr.ysize = height;
p.extent.width = width;
p.extent.height = height;
p.extent.depth = depth;
p.kind = cudaMemcpyDeviceToDevice;
NVDR_CHECK_CUDA_ERROR(cudaMemcpy3DAsync(&p, stream));
}
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(num_outputs, s.cudaColorBuffer, stream));
}
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "common.h"
#include "rasterize.h"
//------------------------------------------------------------------------
// Gradient Cuda kernel.
template <bool ENABLE_DB>
static __forceinline__ __device__ void RasterizeGradKernelTemplate(const RasterizeGradParams p)
{
// Temporary space for coalesced atomics.
CA_DECLARE_TEMP(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH * RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.width || py >= p.height || pz >= p.depth)
return;
// Pixel index.
int pidx = px + p.width * (py + p.height * pz);
// Read triangle idx and dy.
float2 dy = ((float2*)p.dy)[pidx * 2];
float4 ddb = ENABLE_DB ? ((float4*)p.ddb)[pidx] : make_float4(0.f, 0.f, 0.f, 0.f);
int triIdx = (int)(((float*)p.out)[pidx * 4 + 3]) - 1;
// Exit if nothing to do.
if (triIdx < 0 || triIdx >= p.numTriangles)
return; // No or corrupt triangle.
int grad_all_dy = __float_as_int(dy.x) | __float_as_int(dy.y); // Bitwise OR of all incoming gradients.
int grad_all_ddb = 0;
if (ENABLE_DB)
grad_all_ddb = __float_as_int(ddb.x) | __float_as_int(ddb.y) | __float_as_int(ddb.z) | __float_as_int(ddb.w);
if (((grad_all_dy | grad_all_ddb) << 1) == 0)
return; // All incoming gradients are +0/-0.
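// (Shifting out the sign bit above makes -0.0f compare equal to +0.0f.)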
// Fetch vertex indices.
int vi0 = p.tri[triIdx * 3 + 0];
int vi1 = p.tri[triIdx * 3 + 1];
int vi2 = p.tri[triIdx * 3 + 2];
// Bail out if vertex indices are corrupt.
if (vi0 < 0 || vi0 >= p.numVertices ||
vi1 < 0 || vi1 >= p.numVertices ||
vi2 < 0 || vi2 >= p.numVertices)
return;
// In instance mode, adjust vertex indices by minibatch index.
if (p.instance_mode)
{
vi0 += pz * p.numVertices;
vi1 += pz * p.numVertices;
vi2 += pz * p.numVertices;
}
// Initialize coalesced atomics.
CA_SET_GROUP(triIdx);
// Fetch vertex positions.
float4 p0 = ((float4*)p.pos)[vi0];
float4 p1 = ((float4*)p.pos)[vi1];
float4 p2 = ((float4*)p.pos)[vi2];
// Evaluate edge functions.
float fx = p.xs * (float)px + p.xo;
float fy = p.ys * (float)py + p.yo;
float p0x = p0.x - fx * p0.w;
float p0y = p0.y - fy * p0.w;
float p1x = p1.x - fx * p1.w;
float p1y = p1.y - fy * p1.w;
float p2x = p2.x - fx * p2.w;
float p2y = p2.y - fy * p2.w;
float a0 = p1x*p2y - p1y*p2x;
float a1 = p2x*p0y - p2y*p0x;
float a2 = p0x*p1y - p0y*p1x;
// Compute inverse area with epsilon.
float at = a0 + a1 + a2;
float ep = copysignf(1e-6f, at); // ~1 pixel in 1k x 1k image.
float iw = 1.f / (at + ep);
// Perspective correct, normalized barycentrics.
float b0 = a0 * iw;
float b1 = a1 * iw;
// Position gradients.
float gb0 = dy.x * iw;
float gb1 = dy.y * iw;
float gbb = gb0 * b0 + gb1 * b1;
float gp0x = gbb * (p2y - p1y) - gb1 * p2y;
float gp1x = gbb * (p0y - p2y) + gb0 * p2y;
float gp2x = gbb * (p1y - p0y) - gb0 * p1y + gb1 * p0y;
float gp0y = gbb * (p1x - p2x) + gb1 * p2x;
float gp1y = gbb * (p2x - p0x) - gb0 * p2x;
float gp2y = gbb * (p0x - p1x) + gb0 * p1x - gb1 * p0x;
float gp0w = -fx * gp0x - fy * gp0y;
float gp1w = -fx * gp1x - fy * gp1y;
float gp2w = -fx * gp2x - fy * gp2y;
// Bary differential gradients.
if (ENABLE_DB && ((grad_all_ddb) << 1) != 0)
{
float dfxdX = p.xs * iw;
float dfydY = p.ys * iw;
ddb.x *= dfxdX;
ddb.y *= dfydY;
ddb.z *= dfxdX;
ddb.w *= dfydY;
float da0dX = p1.y * p2.w - p2.y * p1.w;
float da1dX = p2.y * p0.w - p0.y * p2.w;
float da2dX = p0.y * p1.w - p1.y * p0.w;
float da0dY = p2.x * p1.w - p1.x * p2.w;
float da1dY = p0.x * p2.w - p2.x * p0.w;
float da2dY = p1.x * p0.w - p0.x * p1.w;
float datdX = da0dX + da1dX + da2dX;
float datdY = da0dY + da1dY + da2dY;
float x01 = p0.x - p1.x;
float x12 = p1.x - p2.x;
float x20 = p2.x - p0.x;
float y01 = p0.y - p1.y;
float y12 = p1.y - p2.y;
float y20 = p2.y - p0.y;
float w01 = p0.w - p1.w;
float w12 = p1.w - p2.w;
float w20 = p2.w - p0.w;
float a0p1 = fy * p2.x - fx * p2.y;
float a0p2 = fx * p1.y - fy * p1.x;
float a1p0 = fx * p2.y - fy * p2.x;
float a1p2 = fy * p0.x - fx * p0.y;
float wdudX = 2.f * b0 * datdX - da0dX;
float wdudY = 2.f * b0 * datdY - da0dY;
float wdvdX = 2.f * b1 * datdX - da1dX;
float wdvdY = 2.f * b1 * datdY - da1dY;
float c0 = iw * (ddb.x * wdudX + ddb.y * wdudY + ddb.z * wdvdX + ddb.w * wdvdY);
float cx = c0 * fx - ddb.x * b0 - ddb.z * b1;
float cy = c0 * fy - ddb.y * b0 - ddb.w * b1;
float cxy = iw * (ddb.x * datdX + ddb.y * datdY);
float czw = iw * (ddb.z * datdX + ddb.w * datdY);
gp0x += c0 * y12 - cy * w12 + czw * p2y + ddb.w * p2.w;
gp1x += c0 * y20 - cy * w20 - cxy * p2y - ddb.y * p2.w;
gp2x += c0 * y01 - cy * w01 + cxy * p1y - czw * p0y + ddb.y * p1.w - ddb.w * p0.w;
gp0y += cx * w12 - c0 * x12 - czw * p2x - ddb.z * p2.w;
gp1y += cx * w20 - c0 * x20 + cxy * p2x + ddb.x * p2.w;
gp2y += cx * w01 - c0 * x01 - cxy * p1x + czw * p0x - ddb.x * p1.w + ddb.z * p0.w;
gp0w += cy * x12 - cx * y12 - czw * a1p0 + ddb.z * p2.y - ddb.w * p2.x;
gp1w += cy * x20 - cx * y20 - cxy * a0p1 - ddb.x * p2.y + ddb.y * p2.x;
gp2w += cy * x01 - cx * y01 - cxy * a0p2 - czw * a1p2 + ddb.x * p1.y - ddb.y * p1.x - ddb.z * p0.y + ddb.w * p0.x;
}
// Accumulate using coalesced atomics.
caAtomicAdd3_xyw(p.grad + 4 * vi0, gp0x, gp0y, gp0w);
caAtomicAdd3_xyw(p.grad + 4 * vi1, gp1x, gp1y, gp1w);
caAtomicAdd3_xyw(p.grad + 4 * vi2, gp2x, gp2y, gp2w);
}
// Template specializations.
__global__ void RasterizeGradKernel (const RasterizeGradParams p) { RasterizeGradKernelTemplate<false>(p); }
__global__ void RasterizeGradKernelDb(const RasterizeGradParams p) { RasterizeGradKernelTemplate<true>(p); }
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
// Constants and helpers.
#define RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
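// These cap the 2D thread block dimensions used when launching RasterizeGradKernel(Db)
// and size the per-block temporary storage for coalesced atomics (CA_DECLARE_TEMP);
// the launch configuration itself is set up by the framework-specific bindings, which
// are not part of this file.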
//------------------------------------------------------------------------
// Gradient CUDA kernel params.
struct RasterizeGradParams
{
const float* pos; // Incoming position buffer.
const int* tri; // Incoming triangle buffer.
const float* out; // Rasterizer output buffer.
const float* dy; // Incoming gradients of rasterizer output buffer.
const float* ddb; // Incoming gradients of bary diff output buffer.
float* grad; // Outgoing position gradients.
int numTriangles; // Number of triangles.
int numVertices; // Number of vertices.
int width; // Image width.
int height; // Image height.
int depth; // Size of minibatch.
int instance_mode; // 1 if in instance rendering mode.
float xs, xo, ys, yo; // Pixel position to clip-space x, y transform.
};
//------------------------------------------------------------------------
// The rest is for the C++ side only and is skipped when compiling CUDA kernels in torch builds.
#if !defined(NVDR_TORCH) || !defined(__CUDACC__)
#include "glutil.inl"
//------------------------------------------------------------------------
// Draw command struct used by rasterizer.
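// Field order matches OpenGL's DrawElementsIndirectCommand, so an array of these can be
// passed directly to glMultiDrawElementsIndirect().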
struct GLDrawCmd
{
uint32_t count;
uint32_t instanceCount;
uint32_t firstIndex;
uint32_t baseVertex;
uint32_t baseInstance;
};
//------------------------------------------------------------------------
// OpenGL-related persistent state for forward op.
struct RasterizeGLState
{
int width; // Allocated frame buffer width.
int height; // Allocated frame buffer height.
int depth; // Allocated frame buffer depth.
int posCount; // Allocated position buffer in floats.
int triCount; // Allocated triangle buffer in ints.
GLContext glctx;
GLuint glFBO;
GLuint glColorBuffer[2];
GLuint glDepthStencilBuffer;
GLuint glVAO;
GLuint glTriBuffer;
GLuint glPosBuffer;
GLuint glProgram;
GLuint glVertexShader;
GLuint glGeometryShader;
GLuint glFragmentShader;
cudaGraphicsResource_t cudaColorBuffer[2];
cudaGraphicsResource_t cudaPosBuffer;
cudaGraphicsResource_t cudaTriBuffer;
std::vector<GLDrawCmd> drawCmdBuffer;
int enableDB;
};
//------------------------------------------------------------------------
// Shared C++ code prototypes.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s);
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth);
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth);
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
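// Intended call order, roughly (a sketch; the framework bindings supply the actual
// context, stream, and pointers):
//   rasterizeInitGLContext(...)   once per GL state,
//   rasterizeResizeBuffers(...)   whenever inputs may exceed current allocations,
//   rasterizeRender(...)          to rasterize a minibatch,
//   rasterizeCopyResults(...)     to copy the color buffers into the output tensors.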
//------------------------------------------------------------------------
#endif // !defined(NVDR_TORCH) || !defined(__CUDACC__)
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "framework.h"
#include "texture.h"
//------------------------------------------------------------------------
// Mip stack construction and access helpers.
void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p)
{
char buf[1024];
int bufsz = 1024;
std::string msg = "Mip-map size error - cannot downsample an odd extent greater than 1. Resize the texture so that both spatial extents are powers of two, or limit the number of mip maps using the max_mip_level argument.\n";
int w = p.texWidth;
int h = p.texHeight;
bool ew = false;
bool eh = false;
msg += "Attempted mip stack construction:\n";
msg += "level width height\n";
msg += "----- ----- ------\n";
snprintf(buf, bufsz, "base %5d %5d\n", w, h);
msg += buf;
int level = 0;
while ((w|h) > 1 && !(ew || eh)) // Stop at first impossible size.
{
// Current level.
level += 1;
// Determine if downsampling fails.
ew = ew || (w > 1 && (w & 1));
eh = eh || (h > 1 && (h & 1));
// Downsample.
if (w > 1) w >>= 1;
if (h > 1) h >>= 1;
// Append level size to error message.
snprintf(buf, bufsz, "mip %-2d ", level);
msg += buf;
if (ew) snprintf(buf, bufsz, " err ");
else snprintf(buf, bufsz, "%5d ", w);
msg += buf;
if (eh) snprintf(buf, bufsz, " err\n");
else snprintf(buf, bufsz, "%5d\n", h);
msg += buf;
}
NVDR_CHECK(0, msg);
}
int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p)
{
// No levels at all?
if (p.mipLevelLimit == 0)
{
p.mipOffset[0] = 0;
p.mipLevelMax = 0;
return 0;
}
// Current level size.
int w = p.texWidth;
int h = p.texHeight;
p.mipOffset[0] = 0;
int mipTotal = 0;
int level = 0;
int c = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE) ? (p.channels * 6) : p.channels;
while ((w|h) > 1)
{
// Current level.
level += 1;
// Quit if cannot downsample.
if ((w > 1 && (w & 1)) || (h > 1 && (h & 1)))
raiseMipSizeError(NVDR_CTX_PARAMS, p);
// Downsample.
if (w > 1) w >>= 1;
if (h > 1) h >>= 1;
p.mipOffset[level] = mipTotal;
mipTotal += w * h * p.texDepth * c;
// Hit the level limit?
if (p.mipLevelLimit >= 0 && level == p.mipLevelLimit)
break;
}
p.mipLevelMax = level;
return mipTotal;
}
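// Example: for a 4x4, single-channel, depth-1, non-cube texture with no level limit, the
// loop above yields mipOffset[1] = 0 (2x2 level), mipOffset[2] = 4 (1x1 level), sets
// mipLevelMax = 2, and returns mipTotal = 5 floats. Level 0 is the base texture and is
// not stored in the mip stack.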
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "common.h"
#include "texture.h"
//------------------------------------------------------------------------
// Memory access and math helpers.
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float b, float c) { a[0] += b * c; }
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float2 b, float c) { a[0] += b.x * c; a[s] += b.y * c; }
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float4 b, float c) { a[0] += b.x * c; a[s] += b.y * c; a[2*s] += b.z * c; a[3*s] += b.w * c; }
static __device__ __forceinline__ void accum_to_mem(float& a, float* b, int s) { a += b[0]; }
static __device__ __forceinline__ void accum_to_mem(float2& a, float* b, int s) { float2 v = a; v.x += b[0]; v.y += b[s]; a = v; }
static __device__ __forceinline__ void accum_to_mem(float4& a, float* b, int s) { float4 v = a; v.x += b[0]; v.y += b[s]; v.z += b[2*s]; v.w += b[3*s]; a = v; }
template<class T> static __device__ __forceinline__ T lerp (const T& a, const T& b, float c) { return a + c * (b - a); }
template<class T> static __device__ __forceinline__ T bilerp(const T& a, const T& b, const T& c, const T& d, const float2& e) { return lerp(lerp(a, b, e.x), lerp(c, d, e.x), e.y); }
//------------------------------------------------------------------------
// Cube map wrapping for smooth filtering across edges and corners. At corners,
// one of the texture coordinates will be negative. For correct interpolation,
// the missing texel must take the average color of the other three.
static __constant__ uint32_t c_cubeWrapMask1[48] =
{
0x1530a440, 0x1133a550, 0x6103a110, 0x1515aa44, 0x6161aa11, 0x40154a04, 0x44115a05, 0x04611a01,
0x2630a440, 0x2233a550, 0x5203a110, 0x2626aa44, 0x5252aa11, 0x40264a04, 0x44225a05, 0x04521a01,
0x32608064, 0x3366a055, 0x13062091, 0x32328866, 0x13132299, 0x50320846, 0x55330a55, 0x05130219,
0x42508064, 0x4455a055, 0x14052091, 0x42428866, 0x14142299, 0x60420846, 0x66440a55, 0x06140219,
0x5230a044, 0x5533a055, 0x1503a011, 0x5252aa44, 0x1515aa11, 0x40520a44, 0x44550a55, 0x04150a11,
0x6130a044, 0x6633a055, 0x2603a011, 0x6161aa44, 0x2626aa11, 0x40610a44, 0x44660a55, 0x04260a11,
};
static __constant__ uint8_t c_cubeWrapMask2[48] =
{
0x26, 0x33, 0x11, 0x05, 0x00, 0x09, 0x0c, 0x04, 0x04, 0x00, 0x00, 0x05, 0x00, 0x81, 0xc0, 0x40,
0x02, 0x03, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x02, 0x64, 0x30, 0x90, 0x55, 0xa0, 0x99, 0xcc, 0x64,
0x24, 0x30, 0x10, 0x05, 0x00, 0x01, 0x00, 0x00, 0x06, 0x03, 0x01, 0x05, 0x00, 0x89, 0xcc, 0x44,
};
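// Each entry of c_cubeWrapMask1 packs eight 2-bit selectors (bits 0..15) that choose
// between 0, ix and iy for the four x and four y texel coordinates, plus four 4-bit
// fields (bits 16..31) holding destination face index + 1. c_cubeWrapMask2 holds the
// corresponding per-coordinate flip flags, decoded in wrapCubeMap() below.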
static __device__ __forceinline__ int4 wrapCubeMap(int face, int ix0, int ix1, int iy0, int iy1, int w)
{
// Calculate case number.
int cx = (ix0 < 0) ? 0 : (ix1 >= w) ? 2 : 1;
int cy = (iy0 < 0) ? 0 : (iy1 >= w) ? 6 : 3;
int c = cx + cy;
if (c >= 5)
c--;
c = (face << 3) + c;
// Compute coordinates and faces.
unsigned int m = c_cubeWrapMask1[c];
int x0 = (m >> 0) & 3; x0 = (x0 == 0) ? 0 : (x0 == 1) ? ix0 : iy0;
int x1 = (m >> 2) & 3; x1 = (x1 == 0) ? 0 : (x1 == 1) ? ix1 : iy0;
int x2 = (m >> 4) & 3; x2 = (x2 == 0) ? 0 : (x2 == 1) ? ix0 : iy1;
int x3 = (m >> 6) & 3; x3 = (x3 == 0) ? 0 : (x3 == 1) ? ix1 : iy1;
int y0 = (m >> 8) & 3; y0 = (y0 == 0) ? 0 : (y0 == 1) ? ix0 : iy0;
int y1 = (m >> 10) & 3; y1 = (y1 == 0) ? 0 : (y1 == 1) ? ix1 : iy0;
int y2 = (m >> 12) & 3; y2 = (y2 == 0) ? 0 : (y2 == 1) ? ix0 : iy1;
int y3 = (m >> 14) & 3; y3 = (y3 == 0) ? 0 : (y3 == 1) ? ix1 : iy1;
int f0 = ((m >> 16) & 15) - 1;
int f1 = ((m >> 20) & 15) - 1;
int f2 = ((m >> 24) & 15) - 1;
int f3 = ((m >> 28) ) - 1;
// Flips.
unsigned int f = c_cubeWrapMask2[c];
int w1 = w - 1;
if (f & 0x01) x0 = w1 - x0;
if (f & 0x02) x1 = w1 - x1;
if (f & 0x04) x2 = w1 - x2;
if (f & 0x08) x3 = w1 - x3;
if (f & 0x10) y0 = w1 - y0;
if (f & 0x20) y1 = w1 - y1;
if (f & 0x40) y2 = w1 - y2;
if (f & 0x80) y3 = w1 - y3;
// Done.
int4 tcOut;
tcOut.x = x0 + (y0 + f0 * w) * w;
tcOut.y = x1 + (y1 + f1 * w) * w;
tcOut.z = x2 + (y2 + f2 * w) * w;
tcOut.w = x3 + (y3 + f3 * w) * w;
return tcOut;
}
//------------------------------------------------------------------------
// Cube map indexing and gradient functions.
// Map a 3D lookup vector into (s,t) face coordinates (returned in the first
// two parameters) and a face index.
static __device__ __forceinline__ int indexCubeMap(float& x, float& y, float z)
{
float ax = fabsf(x);
float ay = fabsf(y);
float az = fabsf(z);
int idx;
float c;
if (az > fmaxf(ax, ay)) { idx = 4; c = z; }
else if (ay > ax) { idx = 2; c = y; y = z; }
else { idx = 0; c = x; x = z; }
if (c < 0.f) idx += 1;
float m = __frcp_rz(fabsf(c)) * .5;
float m0 = __uint_as_float(__float_as_uint(m) ^ ((0x21u >> idx) << 31));
float m1 = (idx != 2) ? -m : m;
x = x * m0 + .5;
y = y * m1 + .5;
x = fminf(fmaxf(x, 0.f), 1.f);
y = fminf(fmaxf(y, 0.f), 1.f);
return idx;
}
// Based on dL/d{s,t}, compute dL/d{x,y,z} at a given 3D lookup vector.
static __device__ __forceinline__ float3 indexCubeMapGrad(float3 uv, float gu, float gv)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c;
float c0 = gu;
float c1 = gv;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 *= uv.x; c1 *= uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; c0 *= uv.x; c1 *= uv.z; }
else { idx = 0x01; c = uv.x; c0 *= uv.z; c1 *= uv.y; }
if (c < 0.f) idx += idx;
float m = __frcp_rz(fabsf(c));
c0 = (idx & 0x34) ? -c0 : c0;
c1 = (idx & 0x2e) ? -c1 : c1;
float gl = (c0 + c1) * m;
float gx = (idx & 0x03) ? gl : (idx & 0x20) ? -gu : gu;
float gy = (idx & 0x0c) ? gl : -gv;
float gz = (idx & 0x30) ? gl : (idx & 0x03) ? gu : gv;
gz = (idx & 0x09) ? -gz : gz;
return make_float3(gx, gy, gz) * (m * .5f);
}
// Based on dL/d(d{s,t}/d{X,Y}), compute dL/d(d{x,y,z}/d{X,Y}). This is just two
// indexCubeMapGrad() functions rolled together.
static __device__ __forceinline__ void indexCubeMapGrad4(float3 uv, float4 dw, float3& g0, float3& g1)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c, c0, c1;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; }
else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; }
if (c < 0.f) idx += idx;
float m = __frcp_rz(fabsf(c));
c0 = (idx & 0x34) ? -c0 : c0;
c1 = (idx & 0x2e) ? -c1 : c1;
float gl0 = (dw.x * c0 + dw.z * c1) * m;
float gl1 = (dw.y * c0 + dw.w * c1) * m;
float gx0 = (idx & 0x03) ? gl0 : (idx & 0x20) ? -dw.x : dw.x;
float gx1 = (idx & 0x03) ? gl1 : (idx & 0x20) ? -dw.y : dw.y;
float gy0 = (idx & 0x0c) ? gl0 : -dw.z;
float gy1 = (idx & 0x0c) ? gl1 : -dw.w;
float gz0 = (idx & 0x30) ? gl0 : (idx & 0x03) ? dw.x : dw.z;
float gz1 = (idx & 0x30) ? gl1 : (idx & 0x03) ? dw.y : dw.w;
if (idx & 0x09)
{
gz0 = -gz0;
gz1 = -gz1;
}
g0 = make_float3(gx0, gy0, gz0) * (m * .5f);
g1 = make_float3(gx1, gy1, gz1) * (m * .5f);
}
// Compute d{s,t}/d{X,Y} based on d{x,y,z}/d{X,Y} at a given 3D lookup vector.
// Result is (ds/dX, ds/dY, dt/dX, dt/dY).
static __device__ __forceinline__ float4 indexCubeMapGradST(float3 uv, float3 dvdX, float3 dvdY)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c, gu, gv;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; }
else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; }
if (c < 0.f) idx += idx;
if (idx & 0x09)
{
dvdX.z = -dvdX.z;
dvdY.z = -dvdY.z;
}
float m = __frcp_rz(fabsf(c));
float dm = m * .5f;
float mm = m * dm;
gu *= (idx & 0x34) ? -mm : mm;
gv *= (idx & 0x2e) ? -mm : mm;
if (idx & 0x03)
{
return make_float4(gu * dvdX.x + dm * dvdX.z,
gu * dvdY.x + dm * dvdY.z,
gv * dvdX.x - dm * dvdX.y,
gv * dvdY.x - dm * dvdY.y);
}
else if (idx & 0x0c)
{
return make_float4(gu * dvdX.y + dm * dvdX.x,
gu * dvdY.y + dm * dvdY.x,
gv * dvdX.y + dm * dvdX.z,
gv * dvdY.y + dm * dvdY.z);
}
else // (idx & 0x30)
{
return make_float4(gu * dvdX.z + copysignf(dm, c) * dvdX.x,
gu * dvdY.z + copysignf(dm, c) * dvdY.x,
gv * dvdX.z - dm * dvdX.y,
gv * dvdY.z - dm * dvdY.y);
}
}
// Compute d(d{s,t}/d{X,Y})/d{x,y,z}, i.e., how the pixel derivatives of 2D face
// coordinates change w.r.t. 3D texture coordinate vector, returned as follows:
// | d(ds/dX)/dx d(ds/dY)/dx d(dt/dX)/dx d(dt/dY)/dx |
// | d(ds/dX)/dy d(ds/dY)/dy d(dt/dX)/dy d(dt/dY)/dy |
// | d(ds/dX)/dz d(ds/dY)/dz d(dt/dX)/dz d(dt/dY)/dz |
static __device__ __forceinline__ void indexCubeMapGrad2(float3 uv, float3 dvdX, float3 dvdY, float4& dx, float4& dy, float4& dz)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c, gu, gv;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; }
else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; }
if (c < 0.f) idx += idx;
if (idx & 0x09)
{
dvdX.z = -dvdX.z;
dvdY.z = -dvdY.z;
}
float m = __frcp_rz(c);
float dm = -m * fabsf(m) * .5;
float mm = m * m * .5;
float mu = (idx & 0x34) ? -mm : mm;
float mv = (idx & 0x2e) ? -mm : mm;
gu *= -2.0 * m * mu;
gv *= -2.0 * m * mv;
if (idx & 0x03)
{
dx.x = gu * dvdX.x + dm * dvdX.z;
dx.y = gu * dvdY.x + dm * dvdY.z;
dx.z = gv * dvdX.x - dm * dvdX.y;
dx.w = gv * dvdY.x - dm * dvdY.y;
dy.x = 0.f;
dy.y = 0.f;
dy.z = mv * dvdX.x;
dy.w = mv * dvdY.x;
dz.x = mu * dvdX.x;
dz.y = mu * dvdY.x;
dz.z = 0.f;
dz.w = 0.f;
}
else if (idx & 0x0c)
{
dx.x = mu * dvdX.y;
dx.y = mu * dvdY.y;
dx.z = 0.f;
dx.w = 0.f;
dy.x = gu * dvdX.y + dm * dvdX.x;
dy.y = gu * dvdY.y + dm * dvdY.x;
dy.z = gv * dvdX.y + dm * dvdX.z;
dy.w = gv * dvdY.y + dm * dvdY.z;
dz.x = 0.f;
dz.y = 0.f;
dz.z = mv * dvdX.y;
dz.w = mv * dvdY.y;
}
else // (idx & 0x30)
{
dx.x = mu * dvdX.z;
dx.y = mu * dvdY.z;
dx.z = 0.f;
dx.w = 0.f;
dy.x = 0.f;
dy.y = 0.f;
dy.z = mv * dvdX.z;
dy.w = mv * dvdY.z;
dz.x = gu * dvdX.z - fabsf(dm) * dvdX.x;
dz.y = gu * dvdY.z - fabsf(dm) * dvdY.x;
dz.z = gv * dvdX.z - dm * dvdX.y;
dz.w = gv * dvdY.z - dm * dvdY.y;
}
}
//------------------------------------------------------------------------
// General texture indexing.
template <bool CUBE_MODE>
static __device__ __forceinline__ int indexTextureNearest(const TextureKernelParams& p, float3 uv, int tz)
{
int w = p.texWidth;
int h = p.texHeight;
float u = uv.x;
float v = uv.y;
// Cube map indexing.
if (CUBE_MODE)
{
// No wrap. Fold face index into tz right away.
tz = 6 * tz + indexCubeMap(u, v, uv.z); // Rewrites u, v.
}
else
{
// Handle boundary.
if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
{
u = u - (float)__float2int_rd(u);
v = v - (float)__float2int_rd(v);
}
}
u = u * (float)w;
v = v * (float)h;
int iu = __float2int_rd(u);
int iv = __float2int_rd(v);
// In zero boundary mode, return texture address -1.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
{
if (iu < 0 || iu >= w || iv < 0 || iv >= h)
return -1;
}
// Otherwise clamp and calculate the coordinate properly.
iu = min(max(iu, 0), w-1);
iv = min(max(iv, 0), h-1);
return iu + w * (iv + tz * h);
}
template <bool CUBE_MODE>
static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelParams& p, float3 uv, int tz, int4& tcOut, int level)
{
// Mip level size.
int2 sz = mipLevelSize(p, level);
int w = sz.x;
int h = sz.y;
// Compute texture-space u, v.
float u = uv.x;
float v = uv.y;
bool clampU = false;
bool clampV = false;
// Cube map indexing.
int face = 0;
if (CUBE_MODE)
{
// Neither clamp nor wrap.
face = indexCubeMap(u, v, uv.z); // Rewrites u, v.
u = u * (float)w - 0.5f;
v = v * (float)h - 0.5f;
}
else
{
if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
{
// Wrap.
u = u - (float)__float2int_rd(u);
v = v - (float)__float2int_rd(v);
}
// Move to texel space.
u = u * (float)w - 0.5f;
v = v * (float)h - 0.5f;
if (p.boundaryMode == TEX_BOUNDARY_MODE_CLAMP)
{
// Clamp to center of edge texels.
u = fminf(fmaxf(u, 0.f), w - 1.f);
v = fminf(fmaxf(v, 0.f), h - 1.f);
clampU = (u == 0.f || u == w - 1.f);
clampV = (v == 0.f || v == h - 1.f);
}
}
// Compute texel coordinates and weights.
int iu0 = __float2int_rd(u);
int iv0 = __float2int_rd(v);
int iu1 = iu0 + (clampU ? 0 : 1); // Ensure zero u/v gradients when clamped.
int iv1 = iv0 + (clampV ? 0 : 1);
u -= (float)iu0;
v -= (float)iv0;
// Cube map wrapping.
bool cubeWrap = CUBE_MODE && (iu0 < 0 || iv0 < 0 || iu1 >= w || iv1 >= h);
if (cubeWrap)
{
tcOut = wrapCubeMap(face, iu0, iu1, iv0, iv1, w);
tcOut += 6 * tz * w * h; // Bring in tz.
return make_float2(u, v); // Done.
}
// Fold cube map face into tz.
if (CUBE_MODE)
tz = 6 * tz + face;
// Wrap overflowing texel indices.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
{
if (iu0 < 0) iu0 += w;
if (iv0 < 0) iv0 += h;
if (iu1 >= w) iu1 -= w;
if (iv1 >= h) iv1 -= h;
}
// Coordinates with tz folded in. Keep iu0/iu1 unshifted for the zero-mode range checks below.
int iu0z = iu0 + tz * w * h;
int iu1z = iu1 + tz * w * h;
tcOut.x = iu0z + w * iv0;
tcOut.y = iu1z + w * iv0;
tcOut.z = iu0z + w * iv1;
tcOut.w = iu1z + w * iv1;
// Invalidate texture addresses outside unit square if we are in zero mode.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
{
bool iu0_out = (iu0 < 0 || iu0 >= w);
bool iu1_out = (iu1 < 0 || iu1 >= w);
bool iv0_out = (iv0 < 0 || iv0 >= h);
bool iv1_out = (iv1 < 0 || iv1 >= h);
if (iu0_out || iv0_out) tcOut.x = -1;
if (iu1_out || iv0_out) tcOut.y = -1;
if (iu0_out || iv1_out) tcOut.z = -1;
if (iu1_out || iv1_out) tcOut.w = -1;
}
// All done.
return make_float2(u, v);
}
//------------------------------------------------------------------------
// Mip level calculation.
template <bool CUBE_MODE, int FILTER_MODE>
static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv)
{
// Do nothing if mips not in use.
if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR)
return;
// Get pixel derivatives of texture coordinates.
float4 uvDA;
float3 dvdX, dvdY; // Gradients use these later.
if (CUBE_MODE)
{
// Fetch.
float2 d0 = ((const float2*)p.uvDA)[3 * pidx + 0];
float2 d1 = ((const float2*)p.uvDA)[3 * pidx + 1];
float2 d2 = ((const float2*)p.uvDA)[3 * pidx + 2];
// Map d{x,y,z}/d{X,Y} into d{s,t}/d{X,Y}.
dvdX = make_float3(d0.x, d1.x, d2.x); // d{x,y,z}/dX
dvdY = make_float3(d0.y, d1.y, d2.y); // d{x,y,z}/dY
uvDA = indexCubeMapGradST(uv, dvdX, dvdY); // d{s,t}/d{X,Y}
}
else
{
// Fetch.
uvDA = ((const float4*)p.uvDA)[pidx];
}
// Scaling factors.
float uscl = p.texWidth;
float vscl = p.texHeight;
// d[s,t]/d[X,Y].
float dsdx = uvDA.x * uscl;
float dsdy = uvDA.y * uscl;
float dtdx = uvDA.z * vscl;
float dtdy = uvDA.w * vscl;
// Calculate footprint axis lengths.
float A = dsdx*dsdx + dtdx*dtdx;
float B = dsdy*dsdy + dtdy*dtdy;
float C = dsdx*dsdy + dtdx*dtdy;
float l2b = 0.5 * (A + B);
float l2n = 0.25 * (A-B)*(A-B) + C*C;
float l2a = sqrt(l2n);
float lenMinorSqr = fmaxf(0.0, l2b - l2a);
float lenMajorSqr = l2b + l2a;
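// A, B and C are the entries of the Gram matrix [[A, C], [C, B]] of the texel-space
// pixel footprint Jacobian; its eigenvalues l2b +/- l2a are the squared lengths of the
// footprint's major and minor axes.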
// Footprint vs. mip level gradient.
if (pdw && FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
{
float dw = 0.72134752f / (l2n + l2a * l2b); // Constant is 0.5/ln(2).
float AB = dw * .5f * (A - B);
float Cw = dw * C;
float l2aw = dw * l2a;
float d_f_ddsdX = uscl * (dsdx * (l2aw + AB) + dsdy * Cw);
float d_f_ddsdY = uscl * (dsdy * (l2aw - AB) + dsdx * Cw);
float d_f_ddtdX = vscl * (dtdx * (l2aw + AB) + dtdy * Cw);
float d_f_ddtdY = vscl * (dtdy * (l2aw - AB) + dtdx * Cw);
*pdw = make_float4(d_f_ddsdX, d_f_ddsdY, d_f_ddtdX, d_f_ddtdY);
// In cube maps, there is also a texture coordinate vs. mip level gradient.
if (CUBE_MODE)
{
float4 dx, dy, dz;
indexCubeMapGrad2(uv, dvdX, dvdY, dx, dy, dz);
float3 d_dsdX_dv = make_float3(dx.x, dy.x, dz.x);
float3 d_dsdY_dv = make_float3(dx.y, dy.y, dz.y);
float3 d_dtdX_dv = make_float3(dx.z, dy.z, dz.z);
float3 d_dtdY_dv = make_float3(dx.w, dy.w, dz.w);
float3 d_f_dv = make_float3(0.f, 0.f, 0.f);
d_f_dv += d_dsdX_dv * d_f_ddsdX;
d_f_dv += d_dsdY_dv * d_f_ddsdY;
d_f_dv += d_dtdX_dv * d_f_ddtdX;
d_f_dv += d_dtdY_dv * d_f_ddtdY;
*pdfdv = d_f_dv;
}
}
// Calculate true mip level and clamp.
flevel = .5f * __log2f(lenMajorSqr);
flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax);
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
{
// Linear-mipmap-nearest.
level0 = __float2int_rn(flevel);
}
else
{
// Linear-mipmap-linear.
if (flevel > 0.f) // Leave everything at zero if flevel == 0 (magnification)
{
level0 = __float2int_rd(flevel);
level1 = min(level0 + 1, p.mipLevelMax);
flevel -= level0; // Fractional part. Zero if clamped on last level.
}
}
}
//------------------------------------------------------------------------
// Texel fetch and accumulator helpers that understand cube map corners.
template<class T>
static __device__ __forceinline__ void fetchQuad(T& a00, T& a10, T& a01, T& a11, const float* pIn, int4 tc, bool corner)
{
if (corner)
{
T avg = zero_value<T>();
if (tc.x >= 0) avg += (a00 = *((const T*)&pIn[tc.x]));
if (tc.y >= 0) avg += (a10 = *((const T*)&pIn[tc.y]));
if (tc.z >= 0) avg += (a01 = *((const T*)&pIn[tc.z]));
if (tc.w >= 0) avg += (a11 = *((const T*)&pIn[tc.w]));
avg *= 0.33333333f;
if (tc.x < 0) a00 = avg;
if (tc.y < 0) a10 = avg;
if (tc.z < 0) a01 = avg;
if (tc.w < 0) a11 = avg;
}
else
{
a00 = (tc.x >= 0) ? *((const T*)&pIn[tc.x]) : zero_value<T>();
a10 = (tc.y >= 0) ? *((const T*)&pIn[tc.y]) : zero_value<T>();
a01 = (tc.z >= 0) ? *((const T*)&pIn[tc.z]) : zero_value<T>();
a11 = (tc.w >= 0) ? *((const T*)&pIn[tc.w]) : zero_value<T>();
}
}
static __device__ __forceinline__ void accumQuad(float4 c, float* pOut, int level, int4 tc, bool corner, CA_TEMP_PARAM)
{
if (corner)
{
float cb;
if (tc.x < 0) cb = c.x;
if (tc.y < 0) cb = c.y;
if (tc.z < 0) cb = c.z;
if (tc.w < 0) cb = c.w;
cb *= 0.33333333f;
if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x + cb);
if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y + cb);
if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z + cb);
if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w + cb);
}
else
{
if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x);
if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y);
if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z);
if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w);
}
}
//------------------------------------------------------------------------
// Mip builder kernel.
template<class T, int C>
static __forceinline__ __device__ void MipBuildKernelTemplate(const TextureKernelParams p)
{
// Sizes.
int2 sz_in = mipLevelSize(p, p.mipLevelOut - 1);
int2 sz_out = mipLevelSize(p, p.mipLevelOut);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= sz_out.x || py >= sz_out.y)
return;
// Pixel indices.
int pidx_in0 = p.channels * (((px + sz_in.x * py) << 1) + (pz * sz_in.x * sz_in.y));
int pidx_in1 = pidx_in0 + p.channels * sz_in.x; // Next pixel down.
int pidx_out = p.channels * (px + sz_out.x * (py + sz_out.y * pz));
// Input and output pointers.
const float* pin = (p.mipLevelOut > 1) ? (p.mip + p.mipOffset[p.mipLevelOut - 1]) : p.tex;
float* pout = p.mip + p.mipOffset[p.mipLevelOut];
// Special case: Input texture height or width is 1.
if (sz_in.x == 1 || sz_in.y == 1)
{
if (sz_in.y == 1)
pidx_in1 = pidx_in0 + p.channels; // Next pixel on the right.
for (int i=0; i < p.channels; i += C)
{
T v0 = *((const T*)&pin[pidx_in0 + i]);
T v1 = *((const T*)&pin[pidx_in1 + i]);
T avg = .5f * (v0 + v1);
#if TEX_DEBUG_MIP_RETAIN_VARIANCE
avg = (avg - .5f) * 1.41421356f + .5f;
#endif
*((T*)&pout[pidx_out + i]) = avg;
}
return;
}
for (int i=0; i < p.channels; i += C)
{
T v0 = *((const T*)&pin[pidx_in0 + i]);
T v1 = *((const T*)&pin[pidx_in0 + i + p.channels]);
T v2 = *((const T*)&pin[pidx_in1 + i]);
T v3 = *((const T*)&pin[pidx_in1 + i + p.channels]);
T avg = .25f * (v0 + v1 + v2 + v3);
#if TEX_DEBUG_MIP_RETAIN_VARIANCE
avg = (avg - .5f) * 2.f + .5f;
#endif
*((T*)&pout[pidx_out + i]) = avg;
}
}
// Template specializations.
__global__ void MipBuildKernel1(const TextureKernelParams p) { MipBuildKernelTemplate<float, 1>(p); }
__global__ void MipBuildKernel2(const TextureKernelParams p) { MipBuildKernelTemplate<float2, 2>(p); }
__global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTemplate<float4, 4>(p); }
//------------------------------------------------------------------------
// Forward kernel.
template <class T, int C, bool CUBE_MODE, int FILTER_MODE>
static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p)
{
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
int tz = (p.texDepth == 1) ? 0 : pz;
if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n)
return;
// Pixel index.
int pidx = px + p.imgWidth * (py + p.imgHeight * pz);
// Output ptr.
float* pOut = p.out + pidx * p.channels;
// Get UV.
float3 uv;
if (CUBE_MODE)
uv = ((const float3*)p.uv)[pidx];
else
uv = make_float3(((const float2*)p.uv)[pidx], 0.f);
// Nearest mode.
if (FILTER_MODE == TEX_MODE_NEAREST)
{
int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz);
tc *= p.channels;
const float* pIn = p.tex;
// Copy if valid tc, otherwise output zero.
for (int i=0; i < p.channels; i += C)
*((T*)&pOut[i]) = (tc >= 0) ? *((const T*)&pIn[tc + i]) : zero_value<T>();
return; // Exit.
}
// Calculate mip level. In 'linear' mode these will all stay zero.
float flevel = 0.f; // Fractional level.
int level0 = 0; // Discrete level 0.
int level1 = 0; // Discrete level 1.
calculateMipLevel<CUBE_MODE, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, 0, 0);
// Get texel indices and pointer for level 0.
int4 tc0 = make_int4(0, 0, 0, 0);
float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0);
const float* pIn0 = level0 ? (p.mip + p.mipOffset[level0]) : p.tex;
bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0);
tc0 *= p.channels;
// Bilinear fetch.
if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
{
// Interpolate.
for (int i=0; i < p.channels; i += C, tc0 += C)
{
T a00, a10, a01, a11;
fetchQuad<T>(a00, a10, a01, a11, pIn0, tc0, corner0);
*((T*)&pOut[i]) = bilerp(a00, a10, a01, a11, uv0);
}
return; // Exit.
}
// Get texel indices and pointer for level 1.
int4 tc1 = make_int4(0, 0, 0, 0);
float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1);
const float* pIn1 = level1 ? (p.mip + p.mipOffset[level1]) : p.tex;
bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0);
tc1 *= p.channels;
// Trilinear fetch.
for (int i=0; i < p.channels; i += C, tc0 += C, tc1 += C)
{
// First level.
T a00, a10, a01, a11;
fetchQuad<T>(a00, a10, a01, a11, pIn0, tc0, corner0);
T a = bilerp(a00, a10, a01, a11, uv0);
// Second level unless in magnification mode.
if (flevel > 0.f)
{
T b00, b10, b01, b11;
fetchQuad<T>(b00, b10, b01, b11, pIn1, tc1, corner1);
T b = bilerp(b00, b10, b01, b11, uv1);
a = lerp(a, b, flevel); // Interpolate between levels.
}
// Write.
*((T*)&pOut[i]) = a;
}
}
// Template specializations.
__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
//------------------------------------------------------------------------
// Gradient mip puller kernel.
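// For each base-level texel, gathers the gradients accumulated in gradTexMip across all
// coarser mip levels, weighted by the mip construction weights, and adds the sum into the
// main texture gradient buffer gradTex.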
template<class T, int C>
static __forceinline__ __device__ void MipGradKernelTemplate(const TextureKernelParams p)
{
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.texWidth || py >= p.texHeight)
return;
// Number of wide elements.
int c = p.channels;
if (C == 2) c >>= 1;
if (C == 4) c >>= 2;
// Dynamically allocated shared memory for holding a texel.
extern __shared__ float s_texelAccum[];
int sharedOfs = threadIdx.x + threadIdx.y * blockDim.x;
int sharedStride = blockDim.x * blockDim.y;
# define TEXEL_ACCUM(_i) (s_texelAccum + (sharedOfs + (_i) * sharedStride))
// Clear the texel.
for (int i=0; i < p.channels; i++)
*TEXEL_ACCUM(i) = 0.f;
// Track texel position and accumulation weight over the mip stack.
int x = px;
int y = py;
float w = 1.f;
// Pull gradients from all levels.
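// The accumulation weight halves per non-degenerate axis at each level, so a texel that was
// averaged into a coarser level receives the corresponding fraction of that level's gradient.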
int2 sz = mipLevelSize(p, 0); // Previous level size.
for (int level=1; level <= p.mipLevelMax; level++)
{
// Weight decay depends on previous level size.
if (sz.x > 1) w *= .5f;
if (sz.y > 1) w *= .5f;
// Current level size and coordinates.
sz = mipLevelSize(p, level);
x >>= 1;
y >>= 1;
T* pIn = (T*)(p.gradTexMip + p.mipOffset[level] + (x + sz.x * (y + sz.y * pz)) * p.channels);
for (int i=0; i < c; i++)
accum_from_mem(TEXEL_ACCUM(i * C), sharedStride, pIn[i], w);
}
// Add to main texture gradients.
T* pOut = (T*)(p.gradTex + (px + p.texWidth * (py + p.texHeight * pz)) * p.channels);
for (int i=0; i < c; i++)
accum_to_mem(pOut[i], TEXEL_ACCUM(i * C), sharedStride);
}
// Template specializations.
__global__ void MipGradKernel1(const TextureKernelParams p) { MipGradKernelTemplate<float, 1>(p); }
__global__ void MipGradKernel2(const TextureKernelParams p) { MipGradKernelTemplate<float2, 2>(p); }
__global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTemplate<float4, 4>(p); }
//------------------------------------------------------------------------
// Gradient kernel.
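// Computes, for one output pixel, gradients w.r.t. the texture (and mip stack), the uv
// coordinates, and, in trilinear mode, the uv pixel differentials.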
template <bool CUBE_MODE, int FILTER_MODE>
static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p)
{
// Temporary space for coalesced atomics.
CA_DECLARE_TEMP(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH * TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
int tz = (p.texDepth == 1) ? 0 : pz;
if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n)
return;
// Pixel index.
int pidx = px + p.imgWidth * (py + p.imgHeight * pz);
// Early exit if output gradients are zero.
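// The bitwise OR of the incoming gradient bit patterns compares equal to 0.0f only if every
// component is +/-0, so a single float comparison detects an all-zero gradient.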
const float* pDy = p.dy + pidx * p.channels;
unsigned int dmax = 0u;
if ((p.channels & 3) == 0)
{
for (int i=0; i < p.channels; i += 4)
{
uint4 dy = *((const uint4*)&pDy[i]);
dmax |= (dy.x | dy.y | dy.z | dy.w);
}
}
else
{
for (int i=0; i < p.channels; i++)
dmax |= __float_as_uint(pDy[i]);
}
// Store zeros and exit.
if (__uint_as_float(dmax) == 0.f)
{
if (CUBE_MODE)
{
if (FILTER_MODE != TEX_MODE_NEAREST)
((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f);
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
{
((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f);
((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f);
((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f);
}
}
else
{
if (FILTER_MODE != TEX_MODE_NEAREST)
((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f);
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
}
return;
}
// Get UV.
float3 uv;
if (CUBE_MODE)
uv = ((const float3*)p.uv)[pidx];
else
uv = make_float3(((const float2*)p.uv)[pidx], 0.f);
// Nearest mode - texture gradients only.
if (FILTER_MODE == TEX_MODE_NEAREST)
{
int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz);
if (tc < 0)
return; // Outside texture.
tc *= p.channels;
float* pOut = p.gradTex;
// Accumulate texture gradients.
for (int i=0; i < p.channels; i++)
caAtomicAddTexture(pOut, 0, tc + i, pDy[i]);
return; // Exit.
}
// Calculate mip level. In 'linear' mode these will all stay zero.
float4 dw = make_float4(0.f, 0.f, 0.f, 0.f);
float3 dfdv = make_float3(0.f, 0.f, 0.f);
float flevel = 0.f; // Fractional level.
int level0 = 0; // Discrete level 0.
int level1 = 0; // Discrete level 1.
calculateMipLevel<CUBE_MODE, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, &dw, &dfdv);
// UV gradient accumulators.
float gu = 0.f;
float gv = 0.f;
// Get texel indices and pointers for level 0.
int4 tc0 = make_int4(0, 0, 0, 0);
float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0);
const float* pIn0 = level0 ? (p.mip + p.mipOffset[level0]) : p.tex;
float* pOut0 = level0 ? (p.gradTexMip + p.mipOffset[level0]) : p.gradTex;
bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0);
tc0 *= p.channels;
// Texel weights.
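// Bilinear corner weights in factored form: uv000 = (1-u)(1-v), uv010 = u(1-v),
// uv001 = (1-u)v, uv011 = u*v, with (u, v) = uv0.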
float uv011 = uv0.x * uv0.y;
float uv010 = uv0.x - uv011;
float uv001 = uv0.y - uv011;
float uv000 = 1.f - uv0.x - uv001;
float4 tw0 = make_float4(uv000, uv010, uv001, uv011);
// Attribute weights.
int2 sz0 = mipLevelSize(p, level0);
float sclu0 = (float)sz0.x;
float sclv0 = (float)sz0.y;
// Bilinear mode - texture and uv gradients.
if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
{
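// The analytic bilerp derivative is d/du = (a10 - a00) + v * (a11 + a00 - a10 - a01)
// (and symmetrically for v); sclu0/sclv0 rescale it from per-texel to normalized uv units.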
for (int i=0; i < p.channels; i++, tc0 += 1)
{
float dy = pDy[i];
accumQuad(tw0 * dy, pOut0, level0, tc0, corner0, CA_TEMP);
float a00, a10, a01, a11;
fetchQuad<float>(a00, a10, a01, a11, pIn0, tc0, corner0);
float ad = (a11 + a00 - a10 - a01);
gu += dy * ((a10 - a00) + uv0.y * ad) * sclu0;
gv += dy * ((a01 - a00) + uv0.x * ad) * sclv0;
}
// Store UV gradients and exit.
if (CUBE_MODE)
((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv);
else
((float2*)p.gradUV)[pidx] = make_float2(gu, gv);
return;
}
// Accumulate fractional mip level gradient.
float df = 0; // dL/df.
// Get texel indices and pointers for level 1.
int4 tc1 = make_int4(0, 0, 0, 0);
float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1);
const float* pIn1 = level1 ? (p.mip + p.mipOffset[level1]) : p.tex;
float* pOut1 = level1 ? (p.gradTexMip + p.mipOffset[level1]) : p.gradTex;
bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0);
tc1 *= p.channels;
// Texel weights.
float uv111 = uv1.x * uv1.y;
float uv110 = uv1.x - uv111;
float uv101 = uv1.y - uv111;
float uv100 = 1.f - uv1.x - uv101;
float4 tw1 = make_float4(uv100, uv110, uv101, uv111);
// Attribute weights.
int2 sz1 = mipLevelSize(p, level1);
float sclu1 = (float)sz1.x;
float sclv1 = (float)sz1.y;
// Trilinear mode.
for (int i=0; i < p.channels; i++, tc0 += 1, tc1 += 1)
{
float dy = pDy[i];
float dy0 = (1.f - flevel) * dy;
accumQuad(tw0 * dy0, pOut0, level0, tc0, corner0, CA_TEMP);
// UV gradients for first level.
float a00, a10, a01, a11;
fetchQuad<float>(a00, a10, a01, a11, pIn0, tc0, corner0);
float ad = (a11 + a00 - a10 - a01);
gu += dy0 * ((a10 - a00) + uv0.y * ad) * sclu0;
gv += dy0 * ((a01 - a00) + uv0.x * ad) * sclv0;
// Second level unless in magnification mode.
if (flevel > 0.f)
{
// Texture gradients for second level.
float dy1 = flevel * dy;
accumQuad(tw1 * dy1, pOut1, level1, tc1, corner1, CA_TEMP);
// UV gradients for second level.
float b00, b10, b01, b11;
fetchQuad<float>(b00, b10, b01, b11, pIn1, tc1, corner1);
float bd = (b11 + b00 - b10 - b01);
gu += dy1 * ((b10 - b00) + uv1.y * bd) * sclu1;
gv += dy1 * ((b01 - b00) + uv1.x * bd) * sclv1;
// Mip level gradient.
float a = bilerp(a00, a10, a01, a11, uv0);
float b = bilerp(b00, b10, b01, b11, uv1);
df += (b-a) * dy;
}
}
// Store UV gradients.
if (CUBE_MODE)
((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv) + (dfdv * df);
else
((float2*)p.gradUV)[pidx] = make_float2(gu, gv);
// Final UV pixel differential gradients.
dw *= df; // dL/(d{s,t}/d{X,Y}) = df/(d{s,t}/d{X,Y}) * dL/df.
// Store them.
if (CUBE_MODE)
{
// Remap from dL/(d{s,t}/d{X,Y}) to dL/(d{x,y,z}/d{X,Y}).
float3 g0, g1;
indexCubeMapGrad4(uv, dw, g0, g1);
((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(g0.x, g1.x);
((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(g0.y, g1.y);
((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(g0.z, g1.z);
}
else
((float4*)p.gradUVDA)[pidx] = dw;
}
// Template specializations.
__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_NEAREST>(p); }
__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR>(p); }
__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_NEAREST>(p); }
__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR>(p); }
__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "framework.h"
//------------------------------------------------------------------------
// Constants.
#define TEX_DEBUG_MIP_RETAIN_VARIANCE 0 // For debugging
#define TEX_FWD_MAX_KERNEL_BLOCK_WIDTH 8
#define TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT 8
#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH 8
#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8
#define TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH 8
#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8
#define TEX_MAX_MIP_LEVEL 14 // Currently a texture cannot be larger than 2 GB because we use 32-bit indices everywhere.
#define TEX_MODE_NEAREST 0 // Nearest on base level.
#define TEX_MODE_LINEAR 1 // Bilinear on base level.
#define TEX_MODE_LINEAR_MIPMAP_NEAREST 2 // Bilinear on nearest mip level.
#define TEX_MODE_LINEAR_MIPMAP_LINEAR 3 // Trilinear.
#define TEX_MODE_COUNT 4
#define TEX_BOUNDARY_MODE_CUBE 0 // Cube map mode.
#define TEX_BOUNDARY_MODE_WRAP 1 // Wrap (u, v).
#define TEX_BOUNDARY_MODE_CLAMP 2 // Clamp (u, v).
#define TEX_BOUNDARY_MODE_ZERO 3 // Pad with zeros.
#define TEX_BOUNDARY_MODE_COUNT 4
//------------------------------------------------------------------------
// CUDA kernel params.
struct TextureKernelParams
{
const float* tex; // Incoming texture buffer.
const float* uv; // Incoming texcoord buffer.
const float* uvDA; // Incoming uv pixel diffs. NULL if mips disabled.
const float* dy; // Incoming output gradient.
float* mip; // Mip data buffer.
float* out; // Outgoing texture data.
float* gradTex; // Outgoing texture gradient.
float* gradTexMip; // Temporary texture gradients for mip levels > 0.
float* gradUV; // Outgoing texcoord gradient.
float* gradUVDA; // Outgoing texcoord pixel differential gradient.
int enableMip; // If true, we have uv_da input and mip output tensor.
int filterMode; // One of the TEX_MODE_ constants.
int boundaryMode; // One of the TEX_BOUNDARY_MODE_ constants.
int texConst; // If true, texture is known to be constant.
int mipLevelLimit; // Mip level limit coming from the op.
int channels; // Number of texture channels.
int imgWidth; // Image width.
int imgHeight; // Image height.
int texWidth; // Texture width.
int texHeight; // Texture height.
int texDepth; // Texture depth.
int n; // Minibatch size.
int mipLevelMax; // Maximum mip level index. Zero if mips disabled.
int mipOffset[TEX_MAX_MIP_LEVEL]; // Offsets in mip data. 0: unused, 1+: offset to mip.
int mipLevelOut; // Mip level being calculated in builder kernel.
};
//------------------------------------------------------------------------
// C++ helper function prototypes.
void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p);
int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p);
//------------------------------------------------------------------------
// Macros.
#define mipLevelSize(p, i) make_int2(((p).texWidth >> (i)) > 1 ? ((p).texWidth >> (i)) : 1, ((p).texHeight >> (i)) > 1 ? ((p).texHeight >> (i)) : 1)
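// Example: for a 256x64 texture, mipLevelSize(p, 3) is (32, 8); each dimension clamps at 1.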
//------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
from .ops import rasterize, interpolate, texture, antialias
from .plugin_loader import set_cache_dir
__all__ = ["rasterize", "interpolate", "texture", "antialias", "set_cache_dir"]
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import tensorflow as tf
import numpy as np
import os
from . import plugin_loader
#----------------------------------------------------------------------------
# Helpers.
#----------------------------------------------------------------------------
# OpenGL-related linker options depending on platform.
def _get_gl_opts():
libs = {
'posix': ['GL', 'GLEW'],
'nt': ['gdi32', 'glew32s', 'opengl32', 'user32'],
}
return ['-l' + x for x in libs[os.name]]
# Load the cpp plugin.
def _get_plugin():
fn = os.path.join(os.path.dirname(__file__), 'tf_all.cu')
return plugin_loader.get_plugin(fn, extra_nvcc_options=_get_gl_opts() + ['-DNVDR_TENSORFLOW'])
# Convert parameter to a numpy array if possible.
def _get_constant(x, dtype):
try:
return np.asarray(x, dtype=dtype)
except (TypeError, ValueError):
return None
# Tests for construction-time constantness instead of a tf.constant node because
# the latter can be overridden in Session.run() feed_dict at evaluation time.
def _is_constant(x, dtype):
if isinstance(x, np.ndarray):
return np.can_cast(x.dtype, dtype, 'unsafe')
else:
return _get_constant(x, dtype) is not None
#----------------------------------------------------------------------------
# Rasterize.
#----------------------------------------------------------------------------
def rasterize(pos, tri, resolution, ranges=None, tri_const=False, output_db=True, grad_db=True):
assert tri_const is True or tri_const is False
assert output_db is True or output_db is False
# Known constant resolution?
resolution_c = _get_constant(resolution, np.int32)
# Known constant triangles?
tri_const = tri_const or _is_constant(tri, np.int32)
# Convert all inputs to tensors / base types.
tri_const = 1 if tri_const else 0
tri = tf.convert_to_tensor(tri, dtype=tf.int32)
pos = tf.convert_to_tensor(pos, dtype=tf.float32)
resolution = tf.convert_to_tensor(resolution, dtype=tf.int32)
if ranges is None:
ranges = tf.convert_to_tensor(np.zeros(shape=[0, 2], dtype=np.int32)) # Empty tensor.
else:
ranges = tf.convert_to_tensor(ranges, dtype=tf.int32) # Convert input to tensor.
# Infer as much about the output shape as possible.
out_shape = [None, None, None, 4]
if pos.shape.rank == 3: # Instanced mode.
out_shape[0] = pos.shape[0].value
elif pos.shape.rank == 2: # Range mode.
if ranges.shape.rank not in [None, 0]:
out_shape[0] = ranges.shape[0].value
if resolution_c is not None:
assert resolution_c.shape == (2,)
out_shape[1], out_shape[2] = resolution_c
# Output pixel differentials.
@tf.custom_gradient
def func_db(pos):
out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 1, tri_const)
out.set_shape(out_shape)
out_db.set_shape(out_shape)
def grad(dy, ddb):
if grad_db:
return _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb)
else:
return _get_plugin().rasterize_grad(pos, tri, out, dy)
return (out, out_db), grad
# Do not output pixel differentials.
@tf.custom_gradient
def func(pos):
out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 0, tri_const)
out.set_shape(out_shape)
out_db.set_shape(out_shape[:-1] + [0]) # Zero channels in out_db.
def grad(dy, _):
return _get_plugin().rasterize_grad(pos, tri, out, dy)
return (out, out_db), grad
# Choose stub.
if output_db:
return func_db(pos)
else:
return func(pos)
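# Illustrative call (hypothetical shapes), instanced mode with a batch of 4 meshes:
#   pos: [4, num_vertices, 4] clip-space positions, tri: [num_triangles, 3] int32 indices.
#     rast, rast_db = rasterize(pos, tri, resolution=[256, 256])
#   Both outputs have shape [4, 256, 256, 4]; with output_db=False, rast_db has zero channels.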
#----------------------------------------------------------------------------
# Interpolate.
#----------------------------------------------------------------------------
def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None):
# Sanitize the list of pixel differential attributes.
if diff_attrs is None:
diff_attrs = []
elif diff_attrs != 'all':
diff_attrs = _get_constant(diff_attrs, np.int32)
assert (diff_attrs is not None) and len(diff_attrs.shape) == 1
diff_attrs = diff_attrs.tolist()
# Convert all inputs to tensors.
attr = tf.convert_to_tensor(attr, dtype=tf.float32)
rast = tf.convert_to_tensor(rast, dtype=tf.float32)
tri = tf.convert_to_tensor(tri, dtype=tf.int32)
if diff_attrs:
rast_db = tf.convert_to_tensor(rast_db, dtype=tf.float32)
# Infer output shape.
out_shape = [None, None, None, None]
if rast.shape.rank is not None:
out_shape = [rast.shape[0].value, rast.shape[1].value, rast.shape[2].value, None]
if attr.shape.rank in [2, 3]:
out_shape[3] = attr.shape[-1].value
# Output pixel differentials for at least some attributes.
@tf.custom_gradient
def func_da(attr, rast, rast_db):
diff_attrs_all = int(diff_attrs == 'all')
diff_attrs_list = [] if diff_attrs_all else diff_attrs
out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list)
# Infer number of channels in out_da.
if not diff_attrs_all:
da_channels = 2 * len(diff_attrs)
if (attr.shape.rank in [2, 3]) and (attr.shape[-1].value is not None):
da_channels = 2 * attr.shape[-1].value
else:
da_channels = None
# Set output shapes.
out.set_shape(out_shape)
out_da.set_shape([out_shape[0], out_shape[1], out_shape[2], da_channels])
def grad(dy, dda):
return _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list)
return (out, out_da), grad
# No pixel differentials for any attribute.
@tf.custom_gradient
def func(attr, rast):
out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri)
out.set_shape(out_shape)
out_da.set_shape(out_shape[:-1] + [0]) # Zero channels in out_da.
def grad(dy, _):
return _get_plugin().interpolate_grad(attr, rast, tri, dy)
return (out, out_da), grad
# Choose stub.
if diff_attrs:
return func_da(attr, rast, rast_db)
else:
return func(attr, rast)
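# Illustrative call (hypothetical shapes): interpolate per-vertex colors over the rasterizer output,
#   attr: [minibatch, num_vertices, 3], rast and tri from rasterize().
#     color, _ = interpolate(attr, rast, tri)
#   Pass rast_db and diff_attrs='all' (or a list of attribute indices) to also get pixel
#   differentials of the selected attributes in the second output.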
#----------------------------------------------------------------------------
# Texture.
#----------------------------------------------------------------------------
def texture(tex, uv, uv_da=None, filter_mode='auto', boundary_mode='wrap', tex_const=False, max_mip_level=None):
assert tex_const is True or tex_const is False
# Default filter mode.
if filter_mode == 'auto':
filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear'
# Known constant texture?
tex_const = tex_const or _is_constant(tex, np.float32)
# Sanitize inputs.
tex_const = 1 if tex_const else 0
if max_mip_level is None:
max_mip_level = -1
else:
max_mip_level = int(max_mip_level)
assert max_mip_level >= 0
# Convert inputs to tensors.
tex = tf.convert_to_tensor(tex, dtype=tf.float32)
uv = tf.convert_to_tensor(uv, dtype=tf.float32)
if 'mipmap' in filter_mode:
uv_da = tf.convert_to_tensor(uv_da, dtype=tf.float32)
# Infer output shape.
out_shape = [None, None, None, None]
if uv.shape.rank is not None:
assert uv.shape.rank == 4
out_shape = [uv.shape[0].value, uv.shape[1].value, uv.shape[2].value, None]
if tex.shape.rank is not None:
assert tex.shape.rank == (5 if boundary_mode == 'cube' else 4)
out_shape[-1] = tex.shape[-1].value
# If mipmapping is disabled via max_mip_level=0, we may as well use simpler filtering internally.
if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']:
filter_mode = 'linear'
# Convert filter mode to internal enumeration.
filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3}
filter_mode_enum = filter_mode_dict[filter_mode]
# Convert boundary mode to internal enumeration.
boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3}
boundary_mode_enum = boundary_mode_dict[boundary_mode]
# Linear-mipmap-linear: Mipmaps enabled, all gradients active.
@tf.custom_gradient
def func_linear_mipmap_linear(tex, uv, uv_da):
out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level)
out.set_shape(out_shape)
def grad(dy):
return _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level)
return out, grad
# Linear-mipmap-nearest: Mipmaps enabled, no gradients to uv_da.
@tf.custom_gradient
def func_linear_mipmap_nearest(tex, uv):
out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level)
out.set_shape(out_shape)
def grad(dy):
return _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level)
return out, grad
# Linear: Mipmaps disabled, no uv_da, no gradients to uv_da.
@tf.custom_gradient
def func_linear(tex, uv):
out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum)
out.set_shape(out_shape)
def grad(dy):
return _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum)
return out, grad
# Nearest: Mipmaps disabled, no uv_da, no gradients to uv_da or uv.
@tf.custom_gradient
def func_nearest(tex):
out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum)
out.set_shape(out_shape)
def grad(dy):
return _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum)
return out, grad
# Choose stub.
if filter_mode == 'linear-mipmap-linear':
return func_linear_mipmap_linear(tex, uv, uv_da)
elif filter_mode == 'linear-mipmap-nearest':
return func_linear_mipmap_nearest(tex, uv)
elif filter_mode == 'linear':
return func_linear(tex, uv)
elif filter_mode == 'nearest':
return func_nearest(tex)
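# Illustrative call (hypothetical shapes): trilinear sampling of a 2D texture,
#   tex: [minibatch, tex_height, tex_width, channels], uv: [minibatch, height, width, 2],
#   uv_da: [minibatch, height, width, 4] pixel differentials from interpolate().
#     color = texture(tex, uv, uv_da)  # filter_mode='auto' picks 'linear-mipmap-linear' here
#   Without uv_da, filter_mode='auto' falls back to non-mipmapped 'linear' filtering.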
#----------------------------------------------------------------------------
# Antialias.
#----------------------------------------------------------------------------
def antialias(color, rast, pos, tri, tri_const=False, pos_gradient_boost=1.0):
assert tri_const is True or tri_const is False
# Known constant triangles?
tri_const = tri_const or _is_constant(tri, np.int32)
# Convert inputs to tensors.
color = tf.convert_to_tensor(color, dtype=tf.float32)
rast = tf.convert_to_tensor(rast, dtype=tf.float32)
pos = tf.convert_to_tensor(pos, dtype=tf.float32)
tri = tf.convert_to_tensor(tri, dtype=tf.int32)
# Sanitize inputs.
tri_const = 1 if tri_const else 0
@tf.custom_gradient
def func(color, pos):
color_out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, tri_const)
color_out.set_shape(color.shape)
def grad(dy):
grad_color, grad_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer)
if pos_gradient_boost != 1.0:
grad_pos = grad_pos * pos_gradient_boost
return grad_color, grad_pos
return color_out, grad
return func(color, pos)
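# Illustrative call: color and rast from the forward pipeline, pos and tri as passed to rasterize().
#     color_aa = antialias(color, rast, pos, tri)
#   pos_gradient_boost only scales the position gradient returned by the backward pass.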
#----------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import glob
import os
import re
import uuid
import hashlib
import tempfile
import shutil
import tensorflow as tf
from tensorflow.python.client import device_lib # pylint: disable=no-name-in-module
#----------------------------------------------------------------------------
# Global options.
_nvdiffrast_cache_dir = None
def set_cache_dir(path: str) -> None:
'''Set CUDA kernel compilation temp dir.
If `set_cache_dir` is not called, the cache directory will default to
one of the below:
- Value of NVDIFFRAST_CACHE_DIR env var, if set
- $HOME/.cache/nvdiffrast if HOME env var is set
- $USERPROFILE/.cache/nvdiffrast if USERPROFILE is set
- A .cache/nvdiffrast directory under the system temp dir otherwise
Args:
path: Where to save CUDA kernel build temporaries
'''
global _nvdiffrast_cache_dir
_nvdiffrast_cache_dir = path
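# Example (hypothetical path): call before the first op is used so the compiled plugin lands there.
#   set_cache_dir('/raid/tmp/nvdiffrast_cache')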
def make_cache_dir_path(*paths: str) -> str:
if _nvdiffrast_cache_dir is not None:
return os.path.join(_nvdiffrast_cache_dir, *paths)
if 'NVDIFFRAST_CACHE_DIR' in os.environ:
return os.path.join(os.environ['NVDIFFRAST_CACHE_DIR'], *paths)
if 'HOME' in os.environ:
return os.path.join(os.environ['HOME'], '.cache', 'nvdiffrast', *paths)
if 'USERPROFILE' in os.environ:
return os.path.join(os.environ['USERPROFILE'], '.cache', 'nvdiffrast', *paths)
return os.path.join(tempfile.gettempdir(), '.cache', 'nvdiffrast', *paths)
cuda_cache_version_tag = 'v1'
do_not_hash_included_headers = False # Speed up compilation by assuming that headers included by the CUDA code never change. Unsafe!
verbose = True # Print status messages to stdout.
#----------------------------------------------------------------------------
# Internal helper funcs.
def _find_compiler_bindir():
hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
vc_bin_dir = 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/vc/bin'
if os.path.isdir(vc_bin_dir):
return vc_bin_dir
return None
def _get_compute_cap(device):
caps_str = device.physical_device_desc
m = re.search('compute capability: (\\d+)\\.(\\d+)', caps_str)
major = m.group(1)
minor = m.group(2)
return (major, minor)
def _get_cuda_gpu_arch_string():
gpus = [x for x in device_lib.list_local_devices() if x.device_type == 'GPU']
if len(gpus) == 0:
raise RuntimeError('No GPU devices found')
(major, minor) = _get_compute_cap(gpus[0])
return 'sm_%s%s' % (major, minor)
def _run_cmd(cmd):
with os.popen(cmd) as pipe:
output = pipe.read()
status = pipe.close()
if status is not None:
raise RuntimeError('NVCC returned an error. See below for full command line and output log:\n\n%s\n\n%s' % (cmd, output))
def _prepare_nvcc_cli(opts):
cmd = 'nvcc ' + opts.strip()
cmd += ' --disable-warnings'
cmd += ' --include-path "%s"' % tf.sysconfig.get_include()
cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'protobuf_archive', 'src')
cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'com_google_absl')
cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'eigen_archive')
compiler_bindir = _find_compiler_bindir()
if compiler_bindir is None:
# Require that _find_compiler_bindir succeeds on Windows. Allow
# nvcc to use whatever is the default on Linux.
if os.name == 'nt':
raise RuntimeError('Could not find MSVC/GCC/CLANG installation on this computer. Check the search paths in _find_compiler_bindir() in "%s".' % __file__)
else:
cmd += ' --compiler-bindir "%s"' % compiler_bindir
cmd += ' 2>&1'
return cmd
#----------------------------------------------------------------------------
# Main entry point.
_plugin_cache = dict()
def get_plugin(cuda_file, extra_nvcc_options=[]):
cuda_file_base = os.path.basename(cuda_file)
cuda_file_name, cuda_file_ext = os.path.splitext(cuda_file_base)
# Already in cache?
if cuda_file in _plugin_cache:
return _plugin_cache[cuda_file]
# Setup plugin.
if verbose:
print('Setting up TensorFlow plugin "%s": ' % cuda_file_base, end='', flush=True)
try:
# Hash CUDA source.
md5 = hashlib.md5()
with open(cuda_file, 'rb') as f:
md5.update(f.read())
md5.update(b'\n')
# Hash headers included by the CUDA code by running it through the preprocessor.
if not do_not_hash_included_headers:
if verbose:
print('Preprocessing... ', end='', flush=True)
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + cuda_file_ext)
_run_cmd(_prepare_nvcc_cli('"%s" --preprocess -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir)))
with open(tmp_file, 'rb') as f:
bad_file_str = ('"' + cuda_file.replace('\\', '/') + '"').encode('utf-8') # __FILE__ in error check macros
good_file_str = ('"' + cuda_file_base + '"').encode('utf-8')
for ln in f:
if not ln.startswith(b'# ') and not ln.startswith(b'#line '): # ignore line number pragmas
ln = ln.replace(bad_file_str, good_file_str)
md5.update(ln)
md5.update(b'\n')
# Select compiler options.
compile_opts = ''
if os.name == 'nt':
compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.lib')
compile_opts += ' --library-path="%s"' % (os.path.dirname(__file__) + r"\..\lib") # Find glew32s.lib during compilation.
elif os.name == 'posix':
compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.so')
compile_opts += ' --compiler-options \'-fPIC -D_GLIBCXX_USE_CXX11_ABI=0\''
else:
assert False # not Windows or Linux, w00t?
compile_opts += ' --gpu-architecture=%s' % _get_cuda_gpu_arch_string()
compile_opts += ' --use_fast_math'
for opt in extra_nvcc_options:
compile_opts += ' ' + opt
nvcc_cmd = _prepare_nvcc_cli(compile_opts)
# Hash build configuration.
md5.update(('nvcc_cmd: ' + nvcc_cmd).encode('utf-8') + b'\n')
md5.update(('tf.VERSION: ' + tf.VERSION).encode('utf-8') + b'\n')
md5.update(('cuda_cache_version_tag: ' + cuda_cache_version_tag).encode('utf-8') + b'\n')
# Compile if not already compiled.
bin_file_ext = '.dll' if os.name == 'nt' else '.so'
cuda_cache_path = make_cache_dir_path()
bin_file = os.path.join(make_cache_dir_path(), cuda_file_name + '_' + md5.hexdigest() + bin_file_ext)
if not os.path.isfile(bin_file):
if verbose:
print('Compiling... ', end='', flush=True)
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + bin_file_ext)
_run_cmd(nvcc_cmd + ' "%s" --shared -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir))
os.makedirs(cuda_cache_path, exist_ok=True)
intermediate_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + uuid.uuid4().hex + '_tmp' + bin_file_ext)
shutil.copyfile(tmp_file, intermediate_file)
os.rename(intermediate_file, bin_file) # atomic
# Load.
if verbose:
print('Loading... ', end='', flush=True)
plugin = tf.load_op_library(bin_file)
# Add to cache.
_plugin_cache[cuda_file] = plugin
if verbose:
print('Done.', flush=True)
return plugin
except:
if verbose:
print('Failed!', flush=True)
raise
#----------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
// TF-specific helpers.
#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal("Cuda error: ", cudaGetErrorName(err), "[", #CUDA_CALL, ";]")); } while (0)
#define OP_CHECK_GL_ERROR(CTX, GL_CALL) do { GL_CALL; GLenum err = glGetError(); OP_REQUIRES(CTX, err == GL_NO_ERROR, errors::Internal("OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]")); } while (0)
// Cuda kernels and CPP all together. What an absolute compilation unit.
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#include "../common/framework.h"
#include "../common/common.h"
#include "../common/common.cpp"
#include "../common/rasterize.h"
#include "../common/rasterize.cpp"
#include "../common/rasterize.cu"
#include "tf_rasterize.cu"
#include "../common/interpolate.cu"
#include "tf_interpolate.cu"
#include "../common/texture.cpp"
#include "../common/texture.cu"
#include "tf_texture.cu"
#include "../common/antialias.cu"
#include "tf_antialias.cu"
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// Forward TensorFlow op.
struct AntialiasFwdOp : public OpKernel
{
AntialiasKernelParams m_attribs;
AntialiasFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_attribs.tri_const));
}
void Compute(OpKernelContext* ctx)
{
AntialiasKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Get input.
const Tensor& color = ctx->input(0);
const Tensor& rasterOut = ctx->input(1);
const Tensor& pos = ctx->input(2);
const Tensor& tri = ctx->input(3);
// Instance rendering mode?
p.instance_mode = pos.dims() > 2;
// Extract input dimensions.
if (p.instance_mode)
p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0;
else
p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0;
p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
p.n = (color.dims() > 0) ? color.dim_size(0) : 0;
p.height = (color.dims() > 1) ? color.dim_size(1) : 0;
p.width = (color.dims() > 2) ? color.dim_size(2) : 0;
p.channels = (color.dims() > 3) ? color.dim_size(3) : 0;
// Sanity checks.
OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]"));
OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions"));
if (p.instance_mode)
{
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos"));
}
else
{
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out"));
}
// Get input pointers.
p.color = color.flat<float>().data();
p.rasterOut = rasterOut.flat<float>().data();
p.tri = tri.flat<int>().data();
p.pos = pos.flat<float>().data();
// Misc parameters.
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
// Allocate output tensor.
Tensor* outputTensor = NULL;
TensorShape outputShape;
outputShape.AddDim(p.n);
outputShape.AddDim(p.height);
outputShape.AddDim(p.width);
outputShape.AddDim(p.channels);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, outputShape, &outputTensor));
p.output = outputTensor->flat<float>().data();
// Allocate work buffer. One extra int4 for storing counters.
Tensor* workTensor = NULL;
TensorShape workShape;
workShape.AddDim(p.n * p.width * p.height * 8 + 4); // 8 ints for a maximum of two work items per pixel.
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, workShape, &workTensor));
p.workBuffer = (int4*)(workTensor->flat<int>().data());
// Clear the work counters.
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream));
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.rasterOut & 7), errors::Internal("raster_out input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4"));
// Kernel parameters.
void* args[] = {&p};
// (Re-)calculate opposite vertex hash.
if (!p.evHash || !p.tri_const)
{
if (p.allocTriangles < p.numTriangles)
{
p.allocTriangles = max(p.allocTriangles, 64);
while (p.allocTriangles < p.numTriangles)
p.allocTriangles <<= 1; // Must be power of two.
// (Re-)allocate memory for the hash.
OP_CHECK_CUDA_ERROR(ctx, cudaFree(p.evHash));
OP_CHECK_CUDA_ERROR(ctx, cudaMalloc(&p.evHash, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * sizeof(uint4)));
LOG(INFO) << "Increasing topology hash size to accommodate " << p.allocTriangles << " triangles";
}
// Clear the hash and launch the mesh kernel to populate it.
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.evHash, 0, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * sizeof(uint4), stream));
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
}
// Copy input to output as a baseline.
OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.output, p.color, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream));
// Choose launch parameters for the discontinuity finder kernel and launch.
dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n);
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream));
// Determine optimum block size for the persistent analysis kernel.
int device = 0;
int numCTA = 0;
int numSM = 0;
OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device));
OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0));
OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
// Launch analysis kernel.
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
}
};
REGISTER_OP("AntialiasFwd")
.Input ("color: float")
.Input ("raster_out: float")
.Input ("pos: float")
.Input ("tri: int32")
.Output ("output: float")
.Output ("work_buffer: int32")
.Attr ("tri_const: int");
REGISTER_KERNEL_BUILDER(Name("AntialiasFwd").Device(DEVICE_GPU), AntialiasFwdOp);
//------------------------------------------------------------------------
// Gradient TensorFlow op.
struct AntialiasGradOp : public OpKernel
{
AntialiasKernelParams m_attribs;
AntialiasGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
}
void Compute(OpKernelContext* ctx)
{
AntialiasKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Get input.
const Tensor& color = ctx->input(0);
const Tensor& rasterOut = ctx->input(1);
const Tensor& pos = ctx->input(2);
const Tensor& tri = ctx->input(3);
const Tensor& dy = ctx->input(4);
const Tensor& workBuffer = ctx->input(5);
// Instance rendering mode?
p.instance_mode = pos.dims() > 2;
// Extract input dimensions.
if (p.instance_mode)
p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0;
else
p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0;
p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
p.n = (color.dims() > 0) ? color.dim_size(0) : 0;
p.height = (color.dims() > 1) ? color.dim_size(1) : 0;
p.width = (color.dims() > 2) ? color.dim_size(2) : 0;
p.channels = (color.dims() > 3) ? color.dim_size(3) : 0;
// Sanity checks.
OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) > 0 && dy.dim_size(2) > 0 && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape[>0, >0, >0, >0]"));
OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]"));
OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions"));
OP_REQUIRES(ctx, color.dim_size(1) == dy.dim_size(1) && color.dim_size(2) == dy.dim_size(2) && color.dim_size(3) == dy.dim_size(3), errors::InvalidArgument("color and dy inputs must have same dimensions"));
if (p.instance_mode)
{
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos"));
OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out, pos"));
}
else
{
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out"));
OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out"));
}
// Get input pointers.
p.dy = dy.flat<float>().data();
p.color = color.flat<float>().data();
p.rasterOut = rasterOut.flat<float>().data();
p.tri = tri.flat<int>().data();
p.pos = pos.flat<float>().data();
p.workBuffer = (int4*)(workBuffer.flat<int>().data());
// Misc parameters.
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
// Allocate color gradient output tensor.
Tensor* gradColor = NULL;
TensorShape gradColorShape;
gradColorShape.AddDim(p.n);
gradColorShape.AddDim(p.height);
gradColorShape.AddDim(p.width);
gradColorShape.AddDim(p.channels);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, gradColorShape, &gradColor));
p.gradColor = gradColor->flat<float>().data();
// Allocate position gradient output tensor.
Tensor* gradPos = NULL;
TensorShape gradPosShape;
if (p.instance_mode)
gradPosShape.AddDim(p.n);
gradPosShape.AddDim(p.numVertices);
gradPosShape.AddDim(4);
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, gradPosShape, &gradPos));
p.gradPos = gradPos->flat<float>().data();
// Initialize the work counter, seed color gradients with dy, and clear position gradients.
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream)); // Gradient kernel work counter.
OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.gradColor, p.dy, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream));
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradPos, 0, (p.instance_mode ? p.n : 1) * p.numVertices * 4 * sizeof(float), stream));
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4"));
// Launch the gradient kernel.
void* args[] = {&p};
int device = 0;
int numCTA = 0;
int numSM = 0;
OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device));
OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0));
OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
}
};
REGISTER_OP("AntialiasGrad")
.Input ("color: float")
.Input ("raster_out: float")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("dy: float")
.Input ("work_buffer: int32")
.Output ("grad_color: float")
.Output ("grad_pos: float");
REGISTER_KERNEL_BUILDER(Name("AntialiasGrad").Device(DEVICE_GPU), AntialiasGradOp);
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// Common op attribute parser.
static __host__ void interpolateParseOpAttributes(OpKernelConstruction* ctx, InterpolateKernelParams& p, bool enableDA)
{
if (enableDA)
{
OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs_all", &p.diff_attrs_all));
if (!p.diff_attrs_all)
{
std::vector<int> diff_attrs_vec;
OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs", &diff_attrs_vec));
OP_REQUIRES(ctx, diff_attrs_vec.size() > 0, errors::InvalidArgument("differentiation enabled with empty diff_attrs list"));
OP_REQUIRES(ctx, diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, errors::InvalidArgument("too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)"));
p.numDiffAttr = diff_attrs_vec.size();
memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int));
}
}
}
//------------------------------------------------------------------------
// Forward TensorFlow op.
template <bool ENABLE_DA>
struct InterpolateFwdOp : public OpKernel
{
InterpolateKernelParams m_attribs;
InterpolateFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA);
}
void Compute(OpKernelContext* ctx)
{
InterpolateKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Get input.
const Tensor& attr = ctx->input(0);
const Tensor& rast = ctx->input(1);
const Tensor& tri = ctx->input(2);
const Tensor& rast_db = ctx->input(ENABLE_DA ? 3 : 2);
// Instance rendering mode?
p.instance_mode = attr.dims() > 2;
// Extract input dimensions.
if (p.instance_mode)
{
p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0;
p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0;
}
else
{
p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0;
p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0;
}
p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0;
p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0;
p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0;
// Sanity checks.
OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]"));
if (p.instance_mode)
OP_REQUIRES(ctx, attr.dim_size(0) == p.depth || attr.dim_size(0) == 1, errors::InvalidArgument("minibatch size mismatch between inputs rast, attr"));
if (ENABLE_DA)
{
OP_REQUIRES(ctx, rast_db.dims() == 4 && rast_db.dim_size(0) > 0 && rast_db.dim_size(1) > 0 && rast_db.dim_size(2) > 0 && rast_db.dim_size(3) == 4, errors::InvalidArgument("rast_db must have shape[>0, >0, >0, 4]"));
OP_REQUIRES(ctx, rast_db.dim_size(1) == rast.dim_size(1) && rast_db.dim_size(2) == rast.dim_size(2), errors::InvalidArgument("spatial size mismatch between inputs rast and rast_db"));
OP_REQUIRES(ctx, rast_db.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, rast_db"));
}
// All diff attrs mode.
if (p.diff_attrs_all)
p.numDiffAttr = p.numAttr;
// Get input pointers.
p.attr = attr.flat<float>().data();
p.rast = rast.flat<float>().data();
p.tri = tri.flat<int>().data();
p.attrBC = (p.instance_mode && attr.dim_size(0) == 1) ? 1 : 0;
p.rastDB = ENABLE_DA ? rast_db.flat<float>().data() : 0;
// Allocate main output tensor.
Tensor* out_tensor = NULL;
TensorShape out_shape;
out_shape.AddDim(p.depth);
out_shape.AddDim(p.height);
out_shape.AddDim(p.width);
out_shape.AddDim(p.numAttr);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor));
p.out = out_tensor->flat<float>().data();
// Allocate pixel differential output tensor.
Tensor* out_da_tensor = NULL;
out_shape.set_dim(3, p.numDiffAttr * 2);
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, out_shape, &out_da_tensor));
p.outDA = ENABLE_DA ? out_da_tensor->flat<float>().data() : 0;
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4"));
if (ENABLE_DA)
OP_REQUIRES(ctx, !((uintptr_t)p.outDA & 7), errors::Internal("out_da output tensor not aligned to float2"));
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
// Launch CUDA kernel.
void* args[] = {&p};
void* func = ENABLE_DA ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel;
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
}
};
REGISTER_OP("InterpolateFwd")
.Input ("attr: float")
.Input ("rast: float")
.Input ("tri: int32")
.Output ("out: float")
.Output ("out_da: float");
REGISTER_OP("InterpolateFwdDa")
.Input ("attr: float")
.Input ("rast: float")
.Input ("tri: int32")
.Input ("rast_db: float")
.Output ("out: float")
.Output ("out_da: float")
.Attr ("diff_attrs_all: int")
.Attr ("diff_attrs: list(int)");
REGISTER_KERNEL_BUILDER(Name("InterpolateFwd") .Device(DEVICE_GPU), InterpolateFwdOp<false>);
REGISTER_KERNEL_BUILDER(Name("InterpolateFwdDa").Device(DEVICE_GPU), InterpolateFwdOp<true>);
//------------------------------------------------------------------------
// Gradient TensorFlow op.
template <bool ENABLE_DA>
struct InterpolateGradOp : public OpKernel
{
InterpolateKernelParams m_attribs;
InterpolateGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA);
}
void Compute(OpKernelContext* ctx)
{
InterpolateKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Get input.
const Tensor& attr = ctx->input(0);
const Tensor& rast = ctx->input(1);
const Tensor& tri = ctx->input(2);
const Tensor& dy = ctx->input(3);
const Tensor& rast_db = ctx->input(ENABLE_DA ? 4 : 3);
const Tensor& dda = ctx->input(ENABLE_DA ? 5 : 3);
// Instance rendering mode?
p.instance_mode = attr.dims() > 2;
// Extract input dimensions.
if (p.instance_mode)
{
p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0;
p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0;
}
else
{
p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0;
p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0;
}
p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0;
p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0;
p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0;
int attr_depth = p.instance_mode ? (attr.dims() > 1 ? attr.dim_size(0) : 0) : 1;
// Sanity checks.
OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]"));
OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape [>0, height, width, >0]"));
OP_REQUIRES(ctx, dy.dim_size(3) == p.numAttr, errors::InvalidArgument("argument count mismatch between inputs dy, attr"));
OP_REQUIRES(ctx, (attr_depth == p.depth || attr_depth == 1) && dy.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, dy, attr"));
if (ENABLE_DA)
{
OP_REQUIRES(ctx, dda.dims() == 4 && dda.dim_size(0) > 0 && dda.dim_size(1) == p.height && dda.dim_size(2) == p.width, errors::InvalidArgument("dda must have shape [>0, height, width, ?]"));
OP_REQUIRES(ctx, dda.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between rast, dda"));
}
// All diff attrs mode.
if (p.diff_attrs_all)
p.numDiffAttr = p.numAttr;
// Get input pointers.
p.attr = attr.flat<float>().data();
p.rast = rast.flat<float>().data();
p.tri = tri.flat<int>().data();
p.dy = dy.flat<float>().data();
p.rastDB = ENABLE_DA ? rast_db.flat<float>().data() : 0;
p.dda = ENABLE_DA ? dda.flat<float>().data() : 0;
p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0;
// Allocate attribute gradient output tensor.
Tensor* grad_attr_tensor = NULL;
TensorShape grad_attr_shape;
if (p.instance_mode)
grad_attr_shape.AddDim(attr_depth);
grad_attr_shape.AddDim(p.numVertices);
grad_attr_shape.AddDim(p.numAttr);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_attr_shape, &grad_attr_tensor));
p.gradAttr = grad_attr_tensor->flat<float>().data();
// Allocate bary gradient output tensor.
Tensor* grad_rast_tensor = NULL;
TensorShape grad_rast_shape;
grad_rast_shape.AddDim(p.depth);
grad_rast_shape.AddDim(p.height);
grad_rast_shape.AddDim(p.width);
grad_rast_shape.AddDim(4);
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_rast_shape, &grad_rast_tensor));
p.gradRaster = grad_rast_tensor->flat<float>().data();
// Allocate bary pixel diff gradient output tensor.
if (ENABLE_DA)
{
Tensor* grad_rast_db_tensor = NULL;
OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_rast_shape, &grad_rast_db_tensor));
p.gradRasterDB = grad_rast_db_tensor->flat<float>().data();
}
// Clear attribute gradients.
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradAttr, 0, attr_depth * p.numVertices * p.numAttr * sizeof(float), stream));
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradRaster & 15), errors::Internal("grad_rast output tensor not aligned to float4"));
if (ENABLE_DA)
{
OP_REQUIRES(ctx, !((uintptr_t)p.dda & 7), errors::Internal("dda input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradRasterDB & 15), errors::Internal("grad_rast_db output tensor not aligned to float4"));
}
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
// Launch CUDA kernel.
void* args[] = {&p};
void* func = ENABLE_DA ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel;
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
}
};
REGISTER_OP("InterpolateGrad")
.Input ("attr: float")
.Input ("rast: float")
.Input ("tri: int32")
.Input ("dy: float")
.Output ("grad_attr: float")
.Output ("grad_rast: float")
;
REGISTER_OP("InterpolateGradDa")
.Input ("attr: float")
.Input ("rast: float")
.Input ("tri: int32")
.Input ("dy: float")
.Input ("rast_db: float")
.Input ("dda: float")
.Output ("grad_attr: float")
.Output ("grad_rast: float")
.Output ("grad_rast_db: float")
.Attr ("diff_attrs_all: int")
.Attr ("diff_attrs: list(int)");
;
REGISTER_KERNEL_BUILDER(Name("InterpolateGrad") .Device(DEVICE_GPU), InterpolateGradOp<false>);
REGISTER_KERNEL_BUILDER(Name("InterpolateGradDa").Device(DEVICE_GPU), InterpolateGradOp<true>);
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// Forward TensorFlow op.
struct RasterizeFwdOp : public OpKernel
{
RasterizeGLState m_glState; // OpenGL-related persistent state.
int m_tri_const; // 1 if triangle array is known to be constant.
RasterizeFwdOp(OpKernelConstruction* ctx):
OpKernel(ctx)
{
memset(&m_glState, 0, sizeof(RasterizeGLState));
OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB));
OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const));
}
void Compute(OpKernelContext* ctx)
{
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Check that input shapes are correct.
const Tensor& pos = ctx->input(0);
const Tensor& tri = ctx->input(1);
const Tensor& resolution = ctx->input(2);
const Tensor& ranges = ctx->input(3);
// Determine number of outputs
int num_outputs = m_glState.enableDB ? 2 : 1;
// Determine instance mode and check input dimensions.
bool instance_mode = pos.dims() > 2;
if (instance_mode)
{
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("instance mode - pos must have shape [>0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
}
else
{
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
}
// Get output shape.
const int32_t* res_in = resolution.flat<int32_t>().data(); // This is in CPU memory.
int height = res_in[0];
int width = res_in[1];
int depth = instance_mode ? pos.dim_size(0) : ranges.dim_size(0);
OP_REQUIRES(ctx, height > 0 && width > 0, errors::InvalidArgument("resolution must be [>0, >0]"));
// Get position and triangle buffer sizes in int32/float32.
int posCount = 4 * pos.dim_size(0) * (instance_mode ? pos.dim_size(1) : 1);
int triCount = 3 * tri.dim_size(0);
// Init context and GL?
bool initCtx = !m_glState.glFBO;
if (initCtx)
rasterizeInitGLContext(ctx, m_glState); // In common/rasterize.inl
else
setGLContext(m_glState.glctx); // (Re-)Activate GL context.
// Resize all buffers.
rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.inl
// Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround.
if (initCtx)
{
// On first execution, do a bonus context swap.
releaseGLContext();
setGLContext(m_glState.glctx);
}
// Copy input data to GL and render.
const float* posPtr = pos.flat<float>().data();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat<int32_t>().data(); // This is in CPU memory.
const int32_t* triPtr = (initCtx || !m_tri_const) ? tri.flat<int32_t>().data() : NULL; // Copy triangles only if needed.
int vtxPerInstance = instance_mode ? pos.dim_size(1) : 0;
rasterizeRender(ctx, m_glState, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth);
// Allocate output tensors.
TensorShape output_shape;
output_shape.AddDim(depth);
output_shape.AddDim(height);
output_shape.AddDim(width);
output_shape.AddDim(4);
float* outputPtr[2] = { NULL, NULL }; // Second entry stays NULL when the out_db output is disabled.
for (int i=0; i < 2; i++)
{
if (i >= num_outputs)
output_shape.set_dim(3, 0); // Zero channels for unwanted out_db tensor.
Tensor* output_tensor = NULL;
OP_REQUIRES_OK(ctx, ctx->allocate_output(i, output_shape, &output_tensor));
if (i < num_outputs)
outputPtr[i] = output_tensor->flat<float>().data();
}
// Copy rasterized results into CUDA buffers.
rasterizeCopyResults(ctx, m_glState, stream, outputPtr, width, height, depth);
// Done. Release GL context.
releaseGLContext();
}
};
REGISTER_OP("RasterizeFwd")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("resolution: int32")
.Input ("ranges: int32")
.Output ("out: float")
.Output ("out_db: float")
.Attr ("enable_db: int")
.Attr ("tri_const: int");
REGISTER_KERNEL_BUILDER(Name("RasterizeFwd").Device(DEVICE_GPU).HostMemory("resolution").HostMemory("ranges"), RasterizeFwdOp);
//------------------------------------------------------------------------
// Gradient TensorFlow op.
template <bool ENABLE_DB>
struct RasterizeGradOp : public OpKernel
{
RasterizeGradParams m_attribs;
RasterizeGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
}
void Compute(OpKernelContext* ctx)
{
RasterizeGradParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Input tensors.
const Tensor& pos = ctx->input(0);
const Tensor& tri = ctx->input(1);
const Tensor& out = ctx->input(2);
const Tensor& dy = ctx->input(3);
const Tensor& ddb = ctx->input(ENABLE_DB ? 4 : 3);
// Determine instance mode.
p.instance_mode = (pos.dims() > 2) ? 1 : 0;
// Shape is taken from the rasterizer output tensor.
OP_REQUIRES(ctx, out.dims() == 4, errors::InvalidArgument("out must be rank-4"));
p.depth = out.dim_size(0);
p.height = out.dim_size(1);
p.width = out.dim_size(2);
OP_REQUIRES(ctx, p.depth > 0 && p.height > 0 && p.width > 0, errors::InvalidArgument("resolution must be [>0, >0, >0]"));
// Check other shapes.
if (p.instance_mode)
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) == p.depth && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [depth, >0, 4]"));
else
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, out.dims() == 4 && out.dim_size(0) == p.depth && out.dim_size(1) == p.height && out.dim_size(2) == p.width && out.dim_size(3) == 4, errors::InvalidArgument("out must have shape [depth, height, width, 4]"));
OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.depth && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) == 4, errors::InvalidArgument("dy must have shape [depth, height, width, 4]"));
if (ENABLE_DB)
OP_REQUIRES(ctx, ddb.dims() == 4 && ddb.dim_size(0) == p.depth && ddb.dim_size(1) == p.height && ddb.dim_size(2) == p.width && ddb.dim_size(3) == 4, errors::InvalidArgument("ddb must have shape [depth, height, width, 4]"));
// Populate parameters.
p.numTriangles = tri.dim_size(0);
p.numVertices = p.instance_mode ? pos.dim_size(1) : pos.dim_size(0);
p.pos = pos.flat<float>().data();
p.tri = tri.flat<int>().data();
p.out = out.flat<float>().data();
p.dy = dy.flat<float>().data();
p.ddb = ENABLE_DB ? ddb.flat<float>().data() : 0;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
p.ys = 2.f / (float)p.height;
p.yo = 1.f / (float)p.height - 1.f;
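// Note: assuming the gradient kernel applies these as x = px * xs + xo (and likewise for y),
// a pixel index px maps to the clip-space coordinate of the pixel center:
// px * (2/width) + (1/width - 1) = (2*px + 1)/width - 1.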
// Allocate output tensor for position gradients.
Tensor* grad_tensor = NULL;
TensorShape grad_shape;
if (p.instance_mode)
grad_shape.AddDim(p.depth);
grad_shape.AddDim(p.numVertices);
grad_shape.AddDim(4);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_shape, &grad_tensor));
p.grad = grad_tensor->flat<float>().data();
// Clear the output buffers.
size_t gradBytes = (p.instance_mode ? p.depth : 1) * p.numVertices * 4 * sizeof(float);
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.grad, 0, gradBytes, stream));
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2"));
if (ENABLE_DB)
OP_REQUIRES(ctx, !((uintptr_t)p.ddb & 15), errors::Internal("ddb input tensor not aligned to float4"));
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
// Launch CUDA kernel.
void* args[] = {&p};
void* func = ENABLE_DB ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel;
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
}
};
REGISTER_OP("RasterizeGrad")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("out: float")
.Input ("dy: float")
.Output ("grad: float");
REGISTER_OP("RasterizeGradDb")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("out: float")
.Input ("dy: float")
.Input ("ddb: float")
.Output ("grad: float");
REGISTER_KERNEL_BUILDER(Name("RasterizeGrad") .Device(DEVICE_GPU), RasterizeGradOp<false>);
REGISTER_KERNEL_BUILDER(Name("RasterizeGradDb").Device(DEVICE_GPU), RasterizeGradOp<true>);
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// Common op attribute parser.
static __host__ void parseOpAttributes(OpKernelConstruction* ctx, TextureKernelParams& p)
{
// Mip and filter modes.
OP_REQUIRES_OK(ctx, ctx->GetAttr("filter_mode", &p.filterMode));
OP_REQUIRES(ctx, p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, errors::InvalidArgument("filter_mode unsupported"));
p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR);
// Mip level clamp.
if (p.enableMip)
{
OP_REQUIRES_OK(ctx, ctx->GetAttr("max_mip_level", &p.mipLevelLimit));
OP_REQUIRES(ctx, p.mipLevelLimit >= -1, errors::InvalidArgument("invalid max_mip_level"));
ctx->GetAttr("tex_const", &p.texConst); // Only available in forward op.
}
// Boundary mode.
OP_REQUIRES_OK(ctx, ctx->GetAttr("boundary_mode", &p.boundaryMode));
OP_REQUIRES(ctx, p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, errors::InvalidArgument("boundary_mode unsupported"));
}
//------------------------------------------------------------------------
// Forward TensorFlow op.
struct TextureFwdOp : public OpKernel
{
TextureKernelParams m_attribs;
PersistentTensor m_persistentMipTensor; // Used if texture is constant and mips are enabled.
bool m_persistentMipTensorInitialized;
TextureFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
m_persistentMipTensorInitialized = false;
parseOpAttributes(ctx, m_attribs);
}
void Compute(OpKernelContext* ctx)
{
TextureKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE);
// Get input.
const Tensor& tex = ctx->input(0);
const Tensor& uv = ctx->input(1);
const Tensor& uv_da = ctx->input(p.enableMip ? 2 : 1);
// Extract input dimensions.
p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0;
p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0;
p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0;
p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0;
if (!cube_mode)
{
p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0;
p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0;
p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0;
}
else
{
p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0;
p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0;
p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0;
}
// Sanity checks.
if (!cube_mode)
{
OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]"));
OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]"));
}
else
{
OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode"));
OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode"));
OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode"));
}
OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv"));
OP_REQUIRES(ctx, p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), errors::InvalidArgument("texture size too large"));
if (p.enableMip)
{
if (!cube_mode)
OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]"));
else
OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"));
}
// Get input pointers.
p.tex = tex.flat<float>().data();
p.uv = uv.flat<float>().data();
p.uvDA = p.enableMip ? uv_da.flat<float>().data() : 0;
// Allocate output tensor.
Tensor* out_tensor = NULL;
TensorShape out_shape;
out_shape.AddDim(p.n);
out_shape.AddDim(p.imgHeight);
out_shape.AddDim(p.imgWidth);
out_shape.AddDim(p.channels);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor));
p.out = out_tensor->flat<float>().data();
// Choose kernel variants based on channel count.
void* args[] = {&p};
int channel_div_idx = 0;
if (!(p.channels & 3))
channel_div_idx = 2; // Channel count divisible by 4.
else if (!(p.channels & 1))
channel_div_idx = 1; // Channel count divisible by 2.
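// E.g. channels = 4 -> channel_div_idx = 2 (float4 path), channels = 6 -> 1 (float2 path), channels = 3 -> 0 (scalar path).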
// Mip-related setup.
if (p.enableMip)
{
// Generate mip offsets.
int mipTotal = calculateMipInfo(ctx, p);
// Mip output tensor.
Tensor* mip_tensor = NULL;
TensorShape mip_shape;
mip_shape.AddDim(mipTotal);
// If texture is constant, calculate mip stack only once.
bool computeMip = true;
if (p.texConst)
{
// First execution?
if (!m_persistentMipTensorInitialized)
{
// Allocate a persistent mip tensor.
OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_FLOAT, mip_shape, &m_persistentMipTensor, &mip_tensor));
m_persistentMipTensorInitialized = true;
}
else
{
// Reuse the persistent tensor, do not recompute mip levels.
mip_tensor = m_persistentMipTensor.AccessTensor(ctx);
computeMip = false;
}
// Set as output tensor as well.
ctx->set_output(1, *mip_tensor);
}
else
{
// Allocate an output tensor as usual.
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, mip_shape, &mip_tensor));
}
p.mip = mip_tensor->flat<float>().data(); // Pointer to data.
// Build mip levels if needed.
if (computeMip)
{
for (int i=1; i <= p.mipLevelMax; i++)
{
int2 ms = mipLevelSize(p, i);
int3 sz = make_int3(ms.x, ms.y, p.texDepth);
dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y);
dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1));
p.mipLevelOut = i;
void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 };
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream));
}
}
}
// Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
if (!cube_mode)
OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2"));
if ((p.channels & 3) == 0)
{
OP_REQUIRES(ctx, !((uintptr_t)p.tex & 15), errors::Internal("tex input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.out & 15), errors::Internal("out output tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.mip & 15), errors::Internal("mip output tensor not aligned to float4"));
}
if ((p.channels & 1) == 0)
{
OP_REQUIRES(ctx, !((uintptr_t)p.tex & 7), errors::Internal("tex input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.out & 7), errors::Internal("out output tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.mip & 7), errors::Internal("mip output tensor not aligned to float2"));
}
if (!cube_mode)
OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4"));
else
OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2"));
// Choose launch parameters for texture lookup kernel.
dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
// Choose kernel based on filter mode, cube mode, and datatype.
void* func_tbl[TEX_MODE_COUNT * 3 * 2] = {
(void*)TextureFwdKernelNearest1,
(void*)TextureFwdKernelNearest2,
(void*)TextureFwdKernelNearest4,
(void*)TextureFwdKernelLinear1,
(void*)TextureFwdKernelLinear2,
(void*)TextureFwdKernelLinear4,
(void*)TextureFwdKernelLinearMipmapNearest1,
(void*)TextureFwdKernelLinearMipmapNearest2,
(void*)TextureFwdKernelLinearMipmapNearest4,
(void*)TextureFwdKernelLinearMipmapLinear1,
(void*)TextureFwdKernelLinearMipmapLinear2,
(void*)TextureFwdKernelLinearMipmapLinear4,
(void*)TextureFwdKernelCubeNearest1,
(void*)TextureFwdKernelCubeNearest2,
(void*)TextureFwdKernelCubeNearest4,
(void*)TextureFwdKernelCubeLinear1,
(void*)TextureFwdKernelCubeLinear2,
(void*)TextureFwdKernelCubeLinear4,
(void*)TextureFwdKernelCubeLinearMipmapNearest1,
(void*)TextureFwdKernelCubeLinearMipmapNearest2,
(void*)TextureFwdKernelCubeLinearMipmapNearest4,
(void*)TextureFwdKernelCubeLinearMipmapLinear1,
(void*)TextureFwdKernelCubeLinearMipmapLinear2,
(void*)TextureFwdKernelCubeLinearMipmapLinear4,
};
// Function index.
int func_idx = p.filterMode;
if (cube_mode)
func_idx += TEX_MODE_COUNT;
func_idx = func_idx * 3 + channel_div_idx;
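// Worked example (assuming TEX_MODE_COUNT == 4, consistent with the 24-entry table above):
// cube map + linear-mipmap-linear (filter mode 3) with a channel count divisible by 4 gives
// func_idx = (3 + 4) * 3 + 2 = 23, i.e. TextureFwdKernelCubeLinearMipmapLinear4.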
// Launch kernel.
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
}
};
REGISTER_OP("TextureFwd")
.Input ("tex: float")
.Input ("uv: float")
.Output ("out: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int");
REGISTER_OP("TextureFwdMip")
.Input ("tex: float")
.Input ("uv: float")
.Input ("uv_da: float")
.Output ("out: float")
.Output ("mip: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int")
.Attr ("tex_const: int")
.Attr ("max_mip_level: int");
REGISTER_KERNEL_BUILDER(Name("TextureFwd") .Device(DEVICE_GPU), TextureFwdOp);
REGISTER_KERNEL_BUILDER(Name("TextureFwdMip").Device(DEVICE_GPU), TextureFwdOp);
//------------------------------------------------------------------------
// Gradient TensorFlow op.
struct TextureGradOp : public OpKernel
{
TextureKernelParams m_attribs;
TextureGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
parseOpAttributes(ctx, m_attribs);
}
void Compute(OpKernelContext* ctx)
{
TextureKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE);
// Get input.
const Tensor& tex = ctx->input(0);
const Tensor& uv = ctx->input(1);
const Tensor& dy = ctx->input(2);
const Tensor& uv_da = ctx->input(p.enableMip ? 3 : 2);
const Tensor& mip = ctx->input(p.enableMip ? 4 : 2);
// Extract input dimensions.
p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0;
p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0;
p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0;
p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0;
if (!cube_mode)
{
p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0;
p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0;
p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0;
}
else
{
p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0;
p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0;
p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0;
}
// Sanity checks.
if (!cube_mode)
{
OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]"));
OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]"));
}
else
{
OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode"));
OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode"));
OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode"));
}
OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv"));
OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.n && dy.dim_size(1) == p.imgHeight && dy.dim_size(2) == p.imgWidth && dy.dim_size(3) == p.channels, errors::InvalidArgument("dy must have shape [minibatch_size, height, width, channels]"));
if (p.enableMip)
{
if (!cube_mode)
OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]"));
else
OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"));
}
// Get input pointers.
p.tex = tex.flat<float>().data();
p.uv = uv.flat<float>().data();
p.dy = dy.flat<float>().data();
p.uvDA = p.enableMip ? uv_da.flat<float>().data() : 0;
p.mip = p.enableMip ? (float*)mip.flat<float>().data() : 0;
// Allocate output tensor for tex gradient.
Tensor* grad_tex_tensor = NULL;
TensorShape grad_tex_shape;
grad_tex_shape.AddDim(p.texDepth);
if (cube_mode)
grad_tex_shape.AddDim(6);
grad_tex_shape.AddDim(p.texHeight);
grad_tex_shape.AddDim(p.texWidth);
grad_tex_shape.AddDim(p.channels);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_tex_shape, &grad_tex_tensor));
p.gradTex = grad_tex_tensor->flat<float>().data();
// Allocate output tensor for uv gradient.
if (p.filterMode != TEX_MODE_NEAREST)
{
TensorShape grad_uv_shape;
Tensor* grad_uv_tensor = NULL;
grad_uv_shape.AddDim(p.n);
grad_uv_shape.AddDim(p.imgHeight);
grad_uv_shape.AddDim(p.imgWidth);
grad_uv_shape.AddDim(uv.dim_size(3));
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_uv_shape, &grad_uv_tensor));
p.gradUV = grad_uv_tensor->flat<float>().data();
// Allocate output tensor for uv_da gradient.
if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR)
{
Tensor* grad_uv_da_tensor = NULL;
grad_uv_shape.set_dim(3, uv_da.dim_size(3));
OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_uv_shape, &grad_uv_da_tensor));
p.gradUVDA = grad_uv_da_tensor->flat<float>().data();
}
}
// Choose kernel variants based on channel count.
int channel_div_idx = 0;
if (!(p.channels & 3))
channel_div_idx = 2; // Channel count divisible by 4.
else if (!(p.channels & 1))
channel_div_idx = 1; // Channel count divisible by 2.
// Mip-related setup.
Tensor grad_mip_tensor;
if (p.enableMip)
{
// Generate mip offsets.
int mipTotal = calculateMipInfo(ctx, p);
// Get space for temporary mip gradients.
TensorShape grad_mip_shape;
grad_mip_shape.AddDim(mipTotal);
OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, grad_mip_shape, &grad_mip_tensor));
p.gradTexMip = grad_mip_tensor.flat<float>().data();
// Clear mip gradients.
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTexMip, 0, mipTotal * sizeof(float), stream));
}
// Initialize texture gradients to zero.
int texBytes = p.texHeight * p.texWidth * p.texDepth * p.channels * sizeof(float);
if (cube_mode)
texBytes *= 6;
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTex, 0, texBytes, stream));
// Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
if (!cube_mode)
{
OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradUV & 7), errors::Internal("grad_uv output tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 15), errors::Internal("grad_uv_da output tensor not aligned to float4"));
}
else
{
OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 7), errors::Internal("grad_uv_da output tensor not aligned to float2"));
}
if ((p.channels & 3) == 0)
{
OP_REQUIRES(ctx, !((uintptr_t)p.tex & 15), errors::Internal("tex input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradTex & 15), errors::Internal("grad_tex output tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.dy & 15), errors::Internal("dy input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.mip & 15), errors::Internal("mip input tensor not aligned to float4"));
}
if ((p.channels & 1) == 0)
{
OP_REQUIRES(ctx, !((uintptr_t)p.tex & 7), errors::Internal("tex input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradTex & 7), errors::Internal("grad_tex output tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.mip & 7), errors::Internal("mip input tensor not aligned to float2"));
}
// Choose launch parameters for main gradient kernel.
void* args[] = {&p};
dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
void* func_tbl[TEX_MODE_COUNT * 2] = {
(void*)TextureGradKernelNearest,
(void*)TextureGradKernelLinear,
(void*)TextureGradKernelLinearMipmapNearest,
(void*)TextureGradKernelLinearMipmapLinear,
(void*)TextureGradKernelCubeNearest,
(void*)TextureGradKernelCubeLinear,
(void*)TextureGradKernelCubeLinearMipmapNearest,
(void*)TextureGradKernelCubeLinearMipmapLinear,
};
// Function index.
int func_idx = p.filterMode;
if (cube_mode)
func_idx += TEX_MODE_COUNT;
// Launch main gradient kernel.
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
// Launch kernel to pull gradients from mip levels.
if (p.enableMip)
{
dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight);
dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1));
int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float);
void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 };
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream));
}
}
};
REGISTER_OP("TextureGradNearest")
.Input ("tex: float")
.Input ("uv: float")
.Input ("dy: float")
.Output ("grad_tex: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int");
REGISTER_OP("TextureGradLinear")
.Input ("tex: float")
.Input ("uv: float")
.Input ("dy: float")
.Output ("grad_tex: float")
.Output ("grad_uv: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int");
REGISTER_OP("TextureGradLinearMipmapNearest")
.Input ("tex: float")
.Input ("uv: float")
.Input ("dy: float")
.Input ("uv_da: float")
.Input ("mip: float")
.Output ("grad_tex: float")
.Output ("grad_uv: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int")
.Attr ("max_mip_level: int");
REGISTER_OP("TextureGradLinearMipmapLinear")
.Input ("tex: float")
.Input ("uv: float")
.Input ("dy: float")
.Input ("uv_da: float")
.Input ("mip: float")
.Output ("grad_tex: float")
.Output ("grad_uv: float")
.Output ("grad_uv_da: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int")
.Attr ("max_mip_level: int");
REGISTER_KERNEL_BUILDER(Name("TextureGradNearest") .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinear") .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapNearest").Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapLinear") .Device(DEVICE_GPU), TextureGradOp);
//------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
from .ops import RasterizeGLContext, get_log_level, set_log_level, rasterize, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash
__all__ = ["RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"]
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import logging
import numpy as np
import os
import sys
import torch
import torch.utils.cpp_extension
#----------------------------------------------------------------------------
# C++/Cuda plugin compiler/loader.
_cached_plugin = None
def _get_plugin():
# Return cached plugin if already loaded.
global _cached_plugin
if _cached_plugin is not None:
return _cached_plugin
# Make sure we can find the necessary compiler and library binaries.
if os.name == 'nt':
lib_dir = os.path.dirname(__file__) + r"\..\lib"
def find_cl_path():
import glob
for edition in ['Professional', 'BuildTools', 'Community']:
paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True)
if paths:
return paths[0]
# If cl.exe is not on path, try to find it.
if os.system("where cl.exe >nul 2>nul") != 0:
cl_path = find_cl_path()
if cl_path is None:
raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
os.environ['PATH'] += ';' + cl_path
# Compiler options.
opts = ['-DNVDR_TORCH']
# Linker options.
if os.name == 'posix':
ldflags = ['-lGL', '-lGLEW']
elif os.name == 'nt':
libs = ['gdi32', 'glew32s', 'opengl32', 'user32']
ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]
# List of source files.
source_files = [
'../common/common.cpp',
'../common/rasterize.cu',
'../common/rasterize.cpp',
'../common/interpolate.cu',
'../common/texture.cu',
'../common/texture.cpp',
'../common/antialias.cu',
'torch_bindings.cpp',
'torch_rasterize.cpp',
'torch_interpolate.cpp',
'torch_texture.cpp',
'torch_antialias.cpp',
]
# Some containers set TORCH_CUDA_ARCH_LIST to old architectures that won't compile. We only need the architecture of the GPU installed in this machine.
os.environ['TORCH_CUDA_ARCH_LIST'] = ''
# Try to detect if a stray lock file is left in the cache directory and show a warning. This sometimes happens on Windows if the build is interrupted at just the right moment.
plugin_name = 'nvdiffrast_plugin'
try:
lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory(plugin_name, False), 'lock')
if os.path.exists(lock_fn):
logging.getLogger('nvdiffrast').warning("Lock file exists in build directory: '%s'" % lock_fn)
except:
pass
# Compile and load.
source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files]
torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=opts, extra_cuda_cflags=opts, extra_ldflags=ldflags, with_cuda=True, verbose=False)
# Import, cache, and return the compiled module.
import nvdiffrast_plugin
_cached_plugin = nvdiffrast_plugin
return _cached_plugin
#----------------------------------------------------------------------------
# Log level.
#----------------------------------------------------------------------------
def get_log_level():
'''Get current log level.
Returns:
Current log level in nvdiffrast. See `set_log_level()` for possible values.
'''
return _get_plugin().get_log_level()
def set_log_level(level):
'''Set log level.
Log levels follow the convention on the C++ side of Torch:
0 = Info,
1 = Warning,
2 = Error,
3 = Fatal.
The default log level is 1.
Args:
level: New log level as integer. Internal nvdiffrast messages of this
severity or higher will be printed, while messages of lower
severity will be silent.
'''
_get_plugin().set_log_level(level)
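# Example (illustrative): set_log_level(0) also prints informational messages,
# whereas the default set_log_level(1) shows warnings and above only.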
#----------------------------------------------------------------------------
# GL State wrapper.
#----------------------------------------------------------------------------
class RasterizeGLContext:
def __init__(self, output_db=True, mode='automatic'):
'''Create a new OpenGL rasterizer context.
Creating an OpenGL context is a slow operation so you should reuse the same
context in all calls to `rasterize()` on the same CPU thread. The OpenGL context
is deleted when the object is destroyed.
Args:
output_db (bool): Compute and output image-space derivatives of barycentrics.
mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'.
Returns:
The newly created OpenGL rasterizer context.
'''
assert output_db is True or output_db is False
assert mode in ['automatic', 'manual']
self.output_db = output_db
self.mode = mode
self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic')
def set_context(self):
'''Set (activate) OpenGL context in the current CPU thread.
Only available if context was created in manual mode.
'''
assert self.mode == 'manual'
self.cpp_wrapper.set_context()
def release_context(self):
'''Release (deactivate) currently active OpenGL context.
Only available if context was created in manual mode.
'''
assert self.mode == 'manual'
self.cpp_wrapper.release_context()
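# Usage sketch (hypothetical, illustrative only): a context is typically created once and
# reused for all rasterize() calls on the same CPU thread.
#
#   glctx = RasterizeGLContext(output_db=True)                       # automatic mode (default)
#   manual_ctx = RasterizeGLContext(output_db=False, mode='manual')
#   manual_ctx.set_context()                                         # activate on this thread
#   ...                                                              # issue rasterize() calls while the context is active
#   manual_ctx.release_context()                                     # deactivate when done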
#----------------------------------------------------------------------------
# Rasterize.
#----------------------------------------------------------------------------
class _rasterize_func(torch.autograd.Function):
@staticmethod
def forward(ctx, glctx, pos, tri, resolution, ranges, grad_db):
out, out_db = _get_plugin().rasterize_fwd(glctx.cpp_wrapper, pos, tri, resolution, ranges)
ctx.save_for_backward(pos, tri, out)
ctx.saved_grad_db = grad_db
return out, out_db
@staticmethod
def backward(ctx, dy, ddb):
pos, tri, out = ctx.saved_variables
if ctx.saved_grad_db:
g_pos = _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb)
else:
g_pos = _get_plugin().rasterize_grad(pos, tri, out, dy)
return None, g_pos, None, None, None, None
# Op wrapper.
def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
'''Rasterize triangles.
All input tensors must be contiguous and reside in GPU memory except for
the `ranges` tensor that, if specified, has to reside in CPU memory. The
output tensors will be contiguous and reside in GPU memory.
Args:
glctx: OpenGL context of type `RasterizeGLContext`.
pos: Vertex position tensor with dtype `torch.float32`. To enable range
mode, this tensor should have a 2D shape [num_vertices, 4]. To enable
instanced mode, use a 3D shape [minibatch_size, num_vertices, 4].
tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`.
resolution: Output resolution as integer tuple (height, width).
ranges: In range mode, tensor with shape [minibatch_size, 2] and dtype
`torch.int32`, specifying start indices and counts into `tri`.
Ignored in instanced mode.
grad_db: Propagate gradients of image-space derivatives of barycentrics
into `pos` in backward pass. Ignored if OpenGL context was
not configured to output image-space derivatives.
Returns:
A tuple of two tensors. The first output tensor has shape [minibatch_size,
height, width, 4] and contains the main rasterizer output in order (u, v, z/w,
triangle_id). If the OpenGL context was configured to output image-space
derivatives of barycentrics, the second output tensor will also have shape
[minibatch_size, height, width, 4] and contain said derivatives in order
(du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape
[minibatch_size, height, width, 0].
'''
assert isinstance(glctx, RasterizeGLContext)
assert grad_db is True or grad_db is False
grad_db = grad_db and glctx.output_db
# Sanitize inputs.
assert isinstance(pos, torch.Tensor) and isinstance(tri, torch.Tensor)
resolution = tuple(resolution)
if ranges is None:
ranges = torch.empty(size=(0, 2), dtype=torch.int32, device='cpu')
else:
assert isinstance(ranges, torch.Tensor)
# Instantiate the function.
return _rasterize_func.apply(glctx, pos, tri, resolution, ranges, grad_db)
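# Usage sketch (hypothetical tensors; shapes and values are illustrative only):
#
#   glctx = RasterizeGLContext(output_db=True)
#   pos = torch.tensor([[[-0.8, -0.8, 0.0, 1.0],
#                        [ 0.8, -0.8, 0.0, 1.0],
#                        [ 0.0,  0.8, 0.0, 1.0]]], dtype=torch.float32, device='cuda')  # [1, 3, 4], instanced mode
#   tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device='cuda')                   # [1, 3]
#   out, out_db = rasterize(glctx, pos, tri, resolution=(256, 256))
#   # out: [1, 256, 256, 4] holding (u, v, z/w, triangle_id);
#   # out_db: (du/dX, du/dY, dv/dX, dv/dY) since output_db=True.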
#----------------------------------------------------------------------------
# Interpolate.
#----------------------------------------------------------------------------
# Output pixel differentials for at least some attributes.
class _interpolate_func_da(torch.autograd.Function):
@staticmethod
def forward(ctx, attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list):
out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list)
ctx.save_for_backward(attr, rast, tri, rast_db)
ctx.saved_misc = diff_attrs_all, diff_attrs_list
return out, out_da
@staticmethod
def backward(ctx, dy, dda):
attr, rast, tri, rast_db = ctx.saved_variables
diff_attrs_all, diff_attrs_list = ctx.saved_misc
g_attr, g_rast, g_rast_db = _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list)
return g_attr, g_rast, None, g_rast_db, None, None
# No pixel differential for any attribute.
class _interpolate_func(torch.autograd.Function):
@staticmethod
def forward(ctx, attr, rast, tri):
out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri)
ctx.save_for_backward(attr, rast, tri)
return out, out_da
@staticmethod
def backward(ctx, dy, _):
attr, rast, tri = ctx.saved_variables
g_attr, g_rast = _get_plugin().interpolate_grad(attr, rast, tri, dy)
return g_attr, g_rast, None
# Op wrapper.
def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None):
"""Interpolate vertex attributes.
All input tensors must be contiguous and reside in GPU memory. The output tensors
will be contiguous and reside in GPU memory.
Args:
attr: Attribute tensor with dtype `torch.float32`.
Shape is [num_vertices, num_attributes] in range mode, or
[minibatch_size, num_vertices, num_attributes] in instanced mode.
Broadcasting is supported along the minibatch axis.
rast: Main output tensor from `rasterize()`.
tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`.
rast_db: (Optional) Tensor containing image-space derivatives of barycentrics,
i.e., the second output tensor from `rasterize()`. Enables computing
image-space derivatives of attributes.
diff_attrs: (Optional) List of attribute indices for which image-space
derivatives are to be computed. Special value 'all' is equivalent
to list [0, 1, ..., num_attributes - 1].
Returns:
A tuple of two tensors. The first output tensor contains interpolated
attributes and has shape [minibatch_size, height, width, num_attributes].
If `rast_db` and `diff_attrs` were specified, the second output tensor contains
the image-space derivatives of the selected attributes and has shape
[minibatch_size, height, width, 2 * len(diff_attrs)]. The derivatives of the
first selected attribute A will be on channels 0 and 1 as (dA/dX, dA/dY), etc.
Otherwise, the second output tensor will be an empty tensor with shape
[minibatch_size, height, width, 0].
"""
# Sanitize the list of pixel differential attributes.
if diff_attrs is None:
diff_attrs = []
elif diff_attrs != 'all':
diff_attrs = np.asarray(diff_attrs, np.int32)
assert len(diff_attrs.shape) == 1
diff_attrs = diff_attrs.tolist()
diff_attrs_all = int(diff_attrs == 'all')
diff_attrs_list = [] if diff_attrs_all else diff_attrs
# Check inputs.
assert all(isinstance(x, torch.Tensor) for x in (attr, rast, tri))
if diff_attrs:
assert isinstance(rast_db, torch.Tensor)
# Choose stub.
if diff_attrs:
return _interpolate_func_da.apply(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list)
else:
return _interpolate_func.apply(attr, rast, tri)
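# Usage sketch (hypothetical tensors, continuing the rasterize() sketch above):
#
#   col = torch.rand([1, 3, 3], dtype=torch.float32, device='cuda')   # per-vertex RGB, instanced mode
#   rgb, _ = interpolate(col, out, tri)                               # rgb: [1, 256, 256, 3]
#   # With image-space derivatives of all attributes:
#   rgb, rgb_da = interpolate(col, out, tri, rast_db=out_db, diff_attrs='all')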
#----------------------------------------------------------------------------
# Texture
#----------------------------------------------------------------------------
# Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled.
class _texture_func_mip(torch.autograd.Function):
@staticmethod
def forward(ctx, filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum):
out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
ctx.save_for_backward(tex, uv, uv_da)
ctx.saved_misc = filter_mode, mip, filter_mode_enum, boundary_mode_enum
return out
@staticmethod
def backward(ctx, dy):
tex, uv, uv_da = ctx.saved_variables
filter_mode, mip, filter_mode_enum, boundary_mode_enum = ctx.saved_misc
if filter_mode == 'linear-mipmap-linear':
g_tex, g_uv, g_uv_da = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, g_uv_da, None, None, None
else: # linear-mipmap-nearest
g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, None, None, None, None
# Linear and nearest: Mipmaps disabled.
class _texture_func(torch.autograd.Function):
@staticmethod
def forward(ctx, filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum):
out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum)
ctx.save_for_backward(tex, uv)
ctx.saved_misc = filter_mode, filter_mode_enum, boundary_mode_enum
return out
@staticmethod
def backward(ctx, dy):
tex, uv = ctx.saved_variables
filter_mode, filter_mode_enum, boundary_mode_enum = ctx.saved_misc
if filter_mode == 'linear':
g_tex, g_uv = _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, None, None
else: # nearest
g_tex = _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum)
return None, g_tex, None, None, None
# Op wrapper.
def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
"""Perform texture sampling.
All input tensors must be contiguous and reside in GPU memory. The output tensor
will be contiguous and reside in GPU memory.
Args:
tex: Texture tensor with dtype `torch.float32`. For 2D textures, must have shape
[minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures,
must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where
tex_width and tex_height are equal. Note that `boundary_mode` must also be set
to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis.
uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture,
must have shape [minibatch_size, height, width, 2]. When sampling a cube map
texture, must have shape [minibatch_size, height, width, 3].
uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates.
Must have the same shape as `uv` except for the last dimension, which must be
twice as long.
mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call. If not
specified, the mipmap stack is constructed internally and discarded afterwards.
filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
selects 'linear' if `uv_da` is not specified and 'linear-mipmap-linear' if it is,
i.e., the highest-quality mode available given the image-space derivatives of
the texture coordinates.
boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a
cube map, this must be set to 'cube'. The default mode 'wrap' takes the fractional
part of the texture coordinates. Mode 'clamp' clamps texture coordinates to the
centers of the boundary texels. Mode 'zero' virtually extends the texture with
all-zero values in all directions.
max_mip_level: If specified, limits the number of mipmaps constructed and used in mipmap-based
filter modes.
Returns:
A tensor containing the results of the texture sampling with shape
[minibatch_size, height, width, tex_channels].
"""
# Default filter mode.
if filter_mode == 'auto':
filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear'
# Sanitize inputs.
if max_mip_level is None:
max_mip_level = -1
else:
max_mip_level = int(max_mip_level)
assert max_mip_level >= 0
# Check inputs.
assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor)
if 'mipmap' in filter_mode:
assert isinstance(uv_da, torch.Tensor)
# If mipping disabled via max level=0, we may as well use simpler filtering internally.
if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']:
filter_mode = 'linear'
# Convert filter mode to internal enumeration.
filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3}
filter_mode_enum = filter_mode_dict[filter_mode]
# Convert boundary mode to internal enumeration.
boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3}
boundary_mode_enum = boundary_mode_dict[boundary_mode]
# Construct a mipmap if necessary.
if 'mipmap' in filter_mode:
if mip is not None:
assert isinstance(mip, _get_plugin().TextureMipWrapper)
else:
mip = _get_plugin().texture_construct_mip(tex, max_mip_level, boundary_mode == 'cube')
# Choose stub.
if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest':
return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
else:
return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum)
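# Usage sketch (hypothetical tensors; shapes are illustrative only):
#
#   tex = torch.rand([1, 128, 128, 3], dtype=torch.float32, device='cuda')   # 2D texture
#   uv  = torch.rand([1, 256, 256, 2], dtype=torch.float32, device='cuda')   # per-pixel coordinates
#   color = texture(tex, uv)                                                 # 'auto' -> 'linear' without uv_da
#   # With mipmapping, where uv_da would come from e.g. interpolate(..., diff_attrs=...):
#   color = texture(tex, uv, uv_da=uv_da, filter_mode='linear-mipmap-linear')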
# Mipmap precalculation for cases where the texture stays constant.
def texture_construct_mip(tex, max_mip_level=None, cube_mode=False):
"""Construct a mipmap stack for a texture.
This function can be used for constructing a mipmap stack for a texture that is known to remain
constant. This avoids reconstructing it every time `texture()` is called.
Args:
tex: Texture tensor with the same constraints as in `texture()`.
max_mip_level: If specified, limits the number of mipmaps constructed.
cube_mode: Must be set to True if `tex` specifies a cube map texture.
Returns:
An opaque object containing the mipmap stack. This can be supplied in a call to `texture()`
in the `mip` argument.
"""
assert isinstance(tex, torch.Tensor)
assert cube_mode is True or cube_mode is False
if max_mip_level is None:
max_mip_level = -1
else:
max_mip_level = int(max_mip_level)
assert max_mip_level >= 0
return _get_plugin().texture_construct_mip(tex, max_mip_level, cube_mode)
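# Usage sketch (hypothetical; assumes the same `tex` is sampled many times):
#
#   mip = texture_construct_mip(tex, max_mip_level=4)
#   for uv, uv_da in batches:    # reuse the precomputed stack on every iteration
#       color = texture(tex, uv, uv_da=uv_da, mip=mip, filter_mode='linear-mipmap-linear')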
#----------------------------------------------------------------------------
# Antialias.
#----------------------------------------------------------------------------
class _antialias_func(torch.autograd.Function):
@staticmethod
def forward(ctx, color, rast, pos, tri, topology_hash, pos_gradient_boost):
out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, topology_hash)
ctx.save_for_backward(color, rast, pos, tri)
ctx.saved_misc = pos_gradient_boost, work_buffer
return out
@staticmethod
def backward(ctx, dy):
color, rast, pos, tri = ctx.saved_variables
pos_gradient_boost, work_buffer = ctx.saved_misc
g_color, g_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer)
if pos_gradient_boost != 1.0:
g_pos = g_pos * pos_gradient_boost
return g_color, None, g_pos, None, None, None
# Op wrapper.
def antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0):
"""Perform antialiasing.
All input tensors must be contiguous and reside in GPU memory. The output tensor
will be contiguous and reside in GPU memory.
Args:
color: Input image to antialias with shape [minibatch_size, height, width, num_channels].
rast: Main output tensor from `rasterize()`.
pos: Vertex position tensor used in the rasterization operation.
tri: Triangle tensor used in the rasterization operation.
topology_hash: (Optional) Preconstructed topology hash for the triangle tensor. If not
specified, the topology hash is constructed internally and discarded afterwards.
pos_gradient_boost: (Optional) Multiplier for gradients propagated to `pos`.
Returns:
A tensor containing the antialiased image with the same shape as `color` input tensor.
"""
# Check inputs.
assert all(isinstance(x, torch.Tensor) for x in (color, rast, pos, tri))
# Construct topology hash unless provided by user.
if topology_hash is not None:
assert isinstance(topology_hash, _get_plugin().TopologyHashWrapper)
else:
topology_hash = _get_plugin().antialias_construct_topology_hash(tri)
# Instantiate the function.
return _antialias_func.apply(color, rast, pos, tri, topology_hash, pos_gradient_boost)
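# Usage sketch (hypothetical tensors, continuing the sketches above):
#
#   aa = antialias(rgb, out, pos, tri)    # rgb from interpolate(), out/pos/tri from rasterize()
#   # Optionally boost position gradients flowing out of the antialiasing op:
#   aa = antialias(rgb, out, pos, tri, pos_gradient_boost=2.0)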
# Topology hash precalculation for cases where the triangle array stays constant.
def antialias_construct_topology_hash(tri):
"""Construct a topology hash for a triangle tensor.
This function can be used for constructing a topology hash for a triangle tensor that is
known to remain constant. This avoids reconstructing it every time `antialias()` is called.
Args:
tri: Triangle tensor with shape [num_triangles, 3]. Must be contiguous and reside in
GPU memory.
Returns:
An opaque object containing the topology hash. This can be supplied in a call to
`antialias()` in the `topology_hash` argument.
"""
assert isinstance(tri, torch.Tensor)
return _get_plugin().antialias_construct_topology_hash(tri)
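# Usage sketch (hypothetical; assumes the triangle tensor stays constant across iterations):
#
#   topo = antialias_construct_topology_hash(tri)
#   for color, rast, pos in frames:
#       aa = antialias(color, rast, pos, tri, topology_hash=topo)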
#----------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include "../common/common.h"
#include "../common/antialias.h"
//------------------------------------------------------------------------
// Kernel prototypes.
void AntialiasFwdMeshKernel (const AntialiasKernelParams p);
void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p);
void AntialiasFwdAnalysisKernel (const AntialiasKernelParams p);
void AntialiasGradKernel (const AntialiasKernelParams p);
//------------------------------------------------------------------------
// Topology hash construction.
TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
// Check inputs.
NVDR_CHECK_DEVICE(tri);
NVDR_CHECK_CONTIGUOUS(tri);
NVDR_CHECK_I32(tri);
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
// Fill in kernel parameters.
p.numTriangles = tri.size(0);
p.numVertices = 0x7fffffff; // Let's not require vertex positions just to enable an error check.
p.tri = tri.data_ptr<int>();
// Choose hash allocation size: a power of two, at least 64 and at least numTriangles.
p.allocTriangles = p.allocTriangles < 64 ? 64 : p.allocTriangles;
while (p.allocTriangles < p.numTriangles)
p.allocTriangles <<= 1; // Must be power of two.
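// Example: numTriangles = 100 -> allocTriangles = 128 (smallest power of two that is >= 64 and >= 100).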
// Construct the hash tensor and get pointer.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
torch::Tensor ev_hash = torch::zeros({p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * 4}, opts);
p.evHash = (uint4*)(ev_hash.data_ptr<int>());
// Check alignment.
NVDR_CHECK(!((uintptr_t)p.evHash & 15), "ev_hash internal tensor not aligned to int4");
// Populate the hash.
void* args[] = {&p};
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
// Return.
TopologyHashWrapper hash_wrap;
hash_wrap.ev_hash = ev_hash;
return hash_wrap;
}
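For reference, the hash tensor sizing above can be mirrored in a few lines of Python. This is an illustrative sketch only, not part of the source; AA_HASH_ELEMENTS_PER_TRIANGLE is taken as a parameter because its value is defined in common/antialias.h rather than here.
def ev_hash_num_ints(num_triangles, hash_elements_per_triangle):
    # Allocation starts at 64 triangles and doubles until it covers the mesh,
    # so the hash size is always a power of two times the per-triangle slot count.
    alloc = 64
    while alloc < num_triangles:
        alloc <<= 1
    return alloc * hash_elements_per_triangle * 4  # int32 words; 4 per uint4 slot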
//------------------------------------------------------------------------
// Forward op.
std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
torch::Tensor& topology_hash = topology_hash_wrap.ev_hash; // Unwrap.
// Check inputs.
NVDR_CHECK_DEVICE(color, rast, pos, tri, topology_hash);
NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, topology_hash);
NVDR_CHECK_F32(color, rast, pos);
NVDR_CHECK_I32(tri, topology_hash);
// Sanity checks.
NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape [>0, >0, >0, >0]");
NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape [>0, >0, >0, 4]");
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and rast inputs must have same spatial dimensions");
if (p.instance_mode)
{
NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]");
NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast, pos");
}
else
{
NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]");
NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast");
}
// Extract input dimensions.
p.numVertices = pos.size(p.instance_mode ? 1 : 0);
p.numTriangles = tri.size(0);
p.n = color.size(0);
p.height = color.size(1);
p.width = color.size(2);
p.channels = color.size(3);
// Get input pointers.
p.color = color.data_ptr<float>();
p.rasterOut = rast.data_ptr<float>();
p.tri = tri.data_ptr<int>();
p.pos = pos.data_ptr<float>();
p.evHash = (uint4*)(topology_hash.data_ptr<int>());
// Misc parameters.
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
p.allocTriangles = topology_hash.size(0) / (4 * AA_HASH_ELEMENTS_PER_TRIANGLE);
// Allocate output tensors.
torch::Tensor out = color.detach().clone(); // Use color as base.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor work_buffer = torch::empty({p.n * p.width * p.height * 8 + 4}, opts); // 8 ints (two int4 work items) per pixel, plus one int4 of work counters.
p.output = out.data_ptr<float>();
p.workBuffer = (int4*)(work_buffer.data_ptr<float>());
// Clear the work counters.
NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream));
// Verify that buffers are aligned to allow float2/float4 operations.
NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.rasterOut & 7), "raster_out input tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4");
NVDR_CHECK(!((uintptr_t)p.evHash & 15), "topology_hash internal tensor not aligned to int4");
// Choose launch parameters for the discontinuity finder kernel and launch.
void* args[] = {&p};
dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n);
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream));
// Determine optimum block size for the persistent analysis kernel and launch.
int device = 0;
int numCTA = 0;
int numSM = 0;
NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device));
NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0));
NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
// Return results.
return std::tuple<torch::Tensor, torch::Tensor>(out, work_buffer);
}
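The temporary work_buffer allocated above is the main memory overhead of the forward op. The sketch below (illustrative Python, not part of the source) simply restates that arithmetic: eight float32 words per pixel for up to two int4 work items, plus one int4 of counters.
def antialias_work_buffer_bytes(minibatch, height, width):
    # Matches torch::empty({n * w * h * 8 + 4}) of float32 in antialias_fwd().
    num_floats = minibatch * height * width * 8 + 4
    return num_floats * 4  # float32 is 4 bytes

# Example: a 4 x 512 x 512 minibatch needs antialias_work_buffer_bytes(4, 512, 512)
# == 33554448 bytes, i.e. about 32 MiB of temporary storage.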
//------------------------------------------------------------------------
// Gradient op.
std::tuple<torch::Tensor, torch::Tensor> antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
// Check inputs.
NVDR_CHECK_DEVICE(color, rast, pos, tri, dy, work_buffer);
NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, work_buffer);
NVDR_CHECK_F32(color, rast, pos, dy, work_buffer);
NVDR_CHECK_I32(tri);
// Sanity checks.
NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) > 0 && dy.size(2) > 0 && dy.size(3) > 0, "dy must have shape [>0, >0, >0, >0]");
NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape [>0, >0, >0, >0]");
NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "raster_out must have shape [>0, >0, >0, 4]");
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and raster_out inputs must have same spatial dimensions");
NVDR_CHECK(color.size(1) == dy.size(1) && color.size(2) == dy.size(2) && color.size(3) == dy.size(3), "color and dy inputs must have same dimensions");
if (p.instance_mode)
{
NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]");
NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out, pos");
NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs dy, color, raster_out, pos");
}
else
{
NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]");
NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out");
NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0), "minibatch size mismatch between inputs dy, color, raster_out");
}
// Extract input dimensions.
p.numVertices = pos.size(p.instance_mode ? 1 : 0);
p.numTriangles = tri.size(0);
p.n = color.size(0);
p.height = color.size(1);
p.width = color.size(2);
p.channels = color.size(3);
// Ensure dy is contiguous.
torch::Tensor dy_ = dy.contiguous();
// Get input pointers.
p.color = color.data_ptr<float>();
p.rasterOut = rast.data_ptr<float>();
p.tri = tri.data_ptr<int>();
p.pos = pos.data_ptr<float>();
p.dy = dy_.data_ptr<float>();
p.workBuffer = (int4*)(work_buffer.data_ptr<float>());
// Misc parameters.
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
// Allocate output tensors.
torch::Tensor grad_color = dy_.detach().clone(); // Use dy as base.
torch::Tensor grad_pos = torch::zeros_like(pos);
p.gradColor = grad_color.data_ptr<float>();
p.gradPos = grad_pos.data_ptr<float>();
// Clear gradient kernel work counter.
NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream));
// Verify that buffers are aligned to allow float2/float4 operations.
NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4");
// Determine optimum block size for the gradient kernel and launch.
void* args[] = {&p};
int device = 0;
int numCTA = 0;
int numSM = 0;
NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device));
NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0));
NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
// Return results.
return std::tuple<torch::Tensor, torch::Tensor>(grad_color, grad_pos);
}
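The two ops above are tied together on the Python side by the `_antialias_func` autograd class in ops.py. The sketch below is a simplified, hypothetical illustration of that wiring only, omitting pos_gradient_boost and other details of the real implementation; `_plugin` stands for the compiled extension module returned by `_get_plugin()`.
import torch

class _AntialiasSketch(torch.autograd.Function):
    @staticmethod
    def forward(ctx, color, rast, pos, tri, topology_hash):
        out, work_buffer = _plugin.antialias_fwd(color, rast, pos, tri, topology_hash)
        # The work buffer produced by the forward pass is what antialias_grad consumes.
        ctx.save_for_backward(color, rast, pos, tri, work_buffer)
        return out

    @staticmethod
    def backward(ctx, dy):
        color, rast, pos, tri, work_buffer = ctx.saved_tensors
        g_color, g_pos = _plugin.antialias_grad(color, rast, pos, tri, dy, work_buffer)
        # Only color and pos receive gradients; rast, tri and the hash do not.
        return g_color, None, g_pos, None, None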
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include <tuple>
//------------------------------------------------------------------------
// Op prototypes. Return type macros for readability.
#define OP_RETURN_T torch::Tensor
#define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
#define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges);
OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
OP_RETURN_T rasterize_grad_db (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb);
OP_RETURN_TT interpolate_fwd (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri);
OP_RETURN_TT interpolate_fwd_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
OP_RETURN_TT interpolate_grad (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy);
OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode);
OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode);
OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri);
OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash);
OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer);
//------------------------------------------------------------------------
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// State classes.
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool>())
.def("set_context", &RasterizeGLStateWrapper::setContext)
.def("release_context", &RasterizeGLStateWrapper::releaseContext);
pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper");
pybind11::class_<TopologyHashWrapper>(m, "TopologyHashWrapper");
// Plumbing to torch/c10 logging system.
m.def("get_log_level", [](void) { return FLAGS_caffe2_log_level; }, "get log level");
m.def("set_log_level", [](int level){ FLAGS_caffe2_log_level = level; }, "set log level");
// Ops.
m.def("rasterize_fwd", &rasterize_fwd, "rasterize forward op");
m.def("rasterize_grad", &rasterize_grad, "rasterize gradient op ignoring db gradients");
m.def("rasterize_grad_db", &rasterize_grad_db, "rasterize gradient op with db gradients");
m.def("interpolate_fwd", &interpolate_fwd, "interpolate forward op with attribute derivatives");
m.def("interpolate_fwd_da", &interpolate_fwd_da, "interpolate forward op without attribute derivatives");
m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op with attribute derivatives");
m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op without attribute derivatives");
m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction");
m.def("texture_fwd", &texture_fwd, "texture forward op with mipmapping and texcoord derivatives");
m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op without mipmapping and texcoord derivatives");
m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode");
m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode");
m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode");
m.def("texture_grad_linear_mipmap_linear", &texture_grad_linear_mipmap_linear, "texture gradient op in linear-mipmap-linear mode");
m.def("antialias_construct_topology_hash", &antialias_construct_topology_hash, "antialias topology hash construction");
m.def("antialias_fwd", &antialias_fwd, "antialias forward op");
m.def("antialias_grad", &antialias_grad, "antialias gradient op");
}
//------------------------------------------------------------------------
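As a final usage sketch (illustrative, not from the source): the logging hooks registered in the module above let callers adjust torch/c10 log verbosity around nvdiffrast calls. `plugin` here stands for the loaded extension module, e.g. the object returned by `_get_plugin()` in ops.py, and the integer levels follow the usual glog-style severities.
def silence_info_logs(plugin):
    # Raise the minimum logged severity to ERROR (2); return the old level so
    # the caller can restore it later with plugin.set_log_level(previous).
    previous = plugin.get_log_level()
    plugin.set_log_level(2)
    return previous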