Commit 1f95925c authored by Samuli Laine

Initial commit

# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
__version__ = '0.2.0'
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "antialias.h"
//------------------------------------------------------------------------
// Helpers.
#define F32_MAX (3.402823466e+38f)
static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; }
static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); }
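// rational_gt() evaluates n0/d0 > n1/d1 without dividing: it cross-multiplies and flips
// the comparison when the denominators have opposite signs. max_idx3() below uses it to
// return the index (0..2) of the largest ratio n_i/d_i.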
static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2)
{
bool g10 = rational_gt(n1, n0, d1, d0);
bool g20 = rational_gt(n2, n0, d2, d0);
bool g21 = rational_gt(n2, n1, d2, d1);
if (g20 && g21) return 2;
if (g10) return 1;
return 0;
}
//------------------------------------------------------------------------
// Format of antialiasing work items stored in work buffer. Usually accessed directly as int4.
struct AAWorkItem
{
enum
{
EDGE_MASK = 3, // Edge index in lowest bits.
FLAG_DOWN_BIT = 2, // Down instead of right.
FLAG_TRI1_BIT = 3, // Edge is from other pixel's triangle.
};
int px, py; // Pixel x, y.
unsigned int pz_flags; // High 16 bits = pixel z, low 16 bits = edge index and flags.
float alpha; // Antialiasing alpha value. Zero if no AA.
};
//------------------------------------------------------------------------
// Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html
#define JENKINS_MAGIC (0x9e3779b9u)
static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c)
{
a -= b; a -= c; a ^= (c>>13);
b -= c; b -= a; b ^= (a<<8);
c -= a; c -= b; c ^= (b>>13);
a -= b; a -= c; a ^= (c>>12);
b -= c; b -= a; b ^= (a<<16);
c -= a; c -= b; c ^= (b>>5);
a -= b; a -= c; a ^= (c>>3);
b -= c; b -= a; b ^= (a<<10);
c -= a; c -= b; c ^= (b>>15);
}
// Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip.
class HashIndex
{
public:
__device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key)
{
m_mask = p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE - 1;
m_idx = (uint32_t)(key & 0xffffffffu);
m_skip = (uint32_t)(key >> 32);
uint32_t dummy = JENKINS_MAGIC;
jenkins_mix(m_idx, m_skip, dummy);
m_idx &= m_mask;
m_skip &= m_mask;
m_skip |= 1;
}
__device__ __forceinline__ int get(void) const { return m_idx; }
__device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; }
private:
uint32_t m_idx, m_skip, m_mask;
};
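// Each hash entry is a uint4: .x/.y hold the 64-bit edge key, .z/.w hold up to two
// opposite-vertex values (stored +1 so that zero means empty). hash_insert() claims a
// slot by compare-and-swapping the key, then records the value in the first free of the
// two value fields; only the first two values inserted for an edge are kept.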
static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v)
{
HashIndex idx(p, key);
while(1)
{
uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key);
if (prev == 0 || prev == key)
break;
idx.next();
}
int* q = (int*)&p.evHash[idx.get()];
int a = atomicCAS(q+2, 0, v);
if (a != 0 && a != v)
atomicCAS(q+3, 0, v);
}
static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key)
{
HashIndex idx(p, key);
while(1)
{
uint4 entry = p.evHash[idx.get()];
uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32);
if (k == key || k == 0)
return make_int2((int)entry.z, (int)entry.w);
idx.next();
}
}
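// The edge key packs the two endpoint indices, incremented by one and put in canonical
// (min, max) order, into a single 64-bit value; the stored value is the index of the
// vertex opposite the edge in the inserting triangle, also incremented by one.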
static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn)
{
if (va == vb)
return;
uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
uint64_t v1 = (uint32_t)max(va, vb) + 1;
uint64_t vk = v0 | (v1 << 32); // hash key
hash_insert(p, vk, vn + 1);
}
static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr)
{
if (va == vb)
return -1;
uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
uint64_t v1 = (uint32_t)max(va, vb) + 1;
uint64_t vk = v0 | (v1 << 32); // hash key
int2 vn = hash_find(p, vk) - 1;
if (vn.x == vr) return vn.y;
if (vn.y == vr) return vn.x;
return -1;
}
//------------------------------------------------------------------------
// Mesh analysis kernel.
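// One thread per triangle: registers each of the triangle's three edges in the
// edge-vertex hash together with the opposing vertex, so that the analysis kernel can
// later query the vertex opposite a given edge in the neighboring triangle and thereby
// detect silhouette edges.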
__global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= p.numTriangles)
return;
int v0 = p.tri[idx * 3 + 0];
int v1 = p.tri[idx * 3 + 1];
int v2 = p.tri[idx * 3 + 2];
if (v0 < 0 || v0 >= p.numVertices ||
v1 < 0 || v1 >= p.numVertices ||
v2 < 0 || v2 >= p.numVertices)
return;
if (v0 == v1 || v1 == v2 || v2 == v0)
return;
evhash_insert_vertex(p, v1, v2, v0);
evhash_insert_vertex(p, v2, v0, v1);
evhash_insert_vertex(p, v0, v1, v2);
}
//------------------------------------------------------------------------
// Discontinuity finder kernel.
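// One thread per pixel: compares the pixel's triangle ID against its right and down
// neighbors and appends a work item to p.workBuffer for each mismatch. A shared-memory
// counter coalesces the global work-counter update to one atomic per CTA.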
__global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p)
{
// Calculate pixel position.
int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x;
int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.width || py >= p.height || pz >= p.n)
return;
// Pointer to our TriIdx and fetch.
int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3;
float tri0 = p.rasterOut[pidx0];
// Look right, clamp at edge.
int pidx1 = pidx0;
if (px < p.width - 1)
pidx1 += 4;
float tri1 = p.rasterOut[pidx1];
// Look down, clamp at edge.
int pidx2 = pidx0;
if (py < p.height - 1)
pidx2 += p.width << 2;
float tri2 = p.rasterOut[pidx2];
// Determine amount of work.
int count = 0;
if (tri1 != tri0) count = 1;
if (tri2 != tri0) count += 1;
if (!count)
return; // Exit warp.
// Coalesce work counter update to once per CTA.
__shared__ int s_temp;
s_temp = 0;
__syncthreads();
int idx = atomicAdd(&s_temp, count);
__syncthreads();
if (idx == 0)
{
int base = atomicAdd(&p.workBuffer[0].x, s_temp);
s_temp = base + 1; // don't clobber the counters in first slot.
}
__syncthreads();
idx += s_temp;
// Write to memory.
if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0);
if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0);
}
//------------------------------------------------------------------------
// Forward analysis kernel.
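// Persistent-threads loop over the work items produced above. For each discontinuity,
// the kernel picks the closer (or the only foreground) triangle of the two pixels, tests
// whether one of its silhouette edges crosses the segment between the two pixel centers,
// and if so blends color across the edge and stores the edge index and blend weight
// (alpha) back into the work item for the gradient kernel.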
__global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p)
{
__shared__ int s_base;
int workCount = p.workBuffer[0].x;
for(;;)
{
// Persistent threads work fetcher.
__syncthreads();
if (threadIdx.x == 0)
s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK);
__syncthreads();
int thread_idx = s_base + threadIdx.x;
if (thread_idx >= workCount)
return;
int4* pItem = p.workBuffer + thread_idx + 1;
int4 item = *pItem;
int px = item.x;
int py = item.y;
int pz = (int)(((unsigned int)item.z) >> 16);
int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
int pixel0 = px + p.width * (py + p.height * pz);
int pixel1 = pixel0 + (d ? p.width : 1);
float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1];
float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1];
int tri0 = (int)zt0.y - 1;
int tri1 = (int)zt1.y - 1;
// Select triangle based on background / depth.
int tri = (tri0 >= 0) ? tri0 : tri1;
if (tri0 >= 0 && tri1 >= 0)
tri = (zt0.x < zt1.x) ? tri0 : tri1;
if (tri == tri1)
{
// Calculate with respect to the neighbor pixel if we chose that triangle.
px += 1 - d;
py += d;
}
// Bail out if triangle index is corrupt.
if (tri < 0 || tri >= p.numTriangles)
continue;
// Fetch vertex indices.
int vi0 = p.tri[tri * 3 + 0];
int vi1 = p.tri[tri * 3 + 1];
int vi2 = p.tri[tri * 3 + 2];
// Bail out if vertex indices are corrupt.
if (vi0 < 0 || vi0 >= p.numVertices ||
vi1 < 0 || vi1 >= p.numVertices ||
vi2 < 0 || vi2 >= p.numVertices)
continue;
// Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists.
int op0 = evhash_find_vertex(p, vi2, vi1, vi0);
int op1 = evhash_find_vertex(p, vi0, vi2, vi1);
int op2 = evhash_find_vertex(p, vi1, vi0, vi2);
// Instance mode: Adjust vertex indices based on minibatch index.
if (p.instance_mode)
{
int vbase = pz * p.numVertices;
vi0 += vbase;
vi1 += vbase;
vi2 += vbase;
if (op0 >= 0) op0 += vbase;
if (op1 >= 0) op1 += vbase;
if (op2 >= 0) op2 += vbase;
}
// Fetch vertex positions.
float4 p0 = ((float4*)p.pos)[vi0];
float4 p1 = ((float4*)p.pos)[vi1];
float4 p2 = ((float4*)p.pos)[vi2];
float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0];
float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1];
float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2];
// Project vertices to pixel space.
float w0 = 1.f / p0.w;
float w1 = 1.f / p1.w;
float w2 = 1.f / p2.w;
float ow0 = 1.f / o0.w;
float ow1 = 1.f / o1.w;
float ow2 = 1.f / o2.w;
float fx = (float)px + .5f - p.xh;
float fy = (float)py + .5f - p.yh;
float x0 = p0.x * w0 * p.xh - fx;
float y0 = p0.y * w0 * p.yh - fy;
float x1 = p1.x * w1 * p.xh - fx;
float y1 = p1.y * w1 * p.yh - fy;
float x2 = p2.x * w2 * p.xh - fx;
float y2 = p2.y * w2 * p.yh - fy;
float ox0 = o0.x * ow0 * p.xh - fx;
float oy0 = o0.y * ow0 * p.yh - fy;
float ox1 = o1.x * ow1 * p.xh - fx;
float oy1 = o1.y * ow1 * p.yh - fy;
float ox2 = o2.x * ow2 * p.xh - fx;
float oy2 = o2.y * ow2 * p.yh - fy;
// Signs to kill non-silhouette edges.
float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself.
float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings.
float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1);
float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2);
// If no matching signs anywhere, skip the rest.
if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb))
{
// XY flip for horizontal edges.
if (d)
{
swap(x0, y0);
swap(x1, y1);
swap(x2, y2);
}
float dx0 = x2 - x1;
float dx1 = x0 - x2;
float dx2 = x1 - x0;
float dy0 = y2 - y1;
float dy1 = y0 - y2;
float dy2 = y1 - y0;
// Check if an edge crosses between us and the neighbor pixel.
float dc = -F32_MAX;
float ds = (tri == tri0) ? 1.f : -1.f;
float d0 = ds * (x1*dy0 - y1*dx0);
float d1 = ds * (x2*dy1 - y2*dx1);
float d2 = ds * (x0*dy2 - y0*dx2);
if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f;
if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f;
if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f;
int di = max_idx3(d0, d1, d2, dy0, dy1, dy2);
if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0;
if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1;
if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2;
float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy.
// Adjust output image if a suitable edge was found.
if (dc > -eps && dc < 1.f + eps)
{
dc = fminf(fmaxf(dc, 0.f), 1.f);
float alpha = ds * (.5f - dc);
const float* pColor0 = p.color + pixel0 * p.channels;
const float* pColor1 = p.color + pixel1 * p.channels;
float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
for (int i=0; i < p.channels; i++)
atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i]));
// Rewrite the work item's flags and alpha. Keep original px, py.
unsigned int flags = pz << 16;
flags |= di;
flags |= d << AAWorkItem::FLAG_DOWN_BIT;
flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT;
((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha));
}
}
}
}
//------------------------------------------------------------------------
// Gradient kernel.
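// Persistent-threads loop over the work items written by the forward analysis kernel.
// For each item with nonzero alpha, the kernel accumulates color gradients (-alpha*dy
// into pixel0, +alpha*dy into pixel1), forms the position-gradient weight
// dd = sum_i dy_i * (color1_i - color0_i), and backpropagates through the edge-crossing
// point into the clip-space positions of the edge's two vertices via coalesced atomics.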
__global__ void AntialiasGradKernel(const AntialiasKernelParams p)
{
// Temporary space for coalesced atomics.
CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK);
__shared__ int s_base; // Work counter communication across entire CTA.
int workCount = p.workBuffer[0].x;
for(;;)
{
// Persistent threads work fetcher.
__syncthreads();
if (threadIdx.x == 0)
s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK);
__syncthreads();
int thread_idx = s_base + threadIdx.x;
if (thread_idx >= workCount)
return;
// Read work item filled out by forward kernel.
int4 item = p.workBuffer[thread_idx + 1];
unsigned int amask = __ballot_sync(0xffffffffu, item.w);
if (item.w == 0)
continue; // No effect.
// Unpack work item and replicate setup from forward analysis kernel.
int px = item.x;
int py = item.y;
int pz = (int)(((unsigned int)item.z) >> 16);
int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
float alpha = __int_as_float(item.w);
int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1;
int di = item.z & AAWorkItem::EDGE_MASK;
float ds = __int_as_float(__float_as_int(1.0f) | (tri1 << 31));
int pixel0 = px + p.width * (py + p.height * pz);
int pixel1 = pixel0 + (d ? p.width : 1);
int tri = (int)p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3] - 1;
if (tri1)
{
px += 1 - d;
py += d;
}
// Bail out if triangle index is corrupt.
bool triFail = (tri < 0 || tri >= p.numTriangles);
amask = __ballot_sync(amask, !triFail);
if (triFail)
continue;
// Outgoing color gradients.
float* pGrad0 = p.gradColor + pixel0 * p.channels;
float* pGrad1 = p.gradColor + pixel1 * p.channels;
// Incoming color gradients.
const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
// Position gradient weight based on colors and incoming gradients.
float dd = 0.f;
const float* pColor0 = p.color + pixel0 * p.channels;
const float* pColor1 = p.color + pixel1 * p.channels;
// Loop over channels and accumulate.
for (int i=0; i < p.channels; i++)
{
float dy = pDy[i];
if (dy != 0.f)
{
// Update position gradient weight.
dd += dy * (pColor1[i] - pColor0[i]);
// Update color gradients. No coalescing because all have different targets.
float v = alpha * dy;
atomicAdd(&pGrad0[i], -v);
atomicAdd(&pGrad1[i], v);
}
}
// If position weight is zero, skip the rest.
bool noGrad = (dd == 0.f);
amask = __ballot_sync(amask, !noGrad);
if (noGrad)
continue;
// Fetch vertex indices of the active edge and their positions.
int i1 = (di < 2) ? (di + 1) : 0;
int i2 = (i1 < 2) ? (i1 + 1) : 0;
int vi1 = p.tri[3 * tri + i1];
int vi2 = p.tri[3 * tri + i2];
// Bail out if vertex indices are corrupt.
bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices);
amask = __ballot_sync(amask, !vtxFail);
if (vtxFail)
continue;
// Instance mode: Adjust vertex indices based on minibatch index.
if (p.instance_mode)
{
vi1 += pz * p.numVertices;
vi2 += pz * p.numVertices;
}
// Fetch vertex positions.
float4 p1 = ((float4*)p.pos)[vi1];
float4 p2 = ((float4*)p.pos)[vi2];
// Project vertices to pixel space.
float pxh = p.xh;
float pyh = p.yh;
float fx = (float)px + .5f - pxh;
float fy = (float)py + .5f - pyh;
// XY flip for horizontal edges.
if (d)
{
swap(p1.x, p1.y);
swap(p2.x, p2.y);
swap(pxh, pyh);
swap(fx, fy);
}
// Gradient calculation setup.
float w1 = 1.f / p1.w;
float w2 = 1.f / p2.w;
float x1 = p1.x * w1 * pxh - fx;
float y1 = p1.y * w1 * pyh - fy;
float x2 = p2.x * w2 * pxh - fx;
float y2 = p2.y * w2 * pyh - fy;
float dx = x2 - x1;
float dy = y2 - y1;
float db = x1*dy - y1*dx;
// Compute inverse delta-y with epsilon.
float ep = copysignf(1e-3f, dy); // ~1/1000 pixel.
float iy = 1.f / (dy + ep);
// Compute position gradients.
float dby = db * iy;
float iw1 = -w1 * iy * dd;
float iw2 = w2 * iy * dd;
float gp1x = iw1 * pxh * y2;
float gp2x = iw2 * pxh * y1;
float gp1y = iw1 * pyh * (dby - x2);
float gp2y = iw2 * pyh * (dby - x1);
float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1;
float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2;
// XY flip the gradients.
if (d)
{
swap(gp1x, gp1y);
swap(gp2x, gp2y);
}
// Kill position gradients if alpha was saturated.
if (fabsf(alpha) >= 0.5f)
{
gp1x = gp1y = gp1w = 0.f;
gp2x = gp2y = gp2w = 0.f;
}
// Initialize coalesced atomics. Match both triangle ID and edge index.
// Also note that some threads may be inactive.
CA_SET_GROUP_MASK(tri ^ (di << 30), amask);
// Accumulate gradients.
caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w);
caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w);
}
}
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "common.h"
//------------------------------------------------------------------------
// Constants and helpers.
#define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH 32
#define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT 8
#define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK 256
#define AA_MESH_KERNEL_THREADS_PER_BLOCK 256
#define AA_HASH_ELEMENTS_PER_TRIANGLE 8 // Minimum is 4 but 8 gives fewer collisions. Must be power of two.
#define AA_GRAD_KERNEL_THREADS_PER_BLOCK 256
//------------------------------------------------------------------------
// CUDA kernel params.
struct AntialiasKernelParams
{
const float* color; // Incoming color buffer.
const float* rasterOut; // Incoming rasterizer output buffer.
const int* tri; // Incoming triangle buffer.
const float* pos; // Incoming position buffer.
float* output; // Output buffer of forward kernel.
const float* dy; // Incoming gradients.
float* gradColor; // Output buffer, color gradient.
float* gradPos; // Output buffer, position gradient.
int4* workBuffer; // Buffer for storing intermediate work items. First item reserved for counters.
uint4* evHash; // Edge-vertex hash.
int allocTriangles; // Number of triangles accommodated by evHash. Always power of two.
int numTriangles; // Number of triangles.
int numVertices; // Number of vertices.
int width; // Input width.
int height; // Input height.
int n; // Minibatch size.
int channels; // Channel count in color input.
float xh, yh; // Transfer to pixel space.
int instance_mode; // 0=normal, 1=instance mode.
int tri_const; // 1 if triangle array is known to be constant.
};
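// Note: the host-side code that launches these kernels is not part of this file. A
// plausible sequence, inferred from the kernels themselves, is: AntialiasFwdMeshKernel
// (once per topology) to populate evHash; per forward pass, clear the counters in
// workBuffer[0], initialize output from color, then run AntialiasFwdDiscontinuityKernel
// followed by AntialiasFwdAnalysisKernel; the backward pass resets workBuffer[0].y and
// runs AntialiasGradKernel over the same work items.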
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <cuda_runtime.h>
//------------------------------------------------------------------------
// Block and grid size calculators for kernel launches.
dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height)
{
int maxThreads = maxWidth * maxHeight;
if (maxThreads <= 1 || (width * height) <= 1)
return dim3(1, 1, 1); // Degenerate.
// Start from max size.
int bw = maxWidth;
int bh = maxHeight;
// Optimizations for weirdly sized buffers.
if (width < bw)
{
// Decrease block width to smallest power of two that covers the buffer width.
while ((bw >> 1) >= width)
bw >>= 1;
// Maximize height.
bh = maxThreads / bw;
if (bh > height)
bh = height;
}
else if (height < bh)
{
// Halve height and double width until the block fits completely inside the buffer vertically.
while (bh > height)
{
bh >>= 1;
if (bw < width)
bw <<= 1;
}
}
// Done.
return dim3(bw, bh, 1);
}
dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth)
{
dim3 gridSize;
gridSize.x = (width - 1) / blockSize.x + 1;
gridSize.y = (height - 1) / blockSize.y + 1;
gridSize.z = (depth - 1) / blockSize.z + 1;
return gridSize;
}
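// Worked example with hypothetical sizes: for a 300x200 image with minibatch size 4 and
// an 8x8 maximum block, getLaunchBlockSize(8, 8, 300, 200) keeps the full 8x8 block and
// getLaunchGridSize(dim3(8, 8, 1), 300, 200, 4) returns dim3(38, 25, 4).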
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
// Framework-specific macros to enable code sharing.
//------------------------------------------------------------------------
// Tensorflow.
#ifdef NVDR_TENSORFLOW
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/platform/default/logging.h"
using namespace tensorflow;
using namespace tensorflow::shape_inference;
#define NVDR_CTX_ARGS OpKernelContext* _nvdr_ctx
#define NVDR_CTX_PARAMS _nvdr_ctx
#define NVDR_CHECK(COND, ERR) OP_REQUIRES(_nvdr_ctx, COND, errors::Internal(ERR))
#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) OP_CHECK_CUDA_ERROR(_nvdr_ctx, CUDA_CALL)
#define NVDR_CHECK_GL_ERROR(GL_CALL) OP_CHECK_GL_ERROR(_nvdr_ctx, GL_CALL)
#endif
//------------------------------------------------------------------------
// PyTorch.
#ifdef NVDR_TORCH
#ifndef __CUDACC__
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>
#include <pybind11/numpy.h>
#endif
#define NVDR_CTX_ARGS int _nvdr_ctx_dummy
#define NVDR_CTX_PARAMS 0
#define NVDR_CHECK(COND, ERR) do { TORCH_CHECK(COND, ERR) } while(0)
#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; AT_CUDA_CHECK(cudaGetLastError()); } while(0)
#define NVDR_CHECK_GL_ERROR(GL_CALL) do { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); } while(0)
#endif
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "framework.h"
#include <iostream>
#include <iomanip>
//------------------------------------------------------------------------
// Windows.
//------------------------------------------------------------------------
#ifdef _WIN32
#define NOMINMAX
#include <windows.h>
#define GLEW_STATIC
#include "../lib/glew.h"
#include <GL/gl.h>
#include <cuda_gl_interop.h>
//------------------------------------------------------------------------
struct GLContext
{
HDC hdc;
HGLRC hglrc;
int glewInitialized;
};
//------------------------------------------------------------------------
static void setGLContext(GLContext& glctx)
{
if (!glctx.hglrc)
LOG(ERROR) << "setGLContext() called with null gltcx";
if (!wglMakeCurrent(glctx.hdc, glctx.hglrc))
LOG(ERROR) << "wglMakeCurrent() failed when setting GL context";
if (glctx.glewInitialized)
return;
GLenum result = glewInit();
if (result != GLEW_OK)
LOG(ERROR) << "glewInit() failed, return value = " << result;
glctx.glewInitialized = 1;
}
static void releaseGLContext(void)
{
if (!wglMakeCurrent(NULL, NULL))
LOG(ERROR) << "wglMakeCurrent() failed when releasing GL context";
}
static GLContext createGLContext(void)
{
HINSTANCE hInstance = GetModuleHandle(NULL);
WNDCLASS wc = {};
wc.style = CS_OWNDC;
wc.lpfnWndProc = DefWindowProc;
wc.hInstance = hInstance;
wc.lpszClassName = "__DummyGLClassCPP";
int res = RegisterClass(&wc);
HWND hwnd = CreateWindow(
"__DummyGLClassCPP", // lpClassName
"__DummyGLWindowCPP", // lpWindowName
WS_OVERLAPPEDWINDOW, // dwStyle
CW_USEDEFAULT, // x
CW_USEDEFAULT, // y
0, 0, // nWidth, nHeight
NULL, NULL, // hWndParent, hMenu
hInstance, // hInstance
NULL // lpParam
);
PIXELFORMATDESCRIPTOR pfd = {};
pfd.dwFlags = PFD_SUPPORT_OPENGL;
pfd.iPixelType = PFD_TYPE_RGBA;
pfd.iLayerType = PFD_MAIN_PLANE;
pfd.cColorBits = 32;
pfd.cDepthBits = 24;
pfd.cStencilBits = 8;
HDC hdc = GetDC(hwnd);
int pixelformat = ChoosePixelFormat(hdc, &pfd);
SetPixelFormat(hdc, pixelformat, &pfd);
HGLRC hglrc = wglCreateContext(hdc);
LOG(INFO) << std::hex << std::setfill('0')
<< "WGL OpenGL context created (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hdc
<< ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hglrc << ")";
GLContext glctx = {hdc, hglrc, 0};
return glctx;
}
static void destroyGLContext(GLContext& glctx)
{
if (!glctx.hglrc)
LOG(ERROR) << "destroyGLContext() called with null gltcx";
// If this is the current context, release it.
if (wglGetCurrentContext() == glctx.hglrc)
releaseGLContext();
HWND hwnd = WindowFromDC(glctx.hdc);
if (!hwnd)
LOG(ERROR) << "WindowFromDC() failed";
if (!ReleaseDC(hwnd, glctx.hdc))
LOG(ERROR) << "ReleaseDC() failed";
if (!wglDeleteContext(glctx.hglrc))
LOG(ERROR) << "wglDeleteContext() failed";
if (!DestroyWindow(hwnd))
LOG(ERROR) << "DestroyWindow() failed";
LOG(INFO) << std::hex << std::setfill('0')
<< "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc
<< ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hglrc << ")";
memset(&glctx, 0, sizeof(GLContext));
}
#endif // _WIN32
//------------------------------------------------------------------------
// Linux.
//------------------------------------------------------------------------
#ifdef __linux__
#define GLEW_NO_GLU
#define EGL_NO_X11 // X11/Xlib.h has "#define Status int" which breaks Tensorflow. Avoid it.
#define MESA_EGL_NO_X11_HEADERS
#if 1
# include "../lib/glew.h" // Use local glew.h
#else
# include <GL/glew.h> // Use system-supplied glew.h
#endif
#include <EGL/egl.h>
#include <GL/gl.h>
#include <cuda_gl_interop.h>
//------------------------------------------------------------------------
struct GLContext
{
EGLDisplay display;
EGLSurface surface;
EGLContext context;
int glewInitialized;
};
//------------------------------------------------------------------------
static void setGLContext(GLContext& glctx)
{
if (!glctx.context)
LOG(ERROR) << "setGLContext() called with null gltcx";
if (!eglMakeCurrent(glctx.display, glctx.surface, glctx.surface, glctx.context))
LOG(ERROR) << "eglMakeCurrent() failed when setting GL context";
if (glctx.glewInitialized)
return;
GLenum result = glewInit();
if (result != GLEW_OK)
LOG(ERROR) << "glewInit() failed, return value = " << result;
glctx.glewInitialized = 1;
}
static void releaseGLContext(void)
{
EGLDisplay display = eglGetCurrentDisplay();
if (display == EGL_NO_DISPLAY)
LOG(WARNING) << "releaseGLContext() called with no active display";
if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT))
LOG(ERROR) << "eglMakeCurrent() failed when releasing GL context";
}
static GLContext createGLContext(void)
{
// Initialize.
EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
if (display == EGL_NO_DISPLAY)
LOG(ERROR) << "eglGetDisplay() failed";
EGLint major;
EGLint minor;
if (!eglInitialize(display, &major, &minor))
LOG(ERROR) << "eglInitialize() failed";
// Choose configuration.
const EGLint context_attribs[] = {
EGL_RED_SIZE, 8,
EGL_GREEN_SIZE, 8,
EGL_BLUE_SIZE, 8,
EGL_ALPHA_SIZE, 8,
EGL_DEPTH_SIZE, 24,
EGL_STENCIL_SIZE, 8,
EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT,
EGL_SURFACE_TYPE, EGL_PBUFFER_BIT,
EGL_NONE
};
EGLConfig config;
EGLint num_config;
if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config))
LOG(ERROR) << "eglChooseConfig() failed";
// Create dummy pbuffer surface.
const EGLint surface_attribs[] = {
EGL_WIDTH, 1,
EGL_HEIGHT, 1,
EGL_NONE
};
EGLSurface surface = eglCreatePbufferSurface(display, config, surface_attribs);
if (surface == EGL_NO_SURFACE)
LOG(ERROR) << "eglCreatePbufferSurface() failed";
// Create GL context.
if (!eglBindAPI(EGL_OPENGL_API))
LOG(ERROR) << "eglBindAPI() failed";
EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL);
if (context == EGL_NO_CONTEXT)
LOG(ERROR) << "eglCreateContext() failed";
// Done.
LOG(INFO) << "EGL " << (int)minor << "." << (int)major << " OpenGL context created (disp: 0x"
<< std::hex << std::setfill('0')
<< std::setw(16) << (uintptr_t)display
<< ", surf: 0x" << std::setw(16) << (uintptr_t)surface
<< ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")";
GLContext glctx = {display, surface, context, 0};
return glctx;
}
static void destroyGLContext(GLContext& glctx)
{
if (!glctx.context)
LOG(ERROR) << "destroyGLContext() called with null gltcx";
// If this is the current context, release it.
if (eglGetCurrentContext() == glctx.context)
releaseGLContext();
if (!eglDestroyContext(glctx.display, glctx.context))
LOG(ERROR) << "eglDestroyContext() failed";
if (!eglDestroySurface(glctx.display, glctx.surface))
LOG(ERROR) << "eglDestroySurface() failed";
LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x"
<< std::hex << std::setfill('0')
<< std::setw(16) << (uintptr_t)glctx.display
<< ", surf: 0x" << std::setw(16) << (uintptr_t)glctx.surface
<< ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")";
memset(&glctx, 0, sizeof(GLContext));
}
#endif // __linux__
//------------------------------------------------------------------------
// Common.
//------------------------------------------------------------------------
static const char* getGLErrorString(GLenum err)
{
switch(err)
{
case GL_NO_ERROR: return "GL_NO_ERROR";
case GL_INVALID_ENUM: return "GL_INVALID_ENUM";
case GL_INVALID_VALUE: return "GL_INVALID_VALUE";
case GL_INVALID_OPERATION: return "GL_INVALID_OPERATION";
case GL_STACK_OVERFLOW: return "GL_STACK_OVERFLOW";
case GL_STACK_UNDERFLOW: return "GL_STACK_UNDERFLOW";
case GL_OUT_OF_MEMORY: return "GL_OUT_OF_MEMORY";
case GL_INVALID_FRAMEBUFFER_OPERATION: return "GL_INVALID_FRAMEBUFFER_OPERATION";
case GL_TABLE_TOO_LARGE: return "GL_TABLE_TOO_LARGE";
case GL_CONTEXT_LOST: return "GL_CONTEXT_LOST";
}
return "Unknown error";
}
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "common.h"
#include "interpolate.h"
//------------------------------------------------------------------------
// Forward kernel.
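// One thread per pixel: interpolates vertex attributes with the rasterized barycentrics,
// out_i = b0*a0_i + b1*a1_i + b2*a2_i with b2 = 1 - b0 - b1, and (when ENABLE_DA) applies
// the chain rule to the barycentric pixel differentials,
// ds/dx = du/dx*(s0 - s2) + dv/dx*(s1 - s2), and likewise for d/dy.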
template <bool ENABLE_DA>
static __forceinline__ __device__ void InterpolateFwdKernelTemplate(const InterpolateKernelParams p)
{
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.width || py >= p.height || pz >= p.depth)
return;
// Pixel index.
int pidx = px + p.width * (py + p.height * pz);
// Output ptrs.
float* out = p.out + pidx * p.numAttr;
float2* outDA = ENABLE_DA ? (((float2*)p.outDA) + pidx * p.numDiffAttr) : 0;
// Fetch rasterizer output.
float4 r = ((float4*)p.rast)[pidx];
int triIdx = (int)r.w - 1;
bool triValid = (triIdx >= 0 && triIdx < p.numTriangles);
// If no geometry in entire warp, zero the output and exit.
// Otherwise force barys to zero and output with live threads.
if (__all_sync(0xffffffffu, !triValid))
{
for (int i=0; i < p.numAttr; i++)
out[i] = 0.f;
if (ENABLE_DA)
for (int i=0; i < p.numDiffAttr; i++)
outDA[i] = make_float2(0.f, 0.f);
return;
}
// Fetch vertex indices.
int vi0 = triValid ? p.tri[triIdx * 3 + 0] : 0;
int vi1 = triValid ? p.tri[triIdx * 3 + 1] : 0;
int vi2 = triValid ? p.tri[triIdx * 3 + 2] : 0;
// Bail out if corrupt indices.
if (vi0 < 0 || vi0 >= p.numVertices ||
vi1 < 0 || vi1 >= p.numVertices ||
vi2 < 0 || vi2 >= p.numVertices)
return;
// In instance mode, adjust vertex indices by minibatch index unless broadcasting.
if (p.instance_mode && !p.attrBC)
{
vi0 += pz * p.numVertices;
vi1 += pz * p.numVertices;
vi2 += pz * p.numVertices;
}
// Pointers to attributes.
const float* a0 = p.attr + vi0 * p.numAttr;
const float* a1 = p.attr + vi1 * p.numAttr;
const float* a2 = p.attr + vi2 * p.numAttr;
// Barys. If no triangle, force all to zero -> output is zero.
float b0 = triValid ? r.x : 0.f;
float b1 = triValid ? r.y : 0.f;
float b2 = triValid ? (1.f - r.x - r.y) : 0.f;
// Interpolate and write attributes.
for (int i=0; i < p.numAttr; i++)
out[i] = b0*a0[i] + b1*a1[i] + b2*a2[i];
// No diff attrs? Exit.
if (!ENABLE_DA)
return;
// Read bary pixel differentials if we have a triangle.
float4 db = make_float4(0.f, 0.f, 0.f, 0.f);
if (triValid)
db = ((float4*)p.rastDB)[pidx];
// Unpack a bit.
float dudx = db.x;
float dudy = db.y;
float dvdx = db.z;
float dvdy = db.w;
// Calculate the pixel differentials of chosen attributes.
for (int i=0; i < p.numDiffAttr; i++)
{
// Input attribute index.
int j = p.diff_attrs_all ? i : p.diffAttrs[i];
if (j < 0)
j += p.numAttr; // Python-style negative indices.
// Zero output if invalid index.
float dsdx = 0.f;
float dsdy = 0.f;
if (j >= 0 && j < p.numAttr)
{
float s0 = a0[j];
float s1 = a1[j];
float s2 = a2[j];
float dsdu = s0 - s2;
float dsdv = s1 - s2;
dsdx = dudx*dsdu + dvdx*dsdv;
dsdy = dudy*dsdu + dvdy*dsdv;
}
// Write.
outDA[i] = make_float2(dsdx, dsdy);
}
}
// Template specializations.
__global__ void InterpolateFwdKernel (const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<false>(p); }
__global__ void InterpolateFwdKernelDa(const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<true>(p); }
//------------------------------------------------------------------------
// Gradient kernel.
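// Backward pass of the interpolation above: d(out_i)/d(a_k,i) = b_k gives the attribute
// gradients, d(out_i)/d(b0) = s0 - s2 and d(out_i)/d(b1) = s1 - s2 give the barycentric
// gradients written to gradRaster, and the ENABLE_DA branch backpropagates the attribute
// pixel differentials into both gradRasterDB and gradAttr.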
template <bool ENABLE_DA>
static __forceinline__ __device__ void InterpolateGradKernelTemplate(const InterpolateKernelParams p)
{
// Temporary space for coalesced atomics.
CA_DECLARE_TEMP(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH * IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.width || py >= p.height || pz >= p.depth)
return;
// Pixel index.
int pidx = px + p.width * (py + p.height * pz);
// Fetch triangle ID. If none, output zero bary/db gradients and exit.
float4 r = ((float4*)p.rast)[pidx];
int triIdx = (int)r.w - 1;
if (triIdx < 0 || triIdx >= p.numTriangles)
{
((float4*)p.gradRaster)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
if (ENABLE_DA)
((float4*)p.gradRasterDB)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
return;
}
// Fetch vertex indices.
int vi0 = p.tri[triIdx * 3 + 0];
int vi1 = p.tri[triIdx * 3 + 1];
int vi2 = p.tri[triIdx * 3 + 2];
// Bail out if corrupt indices.
if (vi0 < 0 || vi0 >= p.numVertices ||
vi1 < 0 || vi1 >= p.numVertices ||
vi2 < 0 || vi2 >= p.numVertices)
return;
// In instance mode, adjust vertex indices by minibatch index unless broadcasting.
if (p.instance_mode && !p.attrBC)
{
vi0 += pz * p.numVertices;
vi1 += pz * p.numVertices;
vi2 += pz * p.numVertices;
}
// Initialize coalesced atomics.
CA_SET_GROUP(triIdx);
// Pointers to inputs.
const float* a0 = p.attr + vi0 * p.numAttr;
const float* a1 = p.attr + vi1 * p.numAttr;
const float* a2 = p.attr + vi2 * p.numAttr;
const float* pdy = p.dy + pidx * p.numAttr;
// Pointers to outputs.
float* ga0 = p.gradAttr + vi0 * p.numAttr;
float* ga1 = p.gradAttr + vi1 * p.numAttr;
float* ga2 = p.gradAttr + vi2 * p.numAttr;
// Barys and bary gradient accumulators.
float b0 = r.x;
float b1 = r.y;
float b2 = 1.f - r.x - r.y;
float gb0 = 0.f;
float gb1 = 0.f;
// Loop over attributes and accumulate attribute gradients.
for (int i=0; i < p.numAttr; i++)
{
float y = pdy[i];
float s0 = a0[i];
float s1 = a1[i];
float s2 = a2[i];
gb0 += y * (s0 - s2);
gb1 += y * (s1 - s2);
caAtomicAdd(ga0 + i, b0 * y);
caAtomicAdd(ga1 + i, b1 * y);
caAtomicAdd(ga2 + i, b2 * y);
}
// Write the bary gradients.
((float4*)p.gradRaster)[pidx] = make_float4(gb0, gb1, 0.f, 0.f);
// If pixel differentials disabled, we're done.
if (!ENABLE_DA)
return;
// Calculate gradients based on attribute pixel differentials.
const float2* dda = ((float2*)p.dda) + pidx * p.numDiffAttr;
float gdudx = 0.f;
float gdudy = 0.f;
float gdvdx = 0.f;
float gdvdy = 0.f;
// Read bary pixel differentials.
float4 db = ((float4*)p.rastDB)[pidx];
float dudx = db.x;
float dudy = db.y;
float dvdx = db.z;
float dvdy = db.w;
for (int i=0; i < p.numDiffAttr; i++)
{
// Input attribute index.
int j = p.diff_attrs_all ? i : p.diffAttrs[i];
if (j < 0)
j += p.numAttr; // Python-style negative indices.
// Check that index is valid.
if (j >= 0 && j < p.numAttr)
{
float2 dsdxy = dda[i];
float dsdx = dsdxy.x;
float dsdy = dsdxy.y;
float s0 = a0[j];
float s1 = a1[j];
float s2 = a2[j];
// Gradients of db.
float dsdu = s0 - s2;
float dsdv = s1 - s2;
gdudx += dsdu * dsdx;
gdudy += dsdu * dsdy;
gdvdx += dsdv * dsdx;
gdvdy += dsdv * dsdy;
// Gradients of attributes.
float du = dsdx*dudx + dsdy*dudy;
float dv = dsdx*dvdx + dsdy*dvdy;
caAtomicAdd(ga0 + j, du);
caAtomicAdd(ga1 + j, dv);
caAtomicAdd(ga2 + j, -du - dv);
}
}
// Write.
((float4*)p.gradRasterDB)[pidx] = make_float4(gdudx, gdudy, gdvdx, gdvdy);
}
// Template specializations.
__global__ void InterpolateGradKernel (const InterpolateKernelParams p) { InterpolateGradKernelTemplate<false>(p); }
__global__ void InterpolateGradKernelDa(const InterpolateKernelParams p) { InterpolateGradKernelTemplate<true>(p); }
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
// Constants and helpers.
#define IP_FWD_MAX_KERNEL_BLOCK_WIDTH 8
#define IP_FWD_MAX_KERNEL_BLOCK_HEIGHT 8
#define IP_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
#define IP_MAX_DIFF_ATTRS 32
//------------------------------------------------------------------------
// CUDA kernel params.
struct InterpolateKernelParams
{
const int* tri; // Incoming triangle buffer.
const float* attr; // Incoming attribute buffer.
const float* rast; // Incoming rasterizer output buffer.
const float* rastDB; // Incoming rasterizer output buffer for bary derivatives.
const float* dy; // Incoming attribute gradients.
const float* dda; // Incoming attr diff gradients.
float* out; // Outgoing interpolated attributes.
float* outDA; // Outgoing attribute pixel differentials (float2 per differentiated attribute).
float* gradAttr; // Outgoing attribute gradients.
float* gradRaster; // Outgoing rasterizer gradients.
float* gradRasterDB; // Outgoing rasterizer bary diff gradients.
int numTriangles; // Number of triangles.
int numVertices; // Number of vertices.
int numAttr; // Number of total vertex attributes.
int numDiffAttr; // Number of attributes to differentiate.
int width; // Image width.
int height; // Image height.
int depth; // Minibatch size.
int attrBC; // 0=normal, 1=attr is broadcast.
int instance_mode; // 0=normal, 1=instance mode.
int diff_attrs_all; // 0=normal, 1=produce pixel differentials for all attributes.
int diffAttrs[IP_MAX_DIFF_ATTRS]; // List of attributes to differentiate.
};
//------------------------------------------------------------------------