Commit a1ec436b authored by Samuli Laine's avatar Samuli Laine

Add CUDA rasterizer

parent 78528e68
@@ -11,7 +11,7 @@ Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.githu
## Licenses
Copyright © 2020, NVIDIA Corporation. All rights reserved.
Copyright © 2020–2022, NVIDIA Corporation. All rights reserved.
This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
@@ -6,4 +6,4 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
__version__ = '0.2.8'
__version__ = '0.3.0'
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
// This is a slimmed-down and modernized version of the original
// CudaRaster codebase that accompanied the HPG 2011 paper
// "High-Performance Software Rasterization on GPUs" by Laine and Karras.
// Modifications have been made to accommodate post-Volta execution model
// with warp divergence. Support for shading, blending, quad rendering,
// and supersampling has been removed as unnecessary for nvdiffrast.
//------------------------------------------------------------------------
namespace CR
{
class RasterImpl;
//------------------------------------------------------------------------
// Interface class to isolate user from implementation details.
//------------------------------------------------------------------------
class CudaRaster
{
public:
enum
{
RenderModeFlag_EnableBackfaceCulling = 1 << 0, // Enable backface culling.
RenderModeFlag_EnableDepthPeeling = 1 << 1, // Enable depth peeling. Must have a peel buffer set.
};
public:
CudaRaster (void);
~CudaRaster (void);
void setViewportSize (int width, int height, int numImages); // Width and height must be multiples of tile size (8x8).
void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
bool drawTriangles (const int* ranges, cudaStream_t stream); // Ranges (offsets and counts) are given in triangles, not bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
void swapDepthAndPeel (void); // Swap depth and peeling buffers.
private:
CudaRaster (const CudaRaster&); // forbidden
CudaRaster& operator= (const CudaRaster&); // forbidden
private:
RasterImpl* m_impl; // Opaque pointer to implementation.
};
//------------------------------------------------------------------------
} // namespace CR
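As a point of reference for the interface above, the following is a minimal host-side sketch of driving CudaRaster; the buffer sizes, contents, and the example function itself are hypothetical and only illustrate the call sequence documented in the comments.

//------------------------------------------------------------------------
// Illustrative usage sketch (hypothetical caller code, not part of the
// rasterizer sources). Assumes a single 256x256 image.
//------------------------------------------------------------------------
#include <cuda_runtime.h>
#include "CudaRaster.hpp"

void exampleDraw(cudaStream_t stream)
{
    // Caller-managed GPU buffers: clip-space positions as float4, and
    // triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
    int   numVertices  = 3;
    int   numTriangles = 1;
    void* d_vertices   = 0;
    void* d_indices    = 0;
    cudaMalloc(&d_vertices, numVertices  * 4 * sizeof(float));
    cudaMalloc(&d_indices,  numTriangles * 4 * sizeof(unsigned int));
    // ... fill d_vertices and d_indices on the device or via cudaMemcpy ...

    CR::CudaRaster cr;
    cr.setViewportSize(256, 256, 1);    // must be multiples of the 8x8 tile size
    cr.setRenderModeFlags(CR::CudaRaster::RenderModeFlag_EnableBackfaceCulling);
    cr.setVertexBuffer(d_vertices, numVertices);
    cr.setIndexBuffer(d_indices, numTriangles);
    cr.deferredClear(0u);                       // clear during the next drawTriangles()
    bool ok = cr.drawTriangles(NULL, stream);   // NULL = draw all triangles
    void* d_color = cr.getColorBuffer();        // GPU pointer owned by CudaRaster
    (void)ok; (void)d_color;
}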
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
__device__ __inline__ void binRasterImpl(const CRParams p)
{
__shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
__shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR];
__shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR];
__shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
__shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
__shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
__shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer
__shared__ volatile U32 s_batchPos;
__shared__ volatile U32 s_bufCount;
__shared__ volatile U32 s_overTotal;
__shared__ volatile U32 s_allocBase;
const CRImageParams& ip = getImageParams(p, blockIdx.z);
CRAtomics& atomics = p.atomics[blockIdx.z];
const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
if (atomics.numSubtris > p.maxSubtris)
return;
// per-thread state
int thrInBlock = threadIdx.x + threadIdx.y * 32;
int batchPos = 0;
// first 16 elements of s_broadcast are always zero
if (thrInBlock < 16)
s_broadcast[thrInBlock] = 0;
// initialize output linked lists and offsets
if (thrInBlock < p.numBins)
{
binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
s_outTotal[thrInBlock] = 0;
}
// repeat until done
for(;;)
{
// get batch
if (thrInBlock == 0)
s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
__syncthreads();
batchPos = s_batchPos;
// all batches done?
if (batchPos >= ip.triCount)
break;
// per-thread state
int bufIndex = 0;
int bufCount = 0;
int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
// loop over batch as long as we have triangles in it
do
{
// read more triangles
while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
{
// get subtriangle count
int triIdx = batchPos + thrInBlock;
int num = 0;
if (triIdx < batchEnd)
num = triSubtris[triIdx];
// cumulative sum of subtriangles within each warp
U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
if (__any_sync(~0u, num > 1))
{
myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
}
if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
s_broadcast[threadIdx.y + 16] = myIdx + num;
__syncthreads();
// cumulative sum of per-warp subtriangle counts
// Note: cannot have more than 32 warps or this needs to sync between each step.
bool act = (thrInBlock < CR_BIN_WARPS);
U32 actMask = __ballot_sync(~0u, act);
if (threadIdx.y == 0 && act)
{
volatile U32* ptr = &s_broadcast[thrInBlock + 16];
U32 val = *ptr;
#if (CR_BIN_WARPS > 1)
val += ptr[-1]; __syncwarp(actMask);
*ptr = val; __syncwarp(actMask);
#endif
#if (CR_BIN_WARPS > 2)
val += ptr[-2]; __syncwarp(actMask);
*ptr = val; __syncwarp(actMask);
#endif
#if (CR_BIN_WARPS > 4)
val += ptr[-4]; __syncwarp(actMask);
*ptr = val; __syncwarp(actMask);
#endif
#if (CR_BIN_WARPS > 8)
val += ptr[-8]; __syncwarp(actMask);
*ptr = val; __syncwarp(actMask);
#endif
#if (CR_BIN_WARPS > 16)
val += ptr[-16]; __syncwarp(actMask);
*ptr = val; __syncwarp(actMask);
#endif
// initially assume that we consume everything
// only last active thread does the writes
if (threadIdx.x == CR_BIN_WARPS - 1)
{
s_batchPos = batchPos + CR_BIN_WARPS * 32;
s_bufCount = bufCount + val;
}
}
__syncthreads();
// skip if no subtriangles
if (num)
{
// calculate write position for first subtriangle
U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
// only write if entire triangle fits
if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
{
pos += bufIndex; // adjust for current start position
pos &= CR_ARRAY_SIZE(s_triBuf)-1;
if (num == 1)
s_triBuf[pos] = triIdx * 8 + 7; // single triangle
else
{
for (int i=0; i < num; i++)
{
s_triBuf[pos] = triIdx * 8 + i;
pos++;
pos &= CR_ARRAY_SIZE(s_triBuf)-1;
}
}
} else if (pos <= CR_ARRAY_SIZE(s_triBuf))
{
// this triangle is the first that failed, overwrite total count and triangle count
s_batchPos = batchPos + thrInBlock;
s_bufCount = pos;
}
}
// update triangle counts
__syncthreads();
batchPos = s_batchPos;
bufCount = s_bufCount;
}
// make every warp clear its output buffers
for (int i=threadIdx.x; i < p.numBins; i += 32)
s_outMask[threadIdx.y][i] = 0;
__syncwarp();
// choose our triangle
uint4 triData = make_uint4(0, 0, 0, 0);
if (thrInBlock < bufCount)
{
U32 triPos = bufIndex + thrInBlock;
triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
// find triangle
int triIdx = s_triBuf[triPos];
int dataIdx = triIdx >> 3;
int subtriIdx = triIdx & 7;
if (subtriIdx != 7)
dataIdx = triHeader[dataIdx].misc + subtriIdx;
// read triangle
triData = *(((const uint4*)triHeader) + dataIdx);
}
// setup bounding box and edge functions, and rasterize
S32 lox, loy, hix, hiy;
bool hasTri = (thrInBlock < bufCount);
U32 hasTriMask = __ballot_sync(~0u, hasTri);
if (hasTri)
{
S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixels * (CR_SUBPIXEL_SIZE >> 1));
S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixels * (CR_SUBPIXEL_SIZE >> 1));
S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
U32 bit = 1 << threadIdx.x;
bool multi = (hix != lox || hiy != loy);
if (!__any_sync(hasTriMask, multi))
{
int binIdx = lox + p.widthBins * loy;
U32 mask = __match_any_sync(hasTriMask, binIdx);
s_outMask[threadIdx.y][binIdx] = mask;
__syncwarp(hasTriMask);
} else
{
bool complex = (hix > lox+1 || hiy > loy+1);
if (!__any_sync(hasTriMask, complex))
{
int binIdx = lox + p.widthBins * loy;
atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
if (hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
} else
{
S32 d12x = d02x - d01x, d12y = d02y - d01y;
v0x -= lox << binLog, v0y -= loy << binLog;
S32 t01 = v0x * d01y - v0y * d01x;
S32 t02 = v0y * d02x - v0x * d02y;
S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
int width = hix - lox + 1;
d01x += width * d01y;
d02x += width * d02y;
d12x += width * d12y;
U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
int stride = p.widthBins * 4;
int ptrYInc = stride - width * 4;
do
{
if (b01 >= 0 && b02 >= 0 && b12 >= 0)
atomicOr((U32*)currPtr, bit);
currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
if (currPtr == skipPtr)
currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
}
while (currPtr != endPtr);
}
}
}
// count per-bin contributions
if (thrInBlock == 0)
s_overTotal = 0; // overflow counter
// ensure that out masks are done
__syncthreads();
int overIndex = -1;
bool act = (thrInBlock < p.numBins);
U32 actMask = __ballot_sync(~0u, act);
if (act)
{
U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
int total = 0;
for (int i = 0; i < CR_BIN_WARPS; i++)
{
total += __popc(*(U32*)srcPtr);
*(U32*)dstPtr = total;
srcPtr += (CR_MAXBINS_SQR + 1) * 4;
dstPtr += (CR_MAXBINS_SQR + 1) * 4;
}
// overflow => request a new segment
int ofs = s_outOfs[thrInBlock];
bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
U32 ovrMask = __ballot_sync(actMask, ovr);
if (ovr)
{
overIndex = __popc(ovrMask & getLaneMaskLt());
if (overIndex == 0)
s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
__syncwarp(ovrMask);
overIndex += s_broadcast[threadIdx.y + 16];
s_overIndex[thrInBlock] = overIndex;
}
}
// sync after overTotal is ready
__syncthreads();
// at least one segment overflowed => allocate segments
U32 overTotal = s_overTotal;
U32 allocBase = 0;
if (overTotal > 0)
{
// allocate memory
if (thrInBlock == 0)
{
U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
}
__syncthreads();
allocBase = s_allocBase;
// did my bin overflow?
if (overIndex != -1)
{
// calculate new segment index
int segIdx = allocBase + overIndex;
// add to linked list
if (s_outOfs[thrInBlock] < 0)
binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
else
binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
// defaults
binSegNext [segIdx] = -1;
binSegCount[segIdx] = CR_BIN_SEG_SIZE;
}
}
// concurrent emission -- each warp handles its own triangle
if (thrInBlock < bufCount)
{
int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
int currBin = lox + loy * p.widthBins;
int skipBin = (hix + 1) + loy * p.widthBins;
int endBin = lox + (hiy + 1) * p.widthBins;
int binYInc = p.widthBins - (hix - lox + 1);
// loop over triangle's bins
do
{
U32 outMask = s_outMask[threadIdx.y][currBin];
if (outMask & (1<<threadIdx.x))
{
int idx = __popc(outMask & getLaneMaskLt());
if (threadIdx.y > 0)
idx += s_outCount[threadIdx.y-1][currBin];
int base = s_outOfs[currBin];
int free = (-base) & (CR_BIN_SEG_SIZE - 1);
if (idx >= free)
idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
else
idx += base;
binSegData[idx] = s_triBuf[triPos];
}
currBin++;
if (currBin == skipBin)
currBin += binYInc, skipBin += p.widthBins;
}
while (currBin != endBin);
}
// wait for all triangles to finish, then replace overflowed segment offsets
__syncthreads();
if (thrInBlock < p.numBins)
{
U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
U32 oldOfs = s_outOfs[thrInBlock];
if (overIndex == -1)
s_outOfs[thrInBlock] = oldOfs + total;
else
{
int addr = oldOfs + total;
addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
s_outOfs[thrInBlock] = addr;
}
s_outTotal[thrInBlock] += total;
}
// these triangles are now done
int count = ::min(bufCount, CR_BIN_WARPS * 32);
bufCount -= count;
bufIndex += count;
bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
}
while (bufCount > 0 || batchPos < batchEnd);
// flush all bins
if (thrInBlock < p.numBins)
{
int ofs = s_outOfs[thrInBlock];
if (ofs & (CR_BIN_SEG_SIZE-1))
{
int seg = ofs >> CR_BIN_SEG_LOG2;
binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
}
}
}
// output totals
if (thrInBlock < p.numBins)
binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
}
//------------------------------------------------------------------------
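The bin rasterizer above builds its per-warp subtriangle offsets with a bit-sliced prefix sum: each bit of the 0..7 subtriangle count is scanned separately with __ballot_sync() and __popc() over the mask of lower lanes. A self-contained sketch of the same idea, assuming a full warp and replacing the getLaneMaskLt() helper with an explicit lane-mask computation:

//------------------------------------------------------------------------
// Illustrative sketch of the bit-sliced warp-exclusive prefix sum used in
// binRasterImpl() above. Assumes all 32 lanes are active and that each
// per-thread count fits in 3 bits (0..7), as triSubtris does.
//------------------------------------------------------------------------
__device__ __inline__ unsigned int warpExclusiveScan3Bit(int num)
{
    unsigned int lane       = threadIdx.x & 31;
    unsigned int laneMaskLt = (1u << lane) - 1u;  // lanes below the current one
    // Scan each bit plane separately and weight by the bit's value.
    unsigned int idx = __popc(__ballot_sync(~0u, num & 1) & laneMaskLt);
    idx += __popc(__ballot_sync(~0u, num & 2) & laneMaskLt) * 2;
    idx += __popc(__ballot_sync(~0u, num & 4) & laneMaskLt) * 4;
    return idx;  // sum of num over all lower lanes
}

The kernel additionally skips the two higher bit planes when no thread in the warp has more than one subtriangle; the sketch evaluates them unconditionally for clarity.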
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "../../framework.h"
#include "Buffer.hpp"
using namespace CR;
//------------------------------------------------------------------------
Buffer::Buffer(void)
: m_gpuPtr(NULL),
m_bytes (0)
{
// empty
}
Buffer::~Buffer(void)
{
if (m_gpuPtr)
NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
}
//------------------------------------------------------------------------
void Buffer::reset(size_t bytes)
{
if (bytes == m_bytes)
return;
if (m_gpuPtr)
{
NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
m_gpuPtr = NULL;
}
if (bytes > 0)
NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
m_bytes = bytes;
}
//------------------------------------------------------------------------
void Buffer::grow(size_t bytes)
{
if (bytes > m_bytes)
reset(bytes);
}
//------------------------------------------------------------------------
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "Defs.hpp"
namespace CR
{
//------------------------------------------------------------------------
class Buffer
{
public:
Buffer (void);
~Buffer (void);
void reset (size_t bytes);
void grow (size_t bytes);
void* getPtr (void) { return m_gpuPtr; }
size_t getSize (void) const { return m_bytes; }
void setPtr (void* ptr) { m_gpuPtr = ptr; }
private:
void* m_gpuPtr;
size_t m_bytes;
};
//------------------------------------------------------------------------
}
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
#define CR_MAXVIEWPORT_LOG2 11 // ViewportSize / PixelSize.
#define CR_SUBPIXEL_LOG2 4 // PixelSize / SubpixelSize.
#define CR_MAXBINS_LOG2 4 // ViewportSize / BinSize.
#define CR_BIN_LOG2 4 // BinSize / TileSize.
#define CR_TILE_LOG2 3 // TileSize / PixelSize.
#define CR_COVER8X8_LUT_SIZE 768 // 64-bit entries.
#define CR_FLIPBIT_FLIP_Y 2
#define CR_FLIPBIT_FLIP_X 3
#define CR_FLIPBIT_SWAP_XY 4
#define CR_FLIPBIT_COMPL 5
#define CR_BIN_STREAMS_LOG2 4
#define CR_BIN_SEG_LOG2 9 // 32-bit entries.
#define CR_TILE_SEG_LOG2 5 // 32-bit entries.
#define CR_MAXSUBTRIS_LOG2 24 // Triangle structs. Dictated by CoarseRaster.
#define CR_COARSE_QUEUE_LOG2 10 // Triangles.
#define CR_SETUP_WARPS 2
#define CR_SETUP_OPT_BLOCKS 8
#define CR_BIN_WARPS 16
#define CR_COARSE_WARPS 16 // Must be a power of two.
#define CR_FINE_MAX_WARPS 20
#define CR_EMBED_IMAGE_PARAMS 32 // Number of per-image parameter structs embedded in kernel launch parameter block.
//------------------------------------------------------------------------
#define CR_MAXVIEWPORT_SIZE (1 << CR_MAXVIEWPORT_LOG2)
#define CR_SUBPIXEL_SIZE (1 << CR_SUBPIXEL_LOG2)
#define CR_SUBPIXEL_SQR (1 << (CR_SUBPIXEL_LOG2 * 2))
#define CR_MAXBINS_SIZE (1 << CR_MAXBINS_LOG2)
#define CR_MAXBINS_SQR (1 << (CR_MAXBINS_LOG2 * 2))
#define CR_BIN_SIZE (1 << CR_BIN_LOG2)
#define CR_BIN_SQR (1 << (CR_BIN_LOG2 * 2))
#define CR_MAXTILES_LOG2 (CR_MAXBINS_LOG2 + CR_BIN_LOG2)
#define CR_MAXTILES_SIZE (1 << CR_MAXTILES_LOG2)
#define CR_MAXTILES_SQR (1 << (CR_MAXTILES_LOG2 * 2))
#define CR_TILE_SIZE (1 << CR_TILE_LOG2)
#define CR_TILE_SQR (1 << (CR_TILE_LOG2 * 2))
#define CR_BIN_STREAMS_SIZE (1 << CR_BIN_STREAMS_LOG2)
#define CR_BIN_SEG_SIZE (1 << CR_BIN_SEG_LOG2)
#define CR_TILE_SEG_SIZE (1 << CR_TILE_SEG_LOG2)
#define CR_MAXSUBTRIS_SIZE (1 << CR_MAXSUBTRIS_LOG2)
#define CR_COARSE_QUEUE_SIZE (1 << CR_COARSE_QUEUE_LOG2)
//------------------------------------------------------------------------
// When evaluating interpolated Z at pixel centers, we may introduce an error
// of (+-CR_LERP_ERROR) ULPs.
#define CR_LERP_ERROR(SAMPLES_LOG2) (2200u << (SAMPLES_LOG2))
#define CR_DEPTH_MIN CR_LERP_ERROR(3)
#define CR_DEPTH_MAX (CR_U32_MAX - CR_LERP_ERROR(3))
//------------------------------------------------------------------------
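With the defaults above, a tile is 2^3 = 8 pixels, a bin is 2^4 = 16 tiles = 128 pixels, and the maximum viewport is 2^4 = 16 bins = 2048 pixels per axis, matching CR_MAXVIEWPORT_LOG2 = 11 (4 + 4 + 3). A few illustrative compile-time checks of these relations (not present in the header itself):

//------------------------------------------------------------------------
// Illustrative compile-time checks relating the derived constants under
// the default values above.
//------------------------------------------------------------------------
static_assert(CR_TILE_SIZE == 8,  "tile is 8x8 pixels");
static_assert(CR_BIN_SIZE  == 16, "bin is 16x16 tiles, i.e. 128x128 pixels");
static_assert(CR_MAXBINS_SIZE * CR_BIN_SIZE * CR_TILE_SIZE == CR_MAXVIEWPORT_SIZE,
              "16 bins x 16 tiles x 8 pixels = 2048-pixel max viewport per axis");
static_assert(CR_MAXTILES_LOG2 == CR_MAXBINS_LOG2 + CR_BIN_LOG2, "tiles per axis");
static_assert(CR_BIN_SEG_SIZE  == 512, "a bin segment holds 512 32-bit entries (2 KB)");
static_assert(CR_TILE_SEG_SIZE == 32,  "a tile segment holds 32 32-bit entries (128 B)");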
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "Defs.hpp"
#include "../CudaRaster.hpp"
#include "RasterImpl.hpp"
using namespace CR;
//------------------------------------------------------------------------
// Stub interface implementation.
//------------------------------------------------------------------------
CudaRaster::CudaRaster()
{
m_impl = new RasterImpl();
}
CudaRaster::~CudaRaster()
{
delete m_impl;
}
void CudaRaster::setViewportSize(int width, int height, int numImages)
{
m_impl->setViewportSize(Vec3i(width, height, numImages));
}
void CudaRaster::setRenderModeFlags(U32 flags)
{
m_impl->setRenderModeFlags(flags);
}
void CudaRaster::deferredClear(U32 clearColor)
{
m_impl->deferredClear(clearColor);
}
void CudaRaster::setVertexBuffer(void* vertices, int numVertices)
{
m_impl->setVertexBuffer(vertices, numVertices);
}
void CudaRaster::setIndexBuffer(void* indices, int numTriangles)
{
m_impl->setIndexBuffer(indices, numTriangles);
}
bool CudaRaster::drawTriangles(const int* ranges, cudaStream_t stream)
{
return m_impl->drawTriangles((const Vec2i*)ranges, stream);
}
void* CudaRaster::getColorBuffer(void)
{
return m_impl->getColorBuffer();
}
void* CudaRaster::getDepthBuffer(void)
{
return m_impl->getDepthBuffer();
}
void CudaRaster::swapDepthAndPeel(void)
{
m_impl->swapDepthAndPeel();
}
//------------------------------------------------------------------------
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include <cuda_runtime.h>
#include <cstdint>
namespace CR
{
//------------------------------------------------------------------------
#ifndef NULL
# define NULL 0
#endif
#ifdef __CUDACC__
# define CR_CUDA 1
#else
# define CR_CUDA 0
#endif
#if CR_CUDA
# define CR_CUDA_FUNC __device__ __inline__
# define CR_CUDA_CONST __constant__
#else
# define CR_CUDA_FUNC inline
# define CR_CUDA_CONST static const
#endif
#define CR_UNREF(X) ((void)(X))
#define CR_ARRAY_SIZE(X) ((int)(sizeof(X) / sizeof((X)[0])))
//------------------------------------------------------------------------
typedef uint8_t U8;
typedef uint16_t U16;
typedef uint32_t U32;
typedef uint64_t U64;
typedef int8_t S8;
typedef int16_t S16;
typedef int32_t S32;
typedef int64_t S64;
typedef float F32;
typedef double F64;
typedef void (*FuncPtr)(void);
//------------------------------------------------------------------------
#define CR_U32_MAX (0xFFFFFFFFu)
#define CR_S32_MIN (~0x7FFFFFFF)
#define CR_S32_MAX (0x7FFFFFFF)
#define CR_U64_MAX ((U64)(S64)-1)
#define CR_S64_MIN ((S64)-1 << 63)
#define CR_S64_MAX (~((S64)-1 << 63))
#define CR_F32_MIN (1.175494351e-38f)
#define CR_F32_MAX (3.402823466e+38f)
#define CR_F64_MIN (2.2250738585072014e-308)
#define CR_F64_MAX (1.7976931348623158e+308)
//------------------------------------------------------------------------
// Misc types.
class Vec2i
{
public:
Vec2i(int x_, int y_) : x(x_), y(y_) {}
int x, y;
};
class Vec3i
{
public:
Vec3i(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
int x, y, z;
};
//------------------------------------------------------------------------
// CUDA utilities.
#if CR_CUDA
# define globalThreadIdx (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y)))
#endif
//------------------------------------------------------------------------
} // namespace CR
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// Utility funcs.
//------------------------------------------------------------------------
__device__ __inline__ void initTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth)
{
tileZMax = CR_DEPTH_MAX;
tileZUpd = (::min(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]) < tileZMax);
}
__device__ __inline__ void updateTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth, volatile U32* temp)
{
// Entry is warp-coherent.
if (__any_sync(~0u, tileZUpd))
{
U32 z = ::max(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]); __syncwarp();
temp[threadIdx.x + 16] = z; __syncwarp();
z = ::max(z, temp[threadIdx.x + 16 - 1]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
z = ::max(z, temp[threadIdx.x + 16 - 2]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
z = ::max(z, temp[threadIdx.x + 16 - 4]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
z = ::max(z, temp[threadIdx.x + 16 - 8]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
z = ::max(z, temp[threadIdx.x + 16 - 16]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
tileZMax = temp[47];
tileZUpd = false;
}
}
//------------------------------------------------------------------------
__device__ __inline__ void getTriangle(const CRParams& p, S32& triIdx, S32& dataIdx, uint4& triHeader, S32& segment)
{
const CRTriangleHeader* triHeaderPtr = (const CRTriangleHeader*)p.triHeader + blockIdx.z * p.maxSubtris;
const S32* tileSegData = (const S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
const S32* tileSegNext = (const S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
const S32* tileSegCount = (const S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
if (threadIdx.x >= tileSegCount[segment])
{
triIdx = -1;
dataIdx = -1;
}
else
{
int subtriIdx = tileSegData[segment * CR_TILE_SEG_SIZE + threadIdx.x];
triIdx = subtriIdx >> 3;
dataIdx = triIdx;
subtriIdx &= 7;
if (subtriIdx != 7)
dataIdx = triHeaderPtr[triIdx].misc + subtriIdx;
triHeader = *((uint4*)triHeaderPtr + dataIdx);
}
// advance to next segment
segment = tileSegNext[segment];
}
//------------------------------------------------------------------------
__device__ __inline__ bool earlyZCull(uint4 triHeader, U32 tileZMax)
{
U32 zmin = triHeader.w & 0xFFFFF000;
return (zmin > tileZMax);
}
//------------------------------------------------------------------------
__device__ __inline__ U64 trianglePixelCoverage(const CRParams& p, const uint4& triHeader, int tileX, int tileY, volatile U64* s_cover8x8_lut)
{
int baseX = (tileX << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.widthPixels - 1) << (CR_SUBPIXEL_LOG2 - 1));
int baseY = (tileY << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.heightPixels - 1) << (CR_SUBPIXEL_LOG2 - 1));
// extract S16 vertex positions while subtracting tile coordinates
S32 v0x = sub_s16lo_s16lo(triHeader.x, baseX);
S32 v0y = sub_s16hi_s16lo(triHeader.x, baseY);
S32 v01x = sub_s16lo_s16lo(triHeader.y, triHeader.x);
S32 v01y = sub_s16hi_s16hi(triHeader.y, triHeader.x);
S32 v20x = sub_s16lo_s16lo(triHeader.x, triHeader.z);
S32 v20y = sub_s16hi_s16hi(triHeader.x, triHeader.z);
// extract flipbits
U32 f01 = (triHeader.w >> 6) & 0x3C;
U32 f12 = (triHeader.w >> 2) & 0x3C;
U32 f20 = (triHeader.w << 2) & 0x3C;
// compute per-edge coverage masks
U64 c01, c12, c20;
c01 = cover8x8_exact_fast(v0x, v0y, v01x, v01y, f01, s_cover8x8_lut);
c12 = cover8x8_exact_fast(v0x + v01x, v0y + v01y, -v01x - v20x, -v01y - v20y, f12, s_cover8x8_lut);
c20 = cover8x8_exact_fast(v0x, v0y, v20x, v20y, f20, s_cover8x8_lut);
// combine masks
return c01 & c12 & c20;
}
//------------------------------------------------------------------------
__device__ __inline__ U32 scan32_value(U32 value, volatile U32* temp)
{
// Warp-wide inclusive prefix sum in shared memory. The first 16 elements of
// temp must be zero; the warp total ends up in temp[47] (see scan32_total).
__syncwarp();
temp[threadIdx.x + 16] = value; __syncwarp();
value += temp[threadIdx.x + 16 - 1]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
value += temp[threadIdx.x + 16 - 2]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
value += temp[threadIdx.x + 16 - 4]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
value += temp[threadIdx.x + 16 - 8]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
value += temp[threadIdx.x + 16 - 16]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
return value;
}
__device__ __inline__ volatile const U32& scan32_total(volatile U32* temp)
{
return temp[47];
}
//------------------------------------------------------------------------
__device__ __inline__ S32 findBit(U64 mask, int idx)
{
U32 x = getLo(mask);
int pop = __popc(x);
bool p = (pop <= idx);
if (p) x = getHi(mask);
if (p) idx -= pop;
int bit = p ? 32 : 0;
pop = __popc(x & 0x0000ffffu);
p = (pop <= idx);
if (p) x >>= 16;
if (p) bit += 16;
if (p) idx -= pop;
U32 tmp = x & 0x000000ffu;
pop = __popc(tmp);
p = (pop <= idx);
if (p) tmp = x & 0x0000ff00u;
if (p) idx -= pop;
return findLeadingOne(tmp) + bit - idx;
}
//------------------------------------------------------------------------
// Single-sample implementation.
//------------------------------------------------------------------------
__device__ __inline__ void executeROP(U32 color, U32 depth, volatile U32* pColor, volatile U32* pDepth, U32 ropMask)
{
// Lock-free depth test and color write: take the min depth atomically, then
// use atomicMax on the lane index to pick a single winner among the threads
// whose depth equals the post-update minimum, so exactly one lane writes color.
atomicMin((U32*)pDepth, depth);
__syncwarp(ropMask);
bool act = (depth == *pDepth);
__syncwarp(ropMask);
U32 actMask = __ballot_sync(ropMask, act);
if (act)
{
*pDepth = 0;
__syncwarp(actMask);
atomicMax((U32*)pDepth, threadIdx.x);
__syncwarp(actMask);
if (*pDepth == threadIdx.x)
{
*pDepth = depth;
*pColor = color;
}
__syncwarp(actMask);
}
}
//------------------------------------------------------------------------
__device__ __inline__ void fineRasterImpl(const CRParams p)
{
// for 20 warps:
__shared__ volatile U64 s_cover8x8_lut[CR_COVER8X8_LUT_SIZE]; // 6KB
__shared__ volatile U32 s_tileColor [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
__shared__ volatile U32 s_tileDepth [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
__shared__ volatile U32 s_tilePeel [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
__shared__ volatile U32 s_triDataIdx [CR_FINE_MAX_WARPS][64]; // 5KB CRTriangleData index
__shared__ volatile U64 s_triangleCov [CR_FINE_MAX_WARPS][64]; // 10KB coverage mask
__shared__ volatile U32 s_triangleFrag[CR_FINE_MAX_WARPS][64]; // 5KB fragment index
__shared__ volatile U32 s_temp [CR_FINE_MAX_WARPS][80]; // 6.25KB
// = 47.25KB total
CRAtomics& atomics = p.atomics[blockIdx.z];
const CRTriangleData* triData = (const CRTriangleData*)p.triData + blockIdx.z * p.maxSubtris;
const S32* activeTiles = (const S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
const S32* tileFirstSeg = (const S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
volatile U32* tileColor = s_tileColor[threadIdx.y];
volatile U32* tileDepth = s_tileDepth[threadIdx.y];
volatile U32* tilePeel = s_tilePeel[threadIdx.y];
volatile U32* triDataIdx = s_triDataIdx[threadIdx.y];
volatile U64* triangleCov = s_triangleCov[threadIdx.y];
volatile U32* triangleFrag = s_triangleFrag[threadIdx.y];
volatile U32* temp = s_temp[threadIdx.y];
if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs || atomics.numTileSegs > p.maxTileSegs)
return;
temp[threadIdx.x] = 0; // first 16 elements of temp are always zero
cover8x8_setupLUT(s_cover8x8_lut);
__syncthreads();
// loop over tiles
for (;;)
{
// pick a tile
if (threadIdx.x == 0)
temp[16] = atomicAdd(&atomics.fineCounter, 1);
__syncwarp();
int activeIdx = temp[16];
if (activeIdx >= atomics.numActiveTiles)
break;
int tileIdx = activeTiles[activeIdx];
S32 segment = tileFirstSeg[tileIdx];
int tileY = tileIdx / p.widthTiles;
int tileX = tileIdx - tileY * p.widthTiles;
int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
// initialize per-tile state
int triRead = 0, triWrite = 0;
int fragRead = 0, fragWrite = 0;
if (threadIdx.x == 0)
triangleFrag[63] = 0; // "previous triangle"
// deferred clear => clear tile
if (p.deferredClear)
{
tileColor[threadIdx.x] = p.clearColor;
tileDepth[threadIdx.x] = p.clearDepth;
tileColor[threadIdx.x + 32] = p.clearColor;
tileDepth[threadIdx.x + 32] = p.clearDepth;
}
else // otherwise => read tile from framebuffer
{
U32* pColor = (U32*)p.colorBuffer + p.widthPixels * p.heightPixels * blockIdx.z;
U32* pDepth = (U32*)p.depthBuffer + p.widthPixels * p.heightPixels * blockIdx.z;
tileColor[threadIdx.x] = pColor[px + p.widthPixels * py];
tileDepth[threadIdx.x] = pDepth[px + p.widthPixels * py];
tileColor[threadIdx.x + 32] = pColor[px + p.widthPixels * (py + 4)];
tileDepth[threadIdx.x + 32] = pDepth[px + p.widthPixels * (py + 4)];
}
// read peeling inputs if enabled
if (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling)
{
U32* pPeel = (U32*)p.peelBuffer + p.widthPixels * p.heightPixels * blockIdx.z;
tilePeel[threadIdx.x] = pPeel[px + p.widthPixels * py];
tilePeel[threadIdx.x + 32] = pPeel[px + p.widthPixels * (py + 4)];
}
U32 tileZMax;
bool tileZUpd;
initTileZMax(tileZMax, tileZUpd, tileDepth);
// process fragments
for(;;)
{
// need to queue more fragments?
if (fragWrite - fragRead < 32 && segment >= 0)
{
// update tile z - coherent over warp
updateTileZMax(tileZMax, tileZUpd, tileDepth, temp);
// read triangles
do
{
// read triangle index and data, advance to next segment
S32 triIdx, dataIdx;
uint4 triHeader;
getTriangle(p, triIdx, dataIdx, triHeader, segment);
// early z cull
if (triIdx >= 0 && earlyZCull(triHeader, tileZMax))
triIdx = -1;
// determine coverage
U64 coverage = trianglePixelCoverage(p, triHeader, tileX, tileY, s_cover8x8_lut);
S32 pop = (triIdx == -1) ? 0 : __popcll(coverage);
// fragment count scan
U32 frag = scan32_value(pop, temp);
frag += fragWrite; // frag now holds cumulative fragment count
fragWrite += scan32_total(temp);
// queue non-empty triangles
U32 goodMask = __ballot_sync(~0u, pop != 0);
if (pop != 0)
{
int idx = (triWrite + __popc(goodMask & getLaneMaskLt())) & 63;
triDataIdx [idx] = dataIdx;
triangleFrag[idx] = frag;
triangleCov [idx] = coverage;
}
triWrite += __popc(goodMask);
}
while (fragWrite - fragRead < 32 && segment >= 0);
}
__syncwarp();
// end of segment?
if (fragRead == fragWrite)
break;
// clear triangle boundaries
temp[threadIdx.x + 16] = 0;
__syncwarp();
// tag triangle boundaries
if (triRead + threadIdx.x < triWrite)
{
int idx = triangleFrag[(triRead + threadIdx.x) & 63] - fragRead;
if (idx <= 32)
temp[idx + 16 - 1] = 1;
}
__syncwarp();
int ropLaneIdx = threadIdx.x;
U32 boundaryMask = __ballot_sync(~0u, temp[ropLaneIdx + 16]);
// distribute fragments
bool hasFragment = (ropLaneIdx < fragWrite - fragRead);
U32 fragmentMask = __ballot_sync(~0u, hasFragment);
if (hasFragment)
{
int triBufIdx = (triRead + __popc(boundaryMask & getLaneMaskLt())) & 63;
int fragIdx = add_sub(fragRead, ropLaneIdx, triangleFrag[(triBufIdx - 1) & 63]);
U64 coverage = triangleCov[triBufIdx];
int pixelInTile = findBit(coverage, fragIdx);
int dataIdx = triDataIdx[triBufIdx];
// determine pixel position
U32 pixelX = (tileX << CR_TILE_LOG2) + (pixelInTile & 7);
U32 pixelY = (tileY << CR_TILE_LOG2) + (pixelInTile >> 3);
// depth test
U32 depth = 0;
uint4 td = *((uint4*)triData + dataIdx * (sizeof(CRTriangleData) >> 4));
depth = td.x * pixelX + td.y * pixelY + td.z;
bool zkill = (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) && (depth <= tilePeel[pixelInTile]);
if (!zkill)
{
U32 oldDepth = tileDepth[pixelInTile];
if (depth > oldDepth)
zkill = true;
else if (oldDepth == tileZMax)
tileZUpd = true; // we are replacing previous zmax => need to update
}
U32 ropMask = __ballot_sync(fragmentMask, !zkill);
if (!zkill)
executeROP(td.w, depth, &tileColor[pixelInTile], &tileDepth[pixelInTile], ropMask);
}
// no need to sync, as next up is updateTileZMax that does internal warp sync
// update counters
fragRead = ::min(fragRead + 32, fragWrite);
triRead += __popc(boundaryMask);
}
// Write tile back to the framebuffer.
if (true)
{
int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
U32* pColor = (U32*)p.colorBuffer + p.widthPixels * p.heightPixels * blockIdx.z;
U32* pDepth = (U32*)p.depthBuffer + p.widthPixels * p.heightPixels * blockIdx.z;
pColor[px + p.widthPixels * py] = tileColor[threadIdx.x];
pDepth[px + p.widthPixels * py] = tileDepth[threadIdx.x];
pColor[px + p.widthPixels * (py + 4)] = tileColor[threadIdx.x + 32];
pDepth[px + p.widthPixels * (py + 4)] = tileDepth[threadIdx.x + 32];
}
}
}
//------------------------------------------------------------------------
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "Defs.hpp"
#include "Constants.hpp"
namespace CR
{
//------------------------------------------------------------------------
// Projected triangle.
//------------------------------------------------------------------------
struct CRTriangleHeader
{
S16 v0x; // Subpixels relative to viewport center. Valid if triSubtris = 1.
S16 v0y;
S16 v1x;
S16 v1y;
S16 v2x;
S16 v2y;
U32 misc; // triSubtris=1: (zmin:20, f01:4, f12:4, f20:4), triSubtris>=2: (subtriBase)
};
//------------------------------------------------------------------------
struct CRTriangleData
{
U32 zx; // zx * sampleX + zy * sampleY + zb = lerp(CR_DEPTH_MIN, CR_DEPTH_MAX, (clipZ / clipW + 1) / 2)
U32 zy;
U32 zb;
U32 id; // Triangle id.
};
//------------------------------------------------------------------------
// Device-side structures.
//------------------------------------------------------------------------
struct CRAtomics
{
// Setup.
S32 numSubtris; // = numTris
// Bin.
S32 binCounter; // = 0
S32 numBinSegs; // = 0
// Coarse.
S32 coarseCounter; // = 0
S32 numTileSegs; // = 0
S32 numActiveTiles; // = 0
// Fine.
S32 fineCounter; // = 0
};
//------------------------------------------------------------------------
struct CRImageParams
{
S32 triOffset; // First triangle index to draw.
S32 triCount; // Number of triangles to draw.
S32 binBatchSize; // Number of triangles per batch.
};
//------------------------------------------------------------------------
struct CRParams
{
// Common.
CRAtomics* atomics; // Work counters. Per-image.
S32 numImages; // Batch size.
S32 totalCount; // In range mode, total number of triangles to render.
S32 instanceMode; // 0 = range mode, 1 = instance mode.
S32 numVertices; // Number of vertices in input buffer, not counting multiples in instance mode.
S32 numTriangles; // Number of triangles in input buffer.
void* vertexBuffer; // numVertices * float4(x, y, z, w)
void* indexBuffer; // numTriangles * int3(vi0, vi1, vi2)
S32 widthPixels; // Viewport size in pixels. Must be multiple of tile size (8x8).
S32 heightPixels;
S32 widthBins; // widthPixels / CR_BIN_SIZE
S32 heightBins; // heightPixels / CR_BIN_SIZE
S32 numBins; // widthBins * heightBins
S32 widthTiles; // widthPixels / CR_TILE_SIZE
S32 heightTiles; // heightPixels / CR_TILE_SIZE
S32 numTiles; // widthTiles * heightTiles
U32 renderModeFlags;
S32 deferredClear; // 1 = Clear framebuffer before rendering triangles.
U32 clearColor;
U32 clearDepth;
// These are uniform across batch.
S32 maxSubtris;
S32 maxBinSegs;
S32 maxTileSegs;
// Setup output / bin input.
void* triSubtris; // maxSubtris * U8
void* triHeader; // maxSubtris * CRTriangleHeader
void* triData; // maxSubtris * CRTriangleData
// Bin output / coarse input.
void* binSegData; // maxBinSegs * CR_BIN_SEG_SIZE * S32
void* binSegNext; // maxBinSegs * S32
void* binSegCount; // maxBinSegs * S32
void* binFirstSeg; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 segIdx), -1 = none
void* binTotal; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 numTris)
// Coarse output / fine input.
void* tileSegData; // maxTileSegs * CR_TILE_SEG_SIZE * S32
void* tileSegNext; // maxTileSegs * S32
void* tileSegCount; // maxTileSegs * S32
void* activeTiles; // CR_MAXTILES_SQR * (S32 tileIdx)
void* tileFirstSeg; // CR_MAXTILES_SQR * (S32 segIdx), -1 = none
// Surface buffers.
void* colorBuffer; // sizePixels.x * sizePixels.y * numImages * U32
void* depthBuffer; // sizePixels.x * sizePixels.y * numImages * U32
void* peelBuffer; // sizePixels.x * sizePixels.y * numImages * U32, only if peeling enabled.
// Per-image parameters for the first images are embedded here to avoid an extra memcpy for small batches.
CRImageParams imageParamsFirst[CR_EMBED_IMAGE_PARAMS];
const CRImageParams* imageParamsExtra; // After CR_EMBED_IMAGE_PARAMS.
};
//------------------------------------------------------------------------
}
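The zx/zy/zb fields of CRTriangleData define a fixed-point depth plane: as noted in the struct comment, the interpolated depth at a sample is zx * sampleX + zy * sampleY + zb, with clip-space z/w in [-1, 1] remapped to [CR_DEPTH_MIN, CR_DEPTH_MAX]. A hedged sketch of evaluating it at a pixel, mirroring the expression used in the fine rasterizer's inner loop:

//------------------------------------------------------------------------
// Illustrative sketch (not part of the headers above): evaluating the
// fixed-point depth plane stored in CRTriangleData at a given pixel.
//------------------------------------------------------------------------
CR_CUDA_FUNC CR::U32 evalTriangleDepth(const CR::CRTriangleData& td, CR::U32 pixelX, CR::U32 pixelY)
{
    // Unsigned wrap-around arithmetic is intentional; inside the triangle the
    // result stays within [CR_DEPTH_MIN, CR_DEPTH_MAX] up to CR_LERP_ERROR ULPs.
    return td.zx * pixelX + td.zy * pixelY + td.zb;
}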
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "../../framework.h"
#include "PrivateDefs.hpp"
#include "Constants.hpp"
#include "RasterImpl.hpp"
#include <cuda_runtime.h>
using namespace CR;
using std::min;
using std::max;
//------------------------------------------------------------------------
// Kernel prototypes and variables.
void triangleSetupKernel (const CRParams p);
void binRasterKernel (const CRParams p);
void coarseRasterKernel (const CRParams p);
void fineRasterKernel (const CRParams p);
//------------------------------------------------------------------------
RasterImpl::RasterImpl(void)
: m_renderModeFlags (0),
m_deferredClear (false),
m_clearColor (0),
m_vertexPtr (NULL),
m_indexPtr (NULL),
m_numVertices (0),
m_numTriangles (0),
m_bufferSizesReported (0),
m_numImages (0),
m_sizePixels (0, 0),
m_sizeBins (0, 0),
m_numBins (0),
m_sizeTiles (0, 0),
m_numTiles (0),
m_numSMs (1),
m_numCoarseBlocksPerSM (1),
m_numFineBlocksPerSM (1),
m_numFineWarpsPerBlock (1),
m_maxSubtris (1),
m_maxBinSegs (1),
m_maxTileSegs (1)
{
// Query relevant device attributes.
int currentDevice = 0;
NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&currentDevice));
NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&m_numSMs, cudaDevAttrMultiProcessorCount, currentDevice));
cudaFuncAttributes attr;
NVDR_CHECK_CUDA_ERROR(cudaFuncGetAttributes(&attr, (void*)fineRasterKernel));
m_numFineWarpsPerBlock = min(attr.maxThreadsPerBlock / 32, CR_FINE_MAX_WARPS);
NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numCoarseBlocksPerSM, (void*)coarseRasterKernel, 32 * CR_COARSE_WARPS, 0));
NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numFineBlocksPerSM, (void*)fineRasterKernel, 32 * m_numFineWarpsPerBlock, 0));
// Setup functions.
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)triangleSetupKernel, cudaFuncCachePreferShared));
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)binRasterKernel, cudaFuncCachePreferShared));
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)coarseRasterKernel, cudaFuncCachePreferShared));
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)fineRasterKernel, cudaFuncCachePreferShared));
}
//------------------------------------------------------------------------
RasterImpl::~RasterImpl(void)
{
// Empty.
}
//------------------------------------------------------------------------
void RasterImpl::setViewportSize(Vec3i size)
{
if ((size.x | size.y) & (CR_TILE_SIZE - 1))
return; // Invalid size.
m_numImages = size.z;
m_sizePixels = Vec2i(size.x, size.y);
m_sizeTiles.x = m_sizePixels.x >> CR_TILE_LOG2;
m_sizeTiles.y = m_sizePixels.y >> CR_TILE_LOG2;
m_numTiles = m_sizeTiles.x * m_sizeTiles.y;
m_sizeBins.x = (m_sizeTiles.x + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
m_sizeBins.y = (m_sizeTiles.y + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
m_numBins = m_sizeBins.x * m_sizeBins.y;
m_colorBuffer.reset(m_sizePixels.x * m_sizePixels.y * m_numImages * sizeof(U32));
m_depthBuffer.reset(m_sizePixels.x * m_sizePixels.y * m_numImages * sizeof(U32));
}
void RasterImpl::swapDepthAndPeel(void)
{
m_peelBuffer.reset(m_depthBuffer.getSize()); // Ensure equal size and valid pointer.
void* tmp = m_depthBuffer.getPtr();
m_depthBuffer.setPtr(m_peelBuffer.getPtr());
m_peelBuffer.setPtr(tmp);
}
//------------------------------------------------------------------------
bool RasterImpl::drawTriangles(const Vec2i* ranges, cudaStream_t stream)
{
bool instanceMode = (!ranges);
int maxSubtrisSlack = 4096; // x 81B = 324KB
int maxBinSegsSlack = 256; // x 2137B = 534KB
int maxTileSegsSlack = 4096; // x 136B = 544KB
// Resize atomics as needed.
m_crAtomics .grow(m_numImages * sizeof(CRAtomics));
// Size of these buffers doesn't depend on input.
m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
m_binTotal .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
m_activeTiles .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
m_tileFirstSeg .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
// Construct per-image parameters and determine worst-case buffer sizes.
std::vector<CRImageParams> imageParams(m_numImages);
for (int i=0; i < m_numImages; i++)
{
CRImageParams& ip = imageParams[i];
int roundSize = CR_BIN_WARPS * 32;
int minBatches = CR_BIN_STREAMS_SIZE * 2;
int maxRounds = 32;
ip.triOffset = instanceMode ? 0 : ranges[i].x;
ip.triCount = instanceMode ? m_numTriangles : ranges[i].y;
ip.binBatchSize = min(max(ip.triCount / (roundSize * minBatches), 1), maxRounds) * roundSize;
m_maxSubtris = max(m_maxSubtris, min(ip.triCount + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
m_maxBinSegs = max(m_maxBinSegs, max(m_numBins * CR_BIN_STREAMS_SIZE, (ip.triCount - 1) / CR_BIN_SEG_SIZE + 1) + maxBinSegsSlack);
m_maxTileSegs = max(m_maxTileSegs, max(m_numTiles, (ip.triCount - 1) / CR_TILE_SEG_SIZE + 1) + maxTileSegsSlack);
}
// Retry until successful.
for (;;)
{
// Allocate buffers.
m_triSubtris.reset(m_numImages * m_maxSubtris * sizeof(U8));
m_triHeader .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleHeader));
m_triData .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleData));
m_binSegData .reset(m_numImages * m_maxBinSegs * CR_BIN_SEG_SIZE * sizeof(S32));
m_binSegNext .reset(m_numImages * m_maxBinSegs * sizeof(S32));
m_binSegCount.reset(m_numImages * m_maxBinSegs * sizeof(S32));
m_tileSegData .reset(m_numImages * m_maxTileSegs * CR_TILE_SEG_SIZE * sizeof(S32));
m_tileSegNext .reset(m_numImages * m_maxTileSegs * sizeof(S32));
m_tileSegCount.reset(m_numImages * m_maxTileSegs * sizeof(S32));
// Report if buffers grow from last time.
size_t sizesTotal = getTotalBufferSizes();
if (sizesTotal > m_bufferSizesReported)
{
size_t sizesMB = ((sizesTotal - 1) >> 20) + 1; // Round up.
sizesMB = ((sizesMB + 9) / 10) * 10; // 10MB granularity enough in this day and age.
LOG(INFO) << "Internal buffers grown to " << sizesMB << " MB";
m_bufferSizesReported = sizesMB << 20;
}
// Launch stages.
launchStages(&imageParams[0], instanceMode, stream);
// Get atomics.
std::vector<CRAtomics> atomics(m_numImages);
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(&atomics[0], m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
// Success?
bool failed = false;
for (int i=0; i < m_numImages; i++)
{
const CRAtomics& a = atomics[i];
failed = failed || (a.numSubtris > m_maxSubtris) || (a.numBinSegs > m_maxBinSegs) || (a.numTileSegs > m_maxTileSegs);
}
if (!failed)
break; // Success!
// If we were already at maximum capacity, no can do.
if (m_maxSubtris == CR_MAXSUBTRIS_SIZE)
return false;
// Enlarge buffers and try again.
for (int i=0; i < m_numImages; i++)
{
const CRAtomics& a = atomics[i];
m_maxSubtris = max(m_maxSubtris, min(a.numSubtris + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
m_maxBinSegs = max(m_maxBinSegs, a.numBinSegs + maxBinSegsSlack);
m_maxTileSegs = max(m_maxTileSegs, a.numTileSegs + maxTileSegsSlack);
}
}
m_deferredClear = false;
return true; // Success.
}
//------------------------------------------------------------------------
size_t RasterImpl::getTotalBufferSizes(void) const
{
return
m_colorBuffer.getSize() + m_depthBuffer.getSize() + // Don't include atomics and image params.
m_triSubtris.getSize() + m_triHeader.getSize() + m_triData.getSize() +
m_binFirstSeg.getSize() + m_binTotal.getSize() + m_binSegData.getSize() + m_binSegNext.getSize() + m_binSegCount.getSize() +
m_activeTiles.getSize() + m_tileFirstSeg.getSize() + m_tileSegData.getSize() + m_tileSegNext.getSize() + m_tileSegCount.getSize();
}
//------------------------------------------------------------------------
void RasterImpl::launchStages(const CRImageParams* imageParams, bool instanceMode, cudaStream_t stream)
{
// Initialize atomics to mostly zero.
{
std::vector<CRAtomics> atomics(m_numImages);
memset(&atomics[0], 0, m_numImages * sizeof(CRAtomics));
for (int i=0; i < m_numImages; i++)
atomics[i].numSubtris = imageParams[i].triCount;
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), &atomics[0], m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
}
// Copy per-image parameters if more than fit in the kernel launch parameter block.
if (m_numImages > CR_EMBED_IMAGE_PARAMS)
{
int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS;
m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams));
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crImageParamsExtra.getPtr(), imageParams + CR_EMBED_IMAGE_PARAMS, numImageParamsExtra * sizeof(CRImageParams), cudaMemcpyHostToDevice, stream));
}
// Set global parameters.
CRParams p;
{
p.atomics = (CRAtomics*)m_crAtomics.getPtr();
p.numImages = m_numImages;
p.totalCount = 0; // Only relevant in range mode.
p.instanceMode = instanceMode ? 1 : 0;
p.numVertices = m_numVertices;
p.numTriangles = m_numTriangles;
p.vertexBuffer = m_vertexPtr;
p.indexBuffer = m_indexPtr;
p.widthPixels = m_sizePixels.x;
p.heightPixels = m_sizePixels.y;
p.widthBins = m_sizeBins.x;
p.heightBins = m_sizeBins.y;
p.numBins = m_numBins;
p.widthTiles = m_sizeTiles.x;
p.heightTiles = m_sizeTiles.y;
p.numTiles = m_numTiles;
p.renderModeFlags = m_renderModeFlags;
p.deferredClear = m_deferredClear ? 1 : 0;
p.clearColor = m_clearColor;
p.clearDepth = CR_DEPTH_MAX;
p.maxSubtris = m_maxSubtris;
p.maxBinSegs = m_maxBinSegs;
p.maxTileSegs = m_maxTileSegs;
p.triSubtris = m_triSubtris.getPtr();
p.triHeader = m_triHeader.getPtr();
p.triData = m_triData.getPtr();
p.binSegData = m_binSegData.getPtr();
p.binSegNext = m_binSegNext.getPtr();
p.binSegCount = m_binSegCount.getPtr();
p.binFirstSeg = m_binFirstSeg.getPtr();
p.binTotal = m_binTotal.getPtr();
p.tileSegData = m_tileSegData.getPtr();
p.tileSegNext = m_tileSegNext.getPtr();
p.tileSegCount = m_tileSegCount.getPtr();
p.activeTiles = m_activeTiles.getPtr();
p.tileFirstSeg = m_tileFirstSeg.getPtr();
p.colorBuffer = m_colorBuffer.getPtr();
p.depthBuffer = m_depthBuffer.getPtr();
p.peelBuffer = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? m_peelBuffer.getPtr() : 0;
memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams));
p.imageParamsExtra = (CRImageParams*)m_crImageParamsExtra.getPtr();
}
// Setup block sizes.
dim3 brBlock(32, CR_BIN_WARPS);
dim3 crBlock(32, CR_COARSE_WARPS);
dim3 frBlock(32, m_numFineWarpsPerBlock);
// Launch stages.
void* args[] = {&p};
if (instanceMode)
{
int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
}
else
{
for (int i=0; i < m_numImages; i++)
p.totalCount += imageParams[i].triCount;
int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
}
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream));
}
//------------------------------------------------------------------------
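As a concrete instance of the binBatchSize heuristic in RasterImpl::drawTriangles() above, using the defaults from Constants.hpp (CR_BIN_WARPS = 16 and CR_BIN_STREAMS_SIZE = 16, so roundSize = 512 and minBatches = 32): for an image with 100,000 triangles, triCount / (roundSize * minBatches) = 100000 / 16384 = 6, which clamped to [1, 32] and multiplied by roundSize gives a batch of 3072 triangles per atomic grab in the bin rasterizer. Tiny inputs bottom out at a single round of 512 triangles, and very large ones cap at 32 rounds, i.e. 16,384 triangles per batch.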
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "../CudaRaster.hpp"
#include "PrivateDefs.hpp"
#include "Constants.hpp"
#include "Util.inl"
namespace CR
{
//------------------------------------------------------------------------
// Stage implementations.
//------------------------------------------------------------------------
#include "TriangleSetup.inl"
#include "BinRaster.inl"
#include "CoarseRaster.inl"
#include "FineRaster.inl"
}
//------------------------------------------------------------------------
// Stage entry points.
//------------------------------------------------------------------------
__global__ void __launch_bounds__(CR_SETUP_WARPS * 32, CR_SETUP_OPT_BLOCKS) triangleSetupKernel (const CR::CRParams p) { CR::triangleSetupImpl(p); }
__global__ void __launch_bounds__(CR_BIN_WARPS * 32, 1) binRasterKernel (const CR::CRParams p) { CR::binRasterImpl(p); }
__global__ void __launch_bounds__(CR_COARSE_WARPS * 32, 1) coarseRasterKernel (const CR::CRParams p) { CR::coarseRasterImpl(p); }
__global__ void __launch_bounds__(CR_FINE_MAX_WARPS * 32, 1) fineRasterKernel (const CR::CRParams p) { CR::fineRasterImpl(p); }
//------------------------------------------------------------------------
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "PrivateDefs.hpp"
#include "Buffer.hpp"
#include "../CudaRaster.hpp"
namespace CR
{
//------------------------------------------------------------------------
class RasterImpl
{
public:
RasterImpl (void);
~RasterImpl (void);
void setViewportSize (Vec3i size); // Must be multiple of tile size (8x8).
void setRenderModeFlags (U32 flags) { m_renderModeFlags = flags; }
void deferredClear (U32 color) { m_deferredClear = true; m_clearColor = color; }
void setVertexBuffer (void* ptr, int numVertices) { m_vertexPtr = ptr; m_numVertices = numVertices; } // GPU pointer.
void setIndexBuffer (void* ptr, int numTriangles) { m_indexPtr = ptr; m_numTriangles = numTriangles; } // GPU pointer.
bool drawTriangles (const Vec2i* ranges, cudaStream_t stream);
void* getColorBuffer (void) { return m_colorBuffer.getPtr(); } // GPU pointer.
void* getDepthBuffer (void) { return m_depthBuffer.getPtr(); } // GPU pointer.
void swapDepthAndPeel (void);
size_t getTotalBufferSizes (void) const;
private:
void launchStages (const CRImageParams* imageParams, bool instanceMode, cudaStream_t stream);
// State.
unsigned int m_renderModeFlags;
bool m_deferredClear;
unsigned int m_clearColor;
void* m_vertexPtr;
void* m_indexPtr;
int m_numVertices; // Input buffer size.
int m_numTriangles; // Input buffer size.
size_t m_bufferSizesReported; // Previously reported buffer sizes.
// Surfaces.
Buffer m_colorBuffer;
Buffer m_depthBuffer;
Buffer m_peelBuffer;
int m_numImages;
Vec2i m_sizePixels;
Vec2i m_sizeBins;
S32 m_numBins;
Vec2i m_sizeTiles;
S32 m_numTiles;
// Launch sizes etc.
S32 m_numSMs;
S32 m_numCoarseBlocksPerSM;
S32 m_numFineBlocksPerSM;
S32 m_numFineWarpsPerBlock;
// Global intermediate buffers. Individual images have offsets to these.
Buffer m_crAtomics;
Buffer m_crImageParamsExtra;
Buffer m_triSubtris;
Buffer m_triHeader;
Buffer m_triData;
Buffer m_binFirstSeg;
Buffer m_binTotal;
Buffer m_binSegData;
Buffer m_binSegNext;
Buffer m_binSegCount;
Buffer m_activeTiles;
Buffer m_tileFirstSeg;
Buffer m_tileSegData;
Buffer m_tileSegNext;
Buffer m_tileSegCount;
// Actual buffer sizes.
S32 m_maxSubtris;
S32 m_maxBinSegs;
S32 m_maxTileSegs;
};
//------------------------------------------------------------------------
} // namespace CR
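//------------------------------------------------------------------------
// Illustrative sketch only (not part of the library): how a caller might drive
// RasterImpl based solely on the interface above. The device pointers, sizes,
// the stream, the assumption that Vec3i is constructible from three ints
// (width, height, numImages), and the assumption that a NULL range pointer
// means "draw all triangles" come from the caller, not from code shown here.
static inline bool exampleDrawOnePass(CR::RasterImpl& rast,
                                      void* d_vertices, int numVertices,  // float4 clip-space positions (GPU).
                                      void* d_indices, int numTriangles,  // Triangle index buffer (GPU).
                                      cudaStream_t stream)
{
    rast.setViewportSize(CR::Vec3i(1024, 768, 1)); // Width/height chosen as multiples of the 8x8 tile size.
    rast.setVertexBuffer(d_vertices, numVertices);
    rast.setIndexBuffer(d_indices, numTriangles);
    rast.deferredClear(0u);                        // Clear color and depth at the start of the draw.
    return rast.drawTriangles(nullptr, stream);    // false is assumed to signal an internal buffer overflow.
}
//------------------------------------------------------------------------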
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
__device__ __inline__ void snapTriangle(
const CRParams& p,
float4 v0, float4 v1, float4 v2,
int2& p0, int2& p1, int2& p2, float3& rcpW, int2& lo, int2& hi)
{
F32 viewScaleX = (F32)(p.widthPixels << (CR_SUBPIXEL_LOG2 - 1));
F32 viewScaleY = (F32)(p.heightPixels << (CR_SUBPIXEL_LOG2 - 1));
rcpW = make_float3(1.0f / v0.w, 1.0f / v1.w, 1.0f / v2.w);
p0 = make_int2(f32_to_s32_sat(v0.x * rcpW.x * viewScaleX), f32_to_s32_sat(v0.y * rcpW.x * viewScaleY));
p1 = make_int2(f32_to_s32_sat(v1.x * rcpW.y * viewScaleX), f32_to_s32_sat(v1.y * rcpW.y * viewScaleY));
p2 = make_int2(f32_to_s32_sat(v2.x * rcpW.z * viewScaleX), f32_to_s32_sat(v2.y * rcpW.z * viewScaleY));
lo = make_int2(min_min(p0.x, p1.x, p2.x), min_min(p0.y, p1.y, p2.y));
hi = make_int2(max_max(p0.x, p1.x, p2.x), max_max(p0.y, p1.y, p2.y));
}
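//------------------------------------------------------------------------
// Note (editorial): snapTriangle() converts clip-space positions to signed
// fixed-point subpixel coordinates: x/w is scaled by widthPixels/2, expressed
// in units of 1/2^CR_SUBPIXEL_LOG2 of a pixel, with the origin at the viewport
// center. The shift back to corner-origin coordinates happens later via the
// biasX/biasY terms in prepareTriangle() and setupTriangle().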
//------------------------------------------------------------------------
__device__ __inline__ U32 cover8x8_selectFlips(S32 dx, S32 dy) // 10 instr
{
U32 flips = 0;
if (dy > 0 || (dy == 0 && dx <= 0))
flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y) ^ (1 << CR_FLIPBIT_COMPL);
if (dx > 0)
flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y);
if (::abs(dx) < ::abs(dy))
flips ^= (1 << CR_FLIPBIT_SWAP_XY) ^ (1 << CR_FLIPBIT_FLIP_Y);
return flips;
}
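//------------------------------------------------------------------------
// Note (editorial, hedged): the flip bits appear to mirror/swap each edge into
// a canonical orientation (|dx| >= |dy| with fixed sign conventions) so that
// the 8x8 coverage-mask computation only has to handle one octant; the same
// bits presumably let the consumer undo the transform when applying the
// resulting mask.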
//------------------------------------------------------------------------
__device__ __inline__ bool prepareTriangle(
const CRParams& p,
int2 p0, int2 p1, int2 p2, int2 lo, int2 hi,
int2& d1, int2& d2, S32& area)
{
// Backfacing or degenerate => cull.
d1 = make_int2(p1.x - p0.x, p1.y - p0.y);
d2 = make_int2(p2.x - p0.x, p2.y - p0.y);
area = d1.x * d2.y - d1.y * d2.x;
if (area == 0)
return false; // Degenerate.
if (area < 0 && (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableBackfaceCulling) != 0)
return false; // Backfacing.
// AABB falls between samples => cull.
int sampleSize = 1 << CR_SUBPIXEL_LOG2;
int biasX = (p.widthPixels << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1);
int biasY = (p.heightPixels << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1);
int lox = (int)add_add(lo.x, sampleSize - 1, biasX) & -sampleSize;
int loy = (int)add_add(lo.y, sampleSize - 1, biasY) & -sampleSize;
int hix = (hi.x + biasX) & -sampleSize;
int hiy = (hi.y + biasY) & -sampleSize;
if (lox > hix || loy > hiy)
return false; // Between pixels.
// AABB covers 1 or 2 samples => cull if they are not covered.
int diff = add_sub(hix, hiy, lox) - loy;
if (diff <= sampleSize)
{
int2 t0 = make_int2(add_sub(p0.x, biasX, lox), add_sub(p0.y, biasY, loy));
int2 t1 = make_int2(add_sub(p1.x, biasX, lox), add_sub(p1.y, biasY, loy));
int2 t2 = make_int2(add_sub(p2.x, biasX, lox), add_sub(p2.y, biasY, loy));
S32 e0 = t0.x * t1.y - t0.y * t1.x;
S32 e1 = t1.x * t2.y - t1.y * t2.x;
S32 e2 = t2.x * t0.y - t2.y * t0.x;
if (e0 < 0 || e1 < 0 || e2 < 0)
{
if (diff == 0)
return false; // Between pixels.
t0 = make_int2(add_sub(p0.x, biasX, hix), add_sub(p0.y, biasY, hiy));
t1 = make_int2(add_sub(p1.x, biasX, hix), add_sub(p1.y, biasY, hiy));
t2 = make_int2(add_sub(p2.x, biasX, hix), add_sub(p2.y, biasY, hiy));
e0 = t0.x * t1.y - t0.y * t1.x;
e1 = t1.x * t2.y - t1.y * t2.x;
e2 = t2.x * t0.y - t2.y * t0.x;
if (e0 < 0 || e1 < 0 || e2 < 0)
return false; // Between pixels.
}
}
// Otherwise => proceed to output the triangle.
return true; // Visible.
}
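//------------------------------------------------------------------------
// Note (editorial): the AABB test above rounds the lower bound up and the
// upper bound down to the nearest sample positions (the '& -sampleSize'
// masks); if the rounded bounds cross, no sample center can lie inside the
// box. When the box spans only one or two candidate samples, exact
// edge-function tests against those sample positions decide the cull instead.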
//------------------------------------------------------------------------
__device__ __inline__ void setupTriangle(
const CRParams& p,
CRTriangleHeader* th, CRTriangleData* td, int triId,
float v0z, float v1z, float v2z,
int2 p0, int2 p1, int2 p2, float3 rcpW,
int2 d1, int2 d2, S32 area)
{
// Swap vertices 1 and 2 if area is negative. Only executed if backface culling is
// disabled (if it is enabled, we never come here with area < 0).
if (area < 0)
{
swap(d1, d2);
swap(p1, p2);
swap(v1z, v2z);
swap(rcpW.y, rcpW.z);
area = -area;
}
int2 wv0;
wv0.x = p0.x + (p.widthPixels << (CR_SUBPIXEL_LOG2 - 1));
wv0.y = p0.y + (p.heightPixels << (CR_SUBPIXEL_LOG2 - 1));
// Setup depth plane equation.
F32 zcoef = (F32)(CR_DEPTH_MAX - CR_DEPTH_MIN) * 0.5f;
F32 zbias = (F32)(CR_DEPTH_MAX + CR_DEPTH_MIN) * 0.5f;
float3 zvert = make_float3(
(v0z * zcoef) * rcpW.x + zbias,
(v1z * zcoef) * rcpW.y + zbias,
(v2z * zcoef) * rcpW.z + zbias
);
int2 zv0 = make_int2(
wv0.x - (1 << (CR_SUBPIXEL_LOG2 - 1)),
wv0.y - (1 << (CR_SUBPIXEL_LOG2 - 1))
);
uint3 zpleq = setupPleq(zvert, zv0, d1, d2, 1.0f / (F32)area);
U32 zmin = f32_to_u32_sat(fminf(fminf(zvert.x, zvert.y), zvert.z) - (F32)CR_LERP_ERROR(0));
// Write CRTriangleData.
*(uint4*)td = make_uint4(zpleq.x, zpleq.y, zpleq.z, triId);
// Determine flipbits.
U32 f01 = cover8x8_selectFlips(d1.x, d1.y);
U32 f12 = cover8x8_selectFlips(d2.x - d1.x, d2.y - d1.y);
U32 f20 = cover8x8_selectFlips(-d2.x, -d2.y);
// Write CRTriangleHeader.
*(uint4*)th = make_uint4(
prmt(p0.x, p0.y, 0x5410),
prmt(p1.x, p1.y, 0x5410),
prmt(p2.x, p2.y, 0x5410),
(zmin & 0xfffff000u) | (f01 << 6) | (f12 << 2) | (f20 >> 2));
}
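//------------------------------------------------------------------------
// Note (editorial, hedged): setupPleq() (defined elsewhere in this codebase)
// is assumed to build a screen-space plane equation z(x,y) = A*x + B*y + C
// from the per-vertex depths in 'zvert', which are z/w values remapped into
// the fixed-point range [CR_DEPTH_MIN, CR_DEPTH_MAX]. 'zmin' stores a
// conservative minimum depth in the triangle header, presumably for early
// depth rejection in the later raster stages.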
//------------------------------------------------------------------------
__device__ __inline__ void triangleSetupImpl(const CRParams p)
{
__shared__ F32 s_bary[CR_SETUP_WARPS * 32][18];
F32* bary = s_bary[threadIdx.x + threadIdx.y * 32];
// Compute task and image indices.
int taskIdx = threadIdx.x + 32 * (threadIdx.y + CR_SETUP_WARPS * blockIdx.x);
int imageIdx = 0;
if (p.instanceMode)
{
imageIdx = blockIdx.z;
if (taskIdx >= p.numTriangles)
return;
}
else
{
while (imageIdx < p.numImages)
{
int count = getImageParams(p, imageIdx).triCount;
if (taskIdx < count)
break;
taskIdx -= count;
imageIdx += 1;
}
if (imageIdx == p.numImages)
return;
}
// Per-image data structures.
const CRImageParams& ip = getImageParams(p, imageIdx);
CRAtomics& atomics = p.atomics[imageIdx];
const int* indexBuffer = (const int*)p.indexBuffer;
U8* triSubtris = (U8*)p.triSubtris + imageIdx * p.maxSubtris;
CRTriangleHeader* triHeader = (CRTriangleHeader*)p.triHeader + imageIdx * p.maxSubtris;
CRTriangleData* triData = (CRTriangleData*)p.triData + imageIdx * p.maxSubtris;
// Determine triangle index.
int triIdx = taskIdx;
if (!p.instanceMode)
triIdx += ip.triOffset;
// Read vertex indices.
if ((U32)triIdx >= (U32)p.numTriangles)
{
// Bad triangle index.
triSubtris[taskIdx] = 0;
return;
}
uint4 vidx;
vidx.x = indexBuffer[triIdx * 3 + 0];
vidx.y = indexBuffer[triIdx * 3 + 1];
vidx.z = indexBuffer[triIdx * 3 + 2];
vidx.w = triIdx + 1; // Triangle index.
if (vidx.x >= (U32)p.numVertices ||
vidx.y >= (U32)p.numVertices ||
vidx.z >= (U32)p.numVertices)
{
// Bad vertex index.
triSubtris[taskIdx] = 0;
return;
}
// Read vertex positions.
const float4* vertexBuffer = (const float4*)p.vertexBuffer;
if (p.instanceMode)
vertexBuffer += p.numVertices * imageIdx; // Instance offset.
float4 v0 = vertexBuffer[vidx.x];
float4 v1 = vertexBuffer[vidx.y];
float4 v2 = vertexBuffer[vidx.z];
// Outside view frustum => cull.
if (v0.w < fabsf(v0.x) | v0.w < fabsf(v0.y) | v0.w < fabsf(v0.z))
{
if ((v0.w < +v0.x & v1.w < +v1.x & v2.w < +v2.x) |
(v0.w < -v0.x & v1.w < -v1.x & v2.w < -v2.x) |
(v0.w < +v0.y & v1.w < +v1.y & v2.w < +v2.y) |
(v0.w < -v0.y & v1.w < -v1.y & v2.w < -v2.y) |
(v0.w < +v0.z & v1.w < +v1.z & v2.w < +v2.z) |
(v0.w < -v0.z & v1.w < -v1.z & v2.w < -v2.z))
{
triSubtris[taskIdx] = 0;
return;
}
}
// Inside depth range => try to snap vertices.
if (v0.w >= fabsf(v0.z) & v1.w >= fabsf(v1.z) & v2.w >= fabsf(v2.z))
{
// Inside S16 range and small enough => fast path.
// Note: aabbLimit comes from the fact that cover8x8
// does not support guardband with maximal viewport.
int2 p0, p1, p2, lo, hi;
float3 rcpW;
snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
S32 loxy = ::min(lo.x, lo.y);
S32 hixy = ::max(hi.x, hi.y);
S32 aabbLimit = (1 << (CR_MAXVIEWPORT_LOG2 + CR_SUBPIXEL_LOG2)) - 1;
if (loxy >= -32768 && hixy <= 32767 && hixy - loxy <= aabbLimit)
{
int2 d1, d2;
S32 area;
bool res = prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area);
triSubtris[taskIdx] = res ? 1 : 0;
if (res)
setupTriangle(
p,
&triHeader[taskIdx], &triData[taskIdx], vidx.w,
v0.z, v1.z, v2.z,
p0, p1, p2, rcpW,
d1, d2, area);
return;
}
}
// Clip to view frustum.
float4 ov0 = v0;
float4 od1 = make_float4(v1.x - v0.x, v1.y - v0.y, v1.z - v0.z, v1.w - v0.w);
float4 od2 = make_float4(v2.x - v0.x, v2.y - v0.y, v2.z - v0.z, v2.w - v0.w);
int numVerts = clipTriangleWithFrustum(bary, &ov0.x, &v1.x, &v2.x, &od1.x, &od2.x);
// Count non-culled subtriangles.
v0.x = ov0.x + od1.x * bary[0] + od2.x * bary[1];
v0.y = ov0.y + od1.y * bary[0] + od2.y * bary[1];
v0.z = ov0.z + od1.z * bary[0] + od2.z * bary[1];
v0.w = ov0.w + od1.w * bary[0] + od2.w * bary[1];
v1.x = ov0.x + od1.x * bary[2] + od2.x * bary[3];
v1.y = ov0.y + od1.y * bary[2] + od2.y * bary[3];
v1.z = ov0.z + od1.z * bary[2] + od2.z * bary[3];
v1.w = ov0.w + od1.w * bary[2] + od2.w * bary[3];
float4 tv1 = v1;
int numSubtris = 0;
for (int i = 2; i < numVerts; i++)
{
v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];
int2 p0, p1, p2, lo, hi, d1, d2;
float3 rcpW;
S32 area;
snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
numSubtris++;
v1 = v2;
}
triSubtris[taskIdx] = numSubtris;
// Multiple subtriangles => allocate.
int subtriBase = taskIdx;
if (numSubtris > 1)
{
subtriBase = atomicAdd(&atomics.numSubtris, numSubtris);
triHeader[taskIdx].misc = subtriBase;
if (subtriBase + numSubtris > p.maxSubtris)
numVerts = 0;
}
// Setup subtriangles.
v1 = tv1;
for (int i = 2; i < numVerts; i++)
{
v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];
int2 p0, p1, p2, lo, hi, d1, d2;
float3 rcpW;
S32 area;
snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
{
setupTriangle(
p,
&triHeader[subtriBase], &triData[subtriBase], vidx.w,
v0.z, v1.z, v2.z,
p0, p1, p2, rcpW,
d1, d2, area);
subtriBase++;
}
v1 = v2;
}
}
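//------------------------------------------------------------------------
// Note (editorial): each setup thread handles one input triangle. Triangles
// that pass the quick frustum test and fit the fixed-point coordinate range
// take the fast path and write exactly one subtriangle in place at 'taskIdx'.
// Otherwise the triangle is clipped against the frustum into a fan; if more
// than one clipped piece survives, a contiguous block of output slots is
// claimed with atomicAdd() and its base offset is stored in the header's
// 'misc' field so the later stages can locate the extra subtriangles. If the
// allocation would exceed p.maxSubtris, numVerts is zeroed so nothing out of
// range is written.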
//------------------------------------------------------------------------
......@@ -9,6 +9,106 @@
#include "common.h"
#include "rasterize.h"
//------------------------------------------------------------------------
// Cuda forward rasterizer pixel shader kernel.
__global__ void RasterizeCudaFwdShaderKernel(const RasterizeCudaFwdShaderParams p)
{
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.width || py >= p.height || pz >= p.depth)
return;
// Pixel index.
int pidx = px + p.width * (py + p.height * pz);
// Fetch triangle idx.
int triIdx = p.in_idx[pidx] - 1;
if (triIdx < 0 || triIdx >= p.numTriangles)
{
// No or corrupt triangle.
((float4*)p.out)[pidx] = make_float4(0.0, 0.0, 0.0, 0.0); // Clear out.
((float4*)p.out_db)[pidx] = make_float4(0.0, 0.0, 0.0, 0.0); // Clear out_db.
return;
}
// Fetch vertex indices.
int vi0 = p.tri[triIdx * 3 + 0];
int vi1 = p.tri[triIdx * 3 + 1];
int vi2 = p.tri[triIdx * 3 + 2];
// Bail out if vertex indices are corrupt.
if (vi0 < 0 || vi0 >= p.numVertices ||
vi1 < 0 || vi1 >= p.numVertices ||
vi2 < 0 || vi2 >= p.numVertices)
return;
// In instance mode, adjust vertex indices by minibatch index.
if (p.instance_mode)
{
vi0 += pz * p.numVertices;
vi1 += pz * p.numVertices;
vi2 += pz * p.numVertices;
}
// Fetch vertex positions.
float4 p0 = ((float4*)p.pos)[vi0];
float4 p1 = ((float4*)p.pos)[vi1];
float4 p2 = ((float4*)p.pos)[vi2];
// Evaluate edge functions.
float fx = p.xs * (float)px + p.xo;
float fy = p.ys * (float)py + p.yo;
float p0x = p0.x - fx * p0.w;
float p0y = p0.y - fy * p0.w;
float p1x = p1.x - fx * p1.w;
float p1y = p1.y - fy * p1.w;
float p2x = p2.x - fx * p2.w;
float p2y = p2.y - fy * p2.w;
float a0 = p1x*p2y - p1y*p2x;
float a1 = p2x*p0y - p2y*p0x;
float a2 = p0x*p1y - p0y*p1x;
// Perspective correct, normalized barycentrics.
float iw = 1.f / (a0 + a1 + a2);
float b0 = a0 * iw;
float b1 = a1 * iw;
// Compute z/w for depth buffer.
float z = p0.z * a0 + p1.z * a1 + p2.z * a2;
float w = p0.w * a0 + p1.w * a1 + p2.w * a2;
float zw = z / w;
// Clamps to avoid NaNs.
b0 = __saturatef(b0); // Clamp to [+0.0, 1.0].
b1 = __saturatef(b1); // Clamp to [+0.0, 1.0].
zw = fmaxf(fminf(zw, 1.f), -1.f);
// Emit output.
((float4*)p.out)[pidx] = make_float4(b0, b1, zw, (float)(triIdx + 1));
// Calculate bary pixel differentials.
float dfxdx = p.xs * iw;
float dfydy = p.ys * iw;
float da0dx = p2.y*p1.w - p1.y*p2.w;
float da0dy = p1.x*p2.w - p2.x*p1.w;
float da1dx = p0.y*p2.w - p2.y*p0.w;
float da1dy = p2.x*p0.w - p0.x*p2.w;
float da2dx = p1.y*p0.w - p0.y*p1.w;
float da2dy = p0.x*p1.w - p1.x*p0.w;
float datdx = da0dx + da1dx + da2dx;
float datdy = da0dy + da1dy + da2dy;
float dudx = dfxdx * (b0 * datdx - da0dx);
float dudy = dfydy * (b0 * datdy - da0dy);
float dvdx = dfxdx * (b1 * datdx - da1dx);
float dvdy = dfydy * (b1 * datdy - da1dy);
// Emit bary pixel differentials.
((float4*)p.out_db)[pidx] = make_float4(dudx, dudy, dvdx, dvdy);
}
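//------------------------------------------------------------------------
// Hypothetical launch helper (editorial sketch, not nvdiffrast's actual
// dispatch code, which is not shown in this diff): the kernel is gridded over
// the image in blocks of RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH x
// RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT pixels, with one grid z-layer per
// minibatch image. The parameter struct is assumed to be fully populated by
// the caller.
static void exampleLaunchRasterizeCudaFwdShader(const RasterizeCudaFwdShaderParams& p, cudaStream_t stream)
{
    dim3 block(RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH, RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT, 1);
    dim3 grid((p.width  + block.x - 1) / block.x,
              (p.height + block.y - 1) / block.y,
              p.depth);
    RasterizeCudaFwdShaderKernel<<<grid, block, 0, stream>>>(p);
}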
//------------------------------------------------------------------------
// Gradient Cuda kernel.
......@@ -16,7 +116,7 @@ template <bool ENABLE_DB>
static __forceinline__ __device__ void RasterizeGradKernelTemplate(const RasterizeGradParams p)
{
// Temporary space for coalesced atomics.
    CA_DECLARE_TEMP(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH * RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -64,7 +164,7 @@ static __forceinline__ __device__ void RasterizeGradKernelTemplate(const Rasteri
// Initialize coalesced atomics.
CA_SET_GROUP(triIdx);
// Fetch vertex positions.
float4 p0 = ((float4*)p.pos)[vi0];
float4 p1 = ((float4*)p.pos)[vi1];
......
......@@ -11,9 +11,30 @@
//------------------------------------------------------------------------
// Constants and helpers.
#define RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH 8
#define RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT 8
#define RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
//------------------------------------------------------------------------
// CUDA forward rasterizer shader kernel params.
struct RasterizeCudaFwdShaderParams
{
const float* pos; // Vertex positions.
const int* tri; // Triangle indices.
const int* in_idx; // Triangle idx buffer from rasterizer.
float* out; // Main output buffer.
float* out_db; // Bary pixel gradient output buffer.
int numTriangles; // Number of triangles.
int numVertices; // Number of vertices.
int width; // Image width.
int height; // Image height.
int depth; // Size of minibatch.
int instance_mode; // 1 if in instance rendering mode.
float xs, xo, ys, yo; // Pixel position to clip-space x, y transform.
};
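//------------------------------------------------------------------------
// Illustrative only (editorial sketch): one consistent way to fill the
// pixel-to-clip transform so that fx = xs*px + xo evaluates to the NDC
// x-coordinate of the pixel center; the values nvdiffrast actually uses are
// set by host code not shown in this diff.
static inline void exampleFillPixelTransform(RasterizeCudaFwdShaderParams& p)
{
    p.xs = 2.f / (float)p.width;    p.xo = 1.f / (float)p.width  - 1.f;  // (px + 0.5) / width  * 2 - 1
    p.ys = 2.f / (float)p.height;   p.yo = 1.f / (float)p.height - 1.f;  // (py + 0.5) / height * 2 - 1
}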
//------------------------------------------------------------------------
// Gradient CUDA kernel params.
......@@ -35,52 +56,3 @@ struct RasterizeGradParams
};
//------------------------------------------------------------------------
// Do not try to include OpenGL stuff when compiling CUDA kernels for torch.
#if !(defined(NVDR_TORCH) && defined(__CUDACC__))
#include "framework.h"
#include "glutil.h"
//------------------------------------------------------------------------
// OpenGL-related persistent state for forward op.
struct RasterizeGLState // Must be initializable by memset to zero.
{
int width; // Allocated frame buffer width.
int height; // Allocated frame buffer height.
int depth; // Allocated frame buffer depth.
int posCount; // Allocated position buffer in floats.
int triCount; // Allocated triangle buffer in ints.
GLContext glctx;
GLuint glFBO;
GLuint glColorBuffer[2];
GLuint glPrevOutBuffer;
GLuint glDepthStencilBuffer;
GLuint glVAO;
GLuint glTriBuffer;
GLuint glPosBuffer;
GLuint glProgram;
GLuint glProgramDP;
GLuint glVertexShader;
GLuint glGeometryShader;
GLuint glFragmentShader;
GLuint glFragmentShaderDP;
cudaGraphicsResource_t cudaColorBuffer[2];
cudaGraphicsResource_t cudaPrevOutBuffer;
cudaGraphicsResource_t cudaPosBuffer;
cudaGraphicsResource_t cudaTriBuffer;
int enableDB;
int enableZModify; // Modify depth in shader, workaround for a rasterization issue on A100.
};
//------------------------------------------------------------------------
// Shared C++ code prototypes.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);
bool rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth);
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx);
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s);
//------------------------------------------------------------------------
#endif // !(defined(NVDR_TORCH) && defined(__CUDACC__))
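//------------------------------------------------------------------------
// Note (editorial, hedged): the prototypes above presumably follow the life
// cycle their names suggest: create the GL context once (rasterizeInitGLContext),
// grow buffers as needed before each draw (rasterizeResizeBuffers), render
// (rasterizeRender), copy the results into CUDA memory (rasterizeCopyResults),
// and release the interop buffers on shutdown (rasterizeReleaseBuffers).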