Commit c5caf7bd authored by Samuli Laine

Fix to #123, CUDA rasterizer optimizations

parent 335cfa6b
@@ -11,7 +11,7 @@ Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.githu
## Licenses
Copyright © 2020–2022, NVIDIA Corporation. All rights reserved.
Copyright © 2020–2023, NVIDIA Corporation. All rights reserved.
This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
......
@@ -377,7 +377,7 @@ Examples of things we've done with nvdiffrast
<h3 id="linux">Linux</h3>
<p>We recommend running nvdiffrast on <a href="https://www.docker.com/">Docker</a>. To build a Docker image with nvdiffrast and PyTorch 1.6 installed, run:</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode bash"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">./run_sample.sh</span> --build-container</span></code></pre></div>
<p>We recommend using Ubuntu, as some Linux distributions might not have all the required packages available — at least CentOS is reportedly problematic.</p>
<p>We recommend using Ubuntu, as some Linux distributions might not have all the required packages available. Installation on CentOS is reportedly problematic, but success has been claimed <a href="https://github.com/NVlabs/nvdiffrast/issues/48#issuecomment-1449261808">here</a>.</p>
<p>To try out some of the provided code examples, run:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode bash"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="ex">./run_sample.sh</span> ./samples/torch/cube.py --resolution 32</span></code></pre></div>
<p>Alternatively, if you have all the dependencies taken care of (consult the included Dockerfile for reference), you can install nvdiffrast in your local Python site-packages by running</p>
@@ -1036,7 +1036,7 @@ severity will be silent.</td></tr></table></div>
</div>
<h2 id="licenses">Licenses</h2>
<p>Copyright © 2020–2022, NVIDIA Corporation. All rights reserved.</p>
<p>Copyright © 2020–2023, NVIDIA Corporation. All rights reserved.</p>
<p>This work is made available under the <a href="https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt">Nvidia Source Code License</a>.</p>
<p>For business inquiries, please visit our website and submit the form: <a href="https://www.nvidia.com/en-us/research/inquiries/">NVIDIA Research Licensing</a></p>
<p>We do not currently accept outside contributions in the form of pull requests.</p>
......
@@ -6,4 +6,4 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
__version__ = '0.3.0'
__version__ = '0.3.1'
@@ -39,15 +39,15 @@ public:
CudaRaster (void);
~CudaRaster (void);
void setViewportSize (int width, int height, int numImages); // Width and height must be multiples of tile size (8x8).
void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
bool drawTriangles (const int* ranges, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
void swapDepthAndPeel (void); // Swap depth and peeling buffers.
void setViewportSize (int width, int height, int numImages); // Width and height must be multiples of tile size (8x8).
void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
bool drawTriangles (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
void swapDepthAndPeel (void); // Swap depth and peeling buffers.
private:
CudaRaster (const CudaRaster&); // forbidden
......
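To make the new signature concrete, here is a minimal, hypothetical host-side call sequence; the device pointers, counts, and stream are assumptions and error handling is omitted, but the call order and parameter meanings follow the comments in the header above (the `CR` namespace comes from the sources below).

```cpp
// Sketch only: assumes the CudaRaster.hpp header above is included and a CUDA context exists.
extern void* vertsGpu;        // caller-managed device pointer, float4 clip-space positions
extern void* idxGpu;          // caller-managed device pointer, uint4 (idx0, idx1, idx2, color) per triangle
extern int   numVertices, numTriangles;
extern cudaStream_t stream;

void drawOnePassSketch(void)
{
    CR::CudaRaster cr;
    cr.setViewportSize(512, 512, /*numImages=*/4);   // width/height must be multiples of the 8x8 tile size
    cr.setVertexBuffer(vertsGpu, numVertices);
    cr.setIndexBuffer(idxGpu, numTriangles);
    cr.deferredClear(0u);                            // color and depth cleared during the next draw

    // NULL ranges = draw all triangles; peel=false runs the full pipeline.
    bool ok = cr.drawTriangles(/*ranges=*/NULL, /*peel=*/false, stream);
    if (!ok)
    {
        // false signals an internal buffer overflow; the caller decides how to recover.
    }

    void* colorGpu = cr.getColorBuffer();            // device pointer owned by CudaRaster
    void* depthGpu = cr.getDepthBuffer();            // device pointer owned by CudaRaster
    (void)colorGpu; (void)depthGpu;
}
```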
@@ -11,6 +11,8 @@
using namespace CR;
//------------------------------------------------------------------------
// GPU buffer.
//------------------------------------------------------------------------
Buffer::Buffer(void)
@@ -26,8 +28,6 @@ Buffer::~Buffer(void)
cudaFree(m_gpuPtr); // Don't throw an exception.
}
//------------------------------------------------------------------------
void Buffer::reset(size_t bytes)
{
if (bytes == m_bytes)
@@ -45,9 +45,47 @@ void Buffer::reset(size_t bytes)
m_bytes = bytes;
}
void Buffer::grow(size_t bytes)
{
if (bytes > m_bytes)
reset(bytes);
}
//------------------------------------------------------------------------
// Host buffer with page-locked memory.
//------------------------------------------------------------------------
void Buffer::grow(size_t bytes)
HostBuffer::HostBuffer(void)
: m_hostPtr(NULL),
m_bytes (0)
{
// empty
}
HostBuffer::~HostBuffer(void)
{
if (m_hostPtr)
cudaFreeHost(m_hostPtr); // Don't throw an exception.
}
void HostBuffer::reset(size_t bytes)
{
if (bytes == m_bytes)
return;
if (m_hostPtr)
{
NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
m_hostPtr = NULL;
}
if (bytes > 0)
NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
m_bytes = bytes;
}
void HostBuffer::grow(size_t bytes)
{
if (bytes > m_bytes)
reset(bytes);
......
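`HostBuffer` is the page-locked counterpart of `Buffer`: `cudaMallocHost` returns pinned memory, which is what lets the `cudaMemcpyAsync` calls added elsewhere in this commit overlap with the stream instead of degrading to synchronous copies. A self-contained sketch of the same pattern outside the class, for illustration only:

```cpp
// Stand-alone illustration of the pinned-memory + async-copy pattern used by HostBuffer.
#include <cuda_runtime.h>
#include <cstdio>

int main(void)
{
    const size_t bytes = 16 * sizeof(int);
    int*  hostPinned = NULL;
    void* devPtr     = NULL;
    cudaStream_t stream = NULL;

    cudaStreamCreate(&stream);
    cudaMallocHost((void**)&hostPinned, bytes);   // page-locked host allocation, cf. HostBuffer::reset()
    cudaMalloc(&devPtr, bytes);

    cudaMemsetAsync(devPtr, 0, bytes, stream);    // pretend the GPU produced some results

    // The device-to-host copy can be genuinely asynchronous only because the destination is pinned.
    cudaMemcpyAsync(hostPinned, devPtr, bytes, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);                // host data is valid only after the stream syncs

    printf("first value: %d\n", hostPinned[0]);   // prints 0

    cudaFree(devPtr);
    cudaFreeHost(hostPinned);                     // cf. HostBuffer::~HostBuffer()
    cudaStreamDestroy(stream);
    return 0;
}
```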
@@ -31,5 +31,25 @@ private:
size_t m_bytes;
};
//------------------------------------------------------------------------
class HostBuffer
{
public:
HostBuffer (void);
~HostBuffer (void);
void reset (size_t bytes);
void grow (size_t bytes);
void* getPtr (void) { return m_hostPtr; }
size_t getSize (void) const { return m_bytes; }
void setPtr (void* ptr) { m_hostPtr = ptr; }
private:
void* m_hostPtr;
size_t m_bytes;
};
//------------------------------------------------------------------------
}
@@ -51,9 +51,9 @@ void CudaRaster::setIndexBuffer(void* indices, int numTriangles)
m_impl->setIndexBuffer(indices, numTriangles);
}
bool CudaRaster::drawTriangles(const int* ranges, cudaStream_t stream)
bool CudaRaster::drawTriangles(const int* ranges, bool peel, cudaStream_t stream)
{
return m_impl->drawTriangles((const Vec2i*)ranges, stream);
return m_impl->drawTriangles((const Vec2i*)ranges, peel, stream);
}
void* CudaRaster::getColorBuffer(void)
......
@@ -109,7 +109,7 @@ void RasterImpl::swapDepthAndPeel(void)
//------------------------------------------------------------------------
bool RasterImpl::drawTriangles(const Vec2i* ranges, cudaStream_t stream)
bool RasterImpl::drawTriangles(const Vec2i* ranges, bool peel, cudaStream_t stream)
{
bool instanceMode = (!ranges);
@@ -119,6 +119,7 @@ bool RasterImpl::drawTriangles(const Vec2i* ranges, cudaStream_t stream)
// Resize atomics as needed.
m_crAtomics .grow(m_numImages * sizeof(CRAtomics));
m_crAtomicsHost.grow(m_numImages * sizeof(CRAtomics));
// Size of these buffers doesn't depend on input.
m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
@@ -127,7 +128,8 @@ bool RasterImpl::drawTriangles(const Vec2i* ranges, cudaStream_t stream)
m_tileFirstSeg .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
// Construct per-image parameters and determine worst-case buffer sizes.
std::vector<CRImageParams> imageParams(m_numImages);
m_crImageParamsHost.grow(m_numImages * sizeof(CRImageParams));
CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
for (int i=0; i < m_numImages; i++)
{
CRImageParams& ip = imageParams[i];
@@ -172,12 +174,15 @@ bool RasterImpl::drawTriangles(const Vec2i* ranges, cudaStream_t stream)
m_bufferSizesReported = sizesMB << 20;
}
// Launch stages.
launchStages(&imageParams[0], instanceMode, stream);
// Launch stages. Blocks until everything is done.
launchStages(instanceMode, peel, stream);
// Get atomics.
std::vector<CRAtomics> atomics(m_numImages);
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(&atomics[0], m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
// Peeling iteration cannot fail, so no point checking things further.
if (peel)
break;
// Atomics after coarse stage are now available.
CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
// Success?
bool failed = false;
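The `peel` flag short-circuits the overflow handling above because a peeling pass reuses the setup/bin/coarse results of the previous draw and only re-runs the fine rasterizer. One plausible caller-side loop built on this (layer count, per-layer clearing, and output handling are assumptions, not part of this commit):

```cpp
// Hypothetical depth-peeling driver; assumes CudaRaster.hpp is included and the
// viewport, vertex, and index buffers have already been set on cr.
bool renderLayersSketch(CR::CudaRaster& cr, cudaStream_t stream, int numLayers)
{
    cr.deferredClear(0u);
    if (!cr.drawTriangles(NULL, /*peel=*/false, stream))   // full pipeline; may report overflow
        return false;
    // ... consume cr.getColorBuffer() / cr.getDepthBuffer() for layer 0 ...

    for (int layer = 1; layer < numLayers; layer++)
    {
        cr.swapDepthAndPeel();                              // previous depth becomes the peeling reference
        cr.deferredClear(0u);
        cr.drawTriangles(NULL, /*peel=*/true, stream);      // skips setup/bin/coarse; cannot overflow
        // ... consume the buffers for this layer ...
    }
    return true;
}
```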
@@ -220,19 +225,24 @@ size_t RasterImpl::getTotalBufferSizes(void) const
//------------------------------------------------------------------------
void RasterImpl::launchStages(const CRImageParams* imageParams, bool instanceMode, cudaStream_t stream)
void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream)
{
// Initialize atomics to mostly zero.
CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
// Unless peeling, initialize atomics to mostly zero.
CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
if (!peel)
{
std::vector<CRAtomics> atomics(m_numImages);
memset(&atomics[0], 0, m_numImages * sizeof(CRAtomics));
memset(atomics, 0, m_numImages * sizeof(CRAtomics));
for (int i=0; i < m_numImages; i++)
atomics[i].numSubtris = imageParams[i].triCount;
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), &atomics[0], m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
}
// Copy per-image parameters if there are more than fits in launch parameter block.
if (m_numImages > CR_EMBED_IMAGE_PARAMS)
// Copy to device. If peeling, this is the state after coarse raster launch on first iteration.
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), atomics, m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
// Copy per-image parameters if there are more than fits in launch parameter block and we haven't done it already.
if (!peel && m_numImages > CR_EMBED_IMAGE_PARAMS)
{
int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS;
m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams));
@@ -298,24 +308,31 @@ void RasterImpl::launchStages(const CRImageParams* imageParams, bool instanceMod
dim3 brBlock(32, CR_BIN_WARPS);
dim3 crBlock(32, CR_COARSE_WARPS);
dim3 frBlock(32, m_numFineWarpsPerBlock);
// Launch stages.
void* args[] = {&p};
if (instanceMode)
{
int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
}
else
// Launch stages from setup to coarse and copy atomics to host only if this is not a peeling iteration.
if (!peel)
{
for (int i=0; i < m_numImages; i++)
p.totalCount += imageParams[i].triCount;
int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
if (instanceMode)
{
int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
}
else
{
for (int i=0; i < m_numImages; i++)
p.totalCount += imageParams[i].triCount;
int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
}
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomicsHost.getPtr(), m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
}
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
// Fine rasterizer is launched always.
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
}
//------------------------------------------------------------------------
@@ -26,14 +26,14 @@ public:
void deferredClear (U32 color) { m_deferredClear = true; m_clearColor = color; }
void setVertexBuffer (void* ptr, int numVertices) { m_vertexPtr = ptr; m_numVertices = numVertices; } // GPU pointer.
void setIndexBuffer (void* ptr, int numTriangles) { m_indexPtr = ptr; m_numTriangles = numTriangles; } // GPU pointer.
bool drawTriangles (const Vec2i* ranges, cudaStream_t stream);
bool drawTriangles (const Vec2i* ranges, bool peel, cudaStream_t stream);
void* getColorBuffer (void) { return m_colorBuffer.getPtr(); } // GPU pointer.
void* getDepthBuffer (void) { return m_depthBuffer.getPtr(); } // GPU pointer.
void swapDepthAndPeel (void);
size_t getTotalBufferSizes (void) const;
private:
void launchStages (const CRImageParams* imageParams, bool instanceMode, cudaStream_t stream);
void launchStages (bool instanceMode, bool peel, cudaStream_t stream);
// State.
@@ -68,6 +68,8 @@ private:
// Global intermediate buffers. Individual images have offsets to these.
Buffer m_crAtomics;
HostBuffer m_crAtomicsHost;
HostBuffer m_crImageParamsHost;
Buffer m_crImageParamsExtra;
Buffer m_triSubtris;
Buffer m_triHeader;
......
@@ -446,13 +446,13 @@ static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelP
if (iv1 >= h) iv1 -= h;
}
// Coordinates
iu0 += tz * w * h;
iu1 += tz * w * h;
tcOut.x = iu0 + w * iv0;
tcOut.y = iu1 + w * iv0;
tcOut.z = iu0 + w * iv1;
tcOut.w = iu1 + w * iv1;
// Coordinates with tz folded in.
int iu0z = iu0 + tz * w * h;
int iu1z = iu1 + tz * w * h;
tcOut.x = iu0z + w * iv0;
tcOut.y = iu1z + w * iv0;
tcOut.z = iu0z + w * iv1;
tcOut.w = iu1z + w * iv1;
// Invalidate texture addresses outside unit square if we are in zero mode.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
......
@@ -99,7 +99,7 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_cuda(RasterizeCRStateWrap
// Run CudaRaster in one large batch. In case of error, the workload could be split into smaller batches - maybe do that in the future.
cr->deferredClear(0u);
bool success = cr->drawTriangles(rangesPtr, stream);
bool success = cr->drawTriangles(rangesPtr, enablePeel, stream);
NVDR_CHECK(success, "subtriangle count overflow");
// Allocate output tensors.
......
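For reference, `rangesPtr` appears to carry one (start, count) pair per minibatch image, with both values counted in triangles rather than bytes, and `NULL` meaning draw all triangles. A small hypothetical helper showing just that layout; the surrounding tensor plumbing, and where the array must ultimately live, are not shown in this diff:

```cpp
#include <utility>
#include <vector>

// Hypothetical helper: flatten per-image (firstTriangle, triangleCount) pairs into the
// flat int layout suggested by the drawTriangles() comments above.
static std::vector<int> buildRangesSketch(const std::vector<std::pair<int, int>>& perImage)
{
    std::vector<int> ranges;
    ranges.reserve(perImage.size() * 2);
    for (const std::pair<int, int>& r : perImage)
    {
        ranges.push_back(r.first);    // offset, counted in triangles (not bytes)
        ranges.push_back(r.second);   // count, counted in triangles (not bytes)
    }
    return ranges;                    // a pointer to this layout is what the first drawTriangles() argument expects
}
```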