Commit c5caf7bd authored by Samuli Laine

Fix to #123, CUDA rasterizer optimizations

parent 335cfa6b
@@ -11,7 +11,7 @@ Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.githu
## Licenses
Copyright © 2020–2022, NVIDIA Corporation. All rights reserved.
Copyright © 2020–2023, NVIDIA Corporation. All rights reserved.
This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
......
@@ -377,7 +377,7 @@ Examples of things we've done with nvdiffrast
<h3 id="linux">Linux</h3>
<p>We recommend running nvdiffrast on <a href="https://www.docker.com/">Docker</a>. To build a Docker image with nvdiffrast and PyTorch 1.6 installed, run:</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode bash"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">./run_sample.sh</span> --build-container</span></code></pre></div>
<p>We recommend using Ubuntu, as some Linux distributions might not have all the required packages available — at least CentOS is reportedly problematic.</p>
<p>We recommend using Ubuntu, as some Linux distributions might not have all the required packages available. Installation on CentOS is reportedly problematic, but success has been claimed <a href="https://github.com/NVlabs/nvdiffrast/issues/48#issuecomment-1449261808">here</a>.</p>
<p>To try out some of the provided code examples, run:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode bash"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="ex">./run_sample.sh</span> ./samples/torch/cube.py --resolution 32</span></code></pre></div>
<p>Alternatively, if you have all the dependencies taken care of (consult the included Dockerfile for reference), you can install nvdiffrast in your local Python site-packages by running</p>
@@ -1036,7 +1036,7 @@ severity will be silent.</td></tr></table></div>
</div>
<h2 id="licenses">Licenses</h2>
<p>Copyright © 2020–2022, NVIDIA Corporation. All rights reserved.</p>
<p>Copyright © 2020–2023, NVIDIA Corporation. All rights reserved.</p>
<p>This work is made available under the <a href="https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt">Nvidia Source Code License</a>.</p>
<p>For business inquiries, please visit our website and submit the form: <a href="https://www.nvidia.com/en-us/research/inquiries/">NVIDIA Research Licensing</a></p>
<p>We do not currently accept outside contributions in the form of pull requests.</p>
......
@@ -6,4 +6,4 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
__version__ = '0.3.0'
__version__ = '0.3.1'
@@ -39,15 +39,15 @@ public:
CudaRaster (void);
~CudaRaster (void);
void setViewportSize (int width, int height, int numImages); // Width and height must be multiples of tile size (8x8).
void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
bool drawTriangles (const int* ranges, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
void swapDepthAndPeel (void); // Swap depth and peeling buffers.
void setViewportSize (int width, int height, int numImages); // Width and height must be multiples of tile size (8x8).
void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
bool drawTriangles (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
void swapDepthAndPeel (void); // Swap depth and peeling buffers.
private:
CudaRaster (const CudaRaster&); // forbidden
......
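To make the new signature concrete, here is a minimal, hypothetical host-side call sequence; the device pointers, counts, and stream are assumptions and error handling is omitted, but the call order and parameter meanings follow the comments in the header above (the `CR` namespace comes from the sources below).

```cpp
// Sketch only: assumes the CudaRaster.hpp header above is included and a CUDA context exists.
extern void* vertsGpu;        // caller-managed device pointer, float4 clip-space positions
extern void* idxGpu;          // caller-managed device pointer, uint4 (idx0, idx1, idx2, color) per triangle
extern int   numVertices, numTriangles;
extern cudaStream_t stream;

void drawOnePassSketch(void)
{
    CR::CudaRaster cr;
    cr.setViewportSize(512, 512, /*numImages=*/4);   // width/height must be multiples of the 8x8 tile size
    cr.setVertexBuffer(vertsGpu, numVertices);
    cr.setIndexBuffer(idxGpu, numTriangles);
    cr.deferredClear(0u);                            // color and depth cleared during the next draw

    // NULL ranges = draw all triangles; peel=false runs the full pipeline.
    bool ok = cr.drawTriangles(/*ranges=*/NULL, /*peel=*/false, stream);
    if (!ok)
    {
        // false signals an internal buffer overflow; the caller decides how to recover.
    }

    void* colorGpu = cr.getColorBuffer();            // device pointer owned by CudaRaster
    void* depthGpu = cr.getDepthBuffer();            // device pointer owned by CudaRaster
    (void)colorGpu; (void)depthGpu;
}
```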
@@ -11,6 +11,8 @@
using namespace CR;
//------------------------------------------------------------------------
// GPU buffer.
//------------------------------------------------------------------------
Buffer::Buffer(void)
@@ -26,8 +28,6 @@ Buffer::~Buffer(void)
cudaFree(m_gpuPtr); // Don't throw an exception.
}
//------------------------------------------------------------------------
void Buffer::reset(size_t bytes)
{
if (bytes == m_bytes)
@@ -45,9 +45,47 @@ void Buffer::reset(size_t bytes)
m_bytes = bytes;
}
void Buffer::grow(size_t bytes)
{
if (bytes > m_bytes)
reset(bytes);
}
//------------------------------------------------------------------------
// Host buffer with page-locked memory.
//------------------------------------------------------------------------
void Buffer::grow(size_t bytes)
HostBuffer::HostBuffer(void)
: m_hostPtr(NULL),
m_bytes (0)
{
// empty
}
HostBuffer::~HostBuffer(void)
{
if (m_hostPtr)
cudaFreeHost(m_hostPtr); // Don't throw an exception.
}
void HostBuffer::reset(size_t bytes)
{
if (bytes == m_bytes)
return;
if (m_hostPtr)
{
NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
m_hostPtr = NULL;
}
if (bytes > 0)
NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
m_bytes = bytes;
}
void HostBuffer::grow(size_t bytes)
{
if (bytes > m_bytes)
reset(bytes);
......
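`HostBuffer` is the page-locked counterpart of `Buffer`: `cudaMallocHost` returns pinned memory, which is what lets the `cudaMemcpyAsync` calls added elsewhere in this commit overlap with the stream instead of degrading to synchronous copies. A self-contained sketch of the same pattern outside the class, for illustration only:

```cpp
// Stand-alone illustration of the pinned-memory + async-copy pattern used by HostBuffer.
#include <cuda_runtime.h>
#include <cstdio>

int main(void)
{
    const size_t bytes = 16 * sizeof(int);
    int*  hostPinned = NULL;
    void* devPtr     = NULL;
    cudaStream_t stream = NULL;

    cudaStreamCreate(&stream);
    cudaMallocHost((void**)&hostPinned, bytes);   // page-locked host allocation, cf. HostBuffer::reset()
    cudaMalloc(&devPtr, bytes);

    cudaMemsetAsync(devPtr, 0, bytes, stream);    // pretend the GPU produced some results

    // The device-to-host copy can be genuinely asynchronous only because the destination is pinned.
    cudaMemcpyAsync(hostPinned, devPtr, bytes, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);                // host data is valid only after the stream syncs

    printf("first value: %d\n", hostPinned[0]);   // prints 0

    cudaFree(devPtr);
    cudaFreeHost(hostPinned);                     // cf. HostBuffer::~HostBuffer()
    cudaStreamDestroy(stream);
    return 0;
}
```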
@@ -31,5 +31,25 @@ private:
size_t m_bytes;
};
//------------------------------------------------------------------------
class HostBuffer
{
public:
HostBuffer (void);
~HostBuffer (void);
void reset (size_t bytes);
void grow (size_t bytes);
void* getPtr (void) { return m_hostPtr; }
size_t getSize (void) const { return m_bytes; }
void setPtr (void* ptr) { m_hostPtr = ptr; }
private:
void* m_hostPtr;
size_t m_bytes;
};
//------------------------------------------------------------------------
}
@@ -51,9 +51,9 @@ void CudaRaster::setIndexBuffer(void* indices, int numTriangles)
m_impl->setIndexBuffer(indices, numTriangles);
}
bool CudaRaster::drawTriangles(const int* ranges, cudaStream_t stream)
bool CudaRaster::drawTriangles(const int* ranges, bool peel, cudaStream_t stream)
{
return m_impl->drawTriangles((const Vec2i*)ranges, stream);
return m_impl->drawTriangles((const Vec2i*)ranges, peel, stream);
}
void* CudaRaster::getColorBuffer(void)
......
@@ -109,7 +109,7 @@ void RasterImpl::swapDepthAndPeel(void)
//------------------------------------------------------------------------
bool RasterImpl::drawTriangles(const Vec2i* ranges, cudaStream_t stream)
bool RasterImpl::drawTriangles(const Vec2i* ranges, bool peel, cudaStream_t stream)
{
bool instanceMode = (!ranges);
@@ -119,6 +119,7 @@ bool RasterImpl::drawTriangles(const Vec2i* ranges, cudaStream_t stream)
// Resize atomics as needed.
m_crAtomics .grow(m_numImages * sizeof(CRAtomics));
m_crAtomicsHost.grow(m_numImages * sizeof(CRAtomics));
// Size of these buffers doesn't depend on input.
m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
@@ -127,7 +128,8 @@ bool RasterImpl::drawTriangles(const Vec2i* ranges, cudaStream_t stream)
m_tileFirstSeg .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
// Construct per-image parameters and determine worst-case buffer sizes.
std::vector<CRImageParams> imageParams(m_numImages);
m_crImageParamsHost.grow(m_numImages * sizeof(CRImageParams));
CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
for (int i=0; i < m_numImages; i++)
{
CRImageParams& ip = imageParams[i];
@@ -172,12 +174,15 @@ bool RasterImpl::drawTriangles(const Vec2i* ranges, cudaStream_t stream)
m_bufferSizesReported = sizesMB << 20;
}
// Launch stages.
launchStages(&imageParams[0], instanceMode, stream);
// Launch stages. Blocks until everything is done.
launchStages(instanceMode, peel, stream);
// Get atomics.
std::vector<CRAtomics> atomics(m_numImages);
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(&atomics[0], m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
// Peeling iteration cannot fail, so no point checking things further.
if (peel)
break;
// Atomics after coarse stage are now available.
CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
// Success?
bool failed = false;
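The `peel` flag short-circuits the overflow handling above because a peeling pass reuses the setup/bin/coarse results of the previous draw and only re-runs the fine rasterizer. One plausible caller-side loop built on this (layer count, per-layer clearing, and output handling are assumptions, not part of this commit):

```cpp
// Hypothetical depth-peeling driver; assumes CudaRaster.hpp is included and the
// viewport, vertex, and index buffers have already been set on cr.
bool renderLayersSketch(CR::CudaRaster& cr, cudaStream_t stream, int numLayers)
{
    cr.deferredClear(0u);
    if (!cr.drawTriangles(NULL, /*peel=*/false, stream))   // full pipeline; may report overflow
        return false;
    // ... consume cr.getColorBuffer() / cr.getDepthBuffer() for layer 0 ...

    for (int layer = 1; layer < numLayers; layer++)
    {
        cr.swapDepthAndPeel();                              // previous depth becomes the peeling reference
        cr.deferredClear(0u);
        cr.drawTriangles(NULL, /*peel=*/true, stream);      // skips setup/bin/coarse; cannot overflow
        // ... consume the buffers for this layer ...
    }
    return true;
}
```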
@@ -220,19 +225,24 @@ size_t RasterImpl::getTotalBufferSizes(void) const
//------------------------------------------------------------------------
void RasterImpl::launchStages(const CRImageParams* imageParams, bool instanceMode, cudaStream_t stream)
void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream)
{
// Initialize atomics to mostly zero.
CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
// Unless peeling, initialize atomics to mostly zero.
CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
if (!peel)
{
std::vector<CRAtomics> atomics(m_numImages);
memset(&atomics[0], 0, m_numImages * sizeof(CRAtomics));
memset(atomics, 0, m_numImages * sizeof(CRAtomics));
for (int i=0; i < m_numImages; i++)
atomics[i].numSubtris = imageParams[i].triCount;
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), &atomics[0], m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
}
// Copy per-image parameters if there are more than fits in launch parameter block.
if (m_numImages > CR_EMBED_IMAGE_PARAMS)
// Copy to device. If peeling, this is the state after coarse raster launch on first iteration.
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), atomics, m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
// Copy per-image parameters if there are more than fits in launch parameter block and we haven't done it already.
if (!peel && m_numImages > CR_EMBED_IMAGE_PARAMS)
{
int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS;
m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams));
@@ -298,24 +308,31 @@ void RasterImpl::launchStages(const CRImageParams* imageParams, bool instanceMod
dim3 brBlock(32, CR_BIN_WARPS);
dim3 crBlock(32, CR_COARSE_WARPS);
dim3 frBlock(32, m_numFineWarpsPerBlock);
// Launch stages.
void* args[] = {&p};
if (instanceMode)
{
int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
}
else
// Launch stages from setup to coarse and copy atomics to host only if this is not a peeling iteration.
if (!peel)
{
for (int i=0; i < m_numImages; i++)
p.totalCount += imageParams[i].triCount;
int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
if (instanceMode)
{
int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
}
else
{
for (int i=0; i < m_numImages; i++)
p.totalCount += imageParams[i].triCount;
int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
}
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomicsHost.getPtr(), m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
}
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
// Fine rasterizer is launched always.
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
}
//------------------------------------------------------------------------
@@ -26,14 +26,14 @@ public:
void deferredClear (U32 color) { m_deferredClear = true; m_clearColor = color; }
void setVertexBuffer (void* ptr, int numVertices) { m_vertexPtr = ptr; m_numVertices = numVertices; } // GPU pointer.
void setIndexBuffer (void* ptr, int numTriangles) { m_indexPtr = ptr; m_numTriangles = numTriangles; } // GPU pointer.
bool drawTriangles (const Vec2i* ranges, cudaStream_t stream);
bool drawTriangles (const Vec2i* ranges, bool peel, cudaStream_t stream);
void* getColorBuffer (void) { return m_colorBuffer.getPtr(); } // GPU pointer.
void* getDepthBuffer (void) { return m_depthBuffer.getPtr(); } // GPU pointer.
void swapDepthAndPeel (void);
size_t getTotalBufferSizes (void) const;
private:
void launchStages (const CRImageParams* imageParams, bool instanceMode, cudaStream_t stream);
void launchStages (bool instanceMode, bool peel, cudaStream_t stream);
// State.
@@ -68,6 +68,8 @@ private:
// Global intermediate buffers. Individual images have offsets to these.
Buffer m_crAtomics;
HostBuffer m_crAtomicsHost;
HostBuffer m_crImageParamsHost;
Buffer m_crImageParamsExtra;
Buffer m_triSubtris;
Buffer m_triHeader;
......
@@ -446,13 +446,13 @@ static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelP
if (iv1 >= h) iv1 -= h;
}
// Coordinates
iu0 += tz * w * h;
iu1 += tz * w * h;
tcOut.x = iu0 + w * iv0;
tcOut.y = iu1 + w * iv0;
tcOut.z = iu0 + w * iv1;
tcOut.w = iu1 + w * iv1;
// Coordinates with tz folded in.
int iu0z = iu0 + tz * w * h;
int iu1z = iu1 + tz * w * h;
tcOut.x = iu0z + w * iv0;
tcOut.y = iu1z + w * iv0;
tcOut.z = iu0z + w * iv1;
tcOut.w = iu1z + w * iv1;
// Invalidate texture addresses outside unit square if we are in zero mode.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
......
@@ -99,7 +99,7 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_cuda(RasterizeCRStateWrap
// Run CudaRaster in one large batch. In case of error, the workload could be split into smaller batches - maybe do that in the future.
cr->deferredClear(0u);
bool success = cr->drawTriangles(rangesPtr, stream);
bool success = cr->drawTriangles(rangesPtr, enablePeel, stream);
NVDR_CHECK(success, "subtriangle count overflow");
// Allocate output tensors.
......
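For reference, `rangesPtr` appears to carry one (start, count) pair per minibatch image, with both values counted in triangles rather than bytes, and `NULL` meaning draw all triangles. A small hypothetical helper showing just that layout; the surrounding tensor plumbing, and where the array must ultimately live, are not shown in this diff:

```cpp
#include <utility>
#include <vector>

// Hypothetical helper: flatten per-image (firstTriangle, triangleCount) pairs into the
// flat int layout suggested by the drawTriangles() comments above.
static std::vector<int> buildRangesSketch(const std::vector<std::pair<int, int>>& perImage)
{
    std::vector<int> ranges;
    ranges.reserve(perImage.size() * 2);
    for (const std::pair<int, int>& r : perImage)
    {
        ranges.push_back(r.first);    // offset, counted in triangles (not bytes)
        ranges.push_back(r.second);   // count, counted in triangles (not bytes)
    }
    return ranges;                    // a pointer to this layout is what the first drawTriangles() argument expects
}
```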