Commit a1ec436b authored by Samuli Laine's avatar Samuli Laine

Add CUDA rasterizer

parent 78528e68
@@ -11,7 +11,7 @@ Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.githu
## Licenses
Copyright © 2020, NVIDIA Corporation. All rights reserved.
Copyright © 2020–2022, NVIDIA Corporation. All rights reserved.
This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
@@ -6,4 +6,4 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
__version__ = '0.2.8'
__version__ = '0.3.0'
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
// This is a slimmed-down and modernized version of the original
// CudaRaster codebase that accompanied the HPG 2011 paper
// "High-Performance Software Rasterization on GPUs" by Laine and Karras.
// Modifications have been made to accommodate post-Volta execution model
// with warp divergence. Support for shading, blending, quad rendering,
// and supersampling has been removed as unnecessary for nvdiffrast.
//------------------------------------------------------------------------
namespace CR
{
class RasterImpl;
//------------------------------------------------------------------------
// Interface class to isolate user from implementation details.
//------------------------------------------------------------------------
class CudaRaster
{
public:
enum
{
RenderModeFlag_EnableBackfaceCulling = 1 << 0, // Enable backface culling.
RenderModeFlag_EnableDepthPeeling = 1 << 1, // Enable depth peeling. Must have a peel buffer set.
};
public:
CudaRaster (void);
~CudaRaster (void);
void setViewportSize (int width, int height, int numImages); // Width and height must be multiples of tile size (8x8).
void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
bool drawTriangles (const int* ranges, cudaStream_t stream); // Ranges (offsets and counts) are given in triangles, not bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
void swapDepthAndPeel (void); // Swap depth and peeling buffers.
private:
CudaRaster (const CudaRaster&); // forbidden
CudaRaster& operator= (const CudaRaster&); // forbidden
private:
RasterImpl* m_impl; // Opaque pointer to implementation.
};
//------------------------------------------------------------------------
} // namespace CR
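As a point of reference for the interface above, the following is a minimal host-side sketch of driving CudaRaster; the buffer sizes, contents, and the example function itself are hypothetical and only illustrate the call sequence documented in the comments.

//------------------------------------------------------------------------
// Illustrative usage sketch (hypothetical caller code, not part of the
// rasterizer sources). Assumes a single 256x256 image.
//------------------------------------------------------------------------
#include <cuda_runtime.h>
#include "CudaRaster.hpp"

void exampleDraw(cudaStream_t stream)
{
    // Caller-managed GPU buffers: clip-space positions as float4, and
    // triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
    int   numVertices  = 3;
    int   numTriangles = 1;
    void* d_vertices   = 0;
    void* d_indices    = 0;
    cudaMalloc(&d_vertices, numVertices  * 4 * sizeof(float));
    cudaMalloc(&d_indices,  numTriangles * 4 * sizeof(unsigned int));
    // ... fill d_vertices and d_indices on the device or via cudaMemcpy ...

    CR::CudaRaster cr;
    cr.setViewportSize(256, 256, 1);    // must be multiples of the 8x8 tile size
    cr.setRenderModeFlags(CR::CudaRaster::RenderModeFlag_EnableBackfaceCulling);
    cr.setVertexBuffer(d_vertices, numVertices);
    cr.setIndexBuffer(d_indices, numTriangles);
    cr.deferredClear(0u);                       // clear during the next drawTriangles()
    bool ok = cr.drawTriangles(NULL, stream);   // NULL = draw all triangles
    void* d_color = cr.getColorBuffer();        // GPU pointer owned by CudaRaster
    (void)ok; (void)d_color;
}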
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
__device__ __inline__ void binRasterImpl(const CRParams p)
{
__shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
__shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR];
__shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR];
__shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
__shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
__shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
__shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer
__shared__ volatile U32 s_batchPos;
__shared__ volatile U32 s_bufCount;
__shared__ volatile U32 s_overTotal;
__shared__ volatile U32 s_allocBase;
const CRImageParams& ip = getImageParams(p, blockIdx.z);
CRAtomics& atomics = p.atomics[blockIdx.z];
const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
if (atomics.numSubtris > p.maxSubtris)
return;
// per-thread state
int thrInBlock = threadIdx.x + threadIdx.y * 32;
int batchPos = 0;
// first 16 elements of s_broadcast are always zero
if (thrInBlock < 16)
s_broadcast[thrInBlock] = 0;
// initialize output linked lists and offsets
if (thrInBlock < p.numBins)
{
binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
s_outTotal[thrInBlock] = 0;
}
// repeat until done
for(;;)
{
// get batch
if (thrInBlock == 0)
s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
__syncthreads();
batchPos = s_batchPos;
// all batches done?
if (batchPos >= ip.triCount)
break;
// per-thread state
int bufIndex = 0;
int bufCount = 0;
int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
// loop over batch as long as we have triangles in it
do
{
// read more triangles
while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
{
// get subtriangle count
int triIdx = batchPos + thrInBlock;
int num = 0;
if (triIdx < batchEnd)
num = triSubtris[triIdx];
// cumulative sum of subtriangles within each warp
U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
if (__any_sync(~0u, num > 1))
{
myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
}
if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
s_broadcast[threadIdx.y + 16] = myIdx + num;
__syncthreads();
// cumulative sum of per-warp subtriangle counts
// Note: cannot have more than 32 warps or this needs to sync between each step.
bool act = (thrInBlock < CR_BIN_WARPS);
U32 actMask = __ballot_sync(~0u, act);
if (threadIdx.y == 0 && act)
{
volatile U32* ptr = &s_broadcast[thrInBlock + 16];
U32 val = *ptr;
#if (CR_BIN_WARPS > 1)
val += ptr[-1]; __syncwarp(actMask);
*ptr = val; __syncwarp(actMask);
#endif
#if (CR_BIN_WARPS > 2)
val += ptr[-2]; __syncwarp(actMask);
*ptr = val; __syncwarp(actMask);
#endif
#if (CR_BIN_WARPS > 4)
val += ptr[-4]; __syncwarp(actMask);
*ptr = val; __syncwarp(actMask);
#endif
#if (CR_BIN_WARPS > 8)
val += ptr[-8]; __syncwarp(actMask);
*ptr = val; __syncwarp(actMask);
#endif
#if (CR_BIN_WARPS > 16)
val += ptr[-16]; __syncwarp(actMask);
*ptr = val; __syncwarp(actMask);
#endif
// initially assume that we consume everything
// only last active thread does the writes
if (threadIdx.x == CR_BIN_WARPS - 1)
{
s_batchPos = batchPos + CR_BIN_WARPS * 32;
s_bufCount = bufCount + val;
}
}
__syncthreads();
// skip if no subtriangles
if (num)
{
// calculate write position for first subtriangle
U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
// only write if entire triangle fits
if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
{
pos += bufIndex; // adjust for current start position
pos &= CR_ARRAY_SIZE(s_triBuf)-1;
if (num == 1)
s_triBuf[pos] = triIdx * 8 + 7; // single triangle
else
{
for (int i=0; i < num; i++)
{
s_triBuf[pos] = triIdx * 8 + i;
pos++;
pos &= CR_ARRAY_SIZE(s_triBuf)-1;
}
}
} else if (pos <= CR_ARRAY_SIZE(s_triBuf))
{
// this triangle is the first that failed, overwrite total count and triangle count
s_batchPos = batchPos + thrInBlock;
s_bufCount = pos;
}
}
// update triangle counts
__syncthreads();
batchPos = s_batchPos;
bufCount = s_bufCount;
}
// make every warp clear its output buffers
for (int i=threadIdx.x; i < p.numBins; i += 32)
s_outMask[threadIdx.y][i] = 0;
__syncwarp();
// choose our triangle
uint4 triData = make_uint4(0, 0, 0, 0);
if (thrInBlock < bufCount)
{
U32 triPos = bufIndex + thrInBlock;
triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
// find triangle
int triIdx = s_triBuf[triPos];
int dataIdx = triIdx >> 3;
int subtriIdx = triIdx & 7;
if (subtriIdx != 7)
dataIdx = triHeader[dataIdx].misc + subtriIdx;
// read triangle
triData = *(((const uint4*)triHeader) + dataIdx);
}
// setup bounding box and edge functions, and rasterize
S32 lox, loy, hix, hiy;
bool hasTri = (thrInBlock < bufCount);
U32 hasTriMask = __ballot_sync(~0u, hasTri);
if (hasTri)
{
S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixels * (CR_SUBPIXEL_SIZE >> 1));
S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixels * (CR_SUBPIXEL_SIZE >> 1));
S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
U32 bit = 1 << threadIdx.x;
bool multi = (hix != lox || hiy != loy);
if (!__any_sync(hasTriMask, multi))
{
int binIdx = lox + p.widthBins * loy;
U32 mask = __match_any_sync(hasTriMask, binIdx);
s_outMask[threadIdx.y][binIdx] = mask;
__syncwarp(hasTriMask);
} else
{
bool complex = (hix > lox+1 || hiy > loy+1);
if (!__any_sync(hasTriMask, complex))
{
int binIdx = lox + p.widthBins * loy;
atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
if (hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
} else
{
S32 d12x = d02x - d01x, d12y = d02y - d01y;
v0x -= lox << binLog, v0y -= loy << binLog;
S32 t01 = v0x * d01y - v0y * d01x;
S32 t02 = v0y * d02x - v0x * d02y;
S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
int width = hix - lox + 1;
d01x += width * d01y;
d02x += width * d02y;
d12x += width * d12y;
U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
int stride = p.widthBins * 4;
int ptrYInc = stride - width * 4;
do
{
if (b01 >= 0 && b02 >= 0 && b12 >= 0)
atomicOr((U32*)currPtr, bit);
currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
if (currPtr == skipPtr)
currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
}
while (currPtr != endPtr);
}
}
}
// count per-bin contributions
if (thrInBlock == 0)
s_overTotal = 0; // overflow counter
// ensure that out masks are done
__syncthreads();
int overIndex = -1;
bool act = (thrInBlock < p.numBins);
U32 actMask = __ballot_sync(~0u, act);
if (act)
{
U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
int total = 0;
for (int i = 0; i < CR_BIN_WARPS; i++)
{
total += __popc(*(U32*)srcPtr);
*(U32*)dstPtr = total;
srcPtr += (CR_MAXBINS_SQR + 1) * 4;
dstPtr += (CR_MAXBINS_SQR + 1) * 4;
}
// overflow => request a new segment
int ofs = s_outOfs[thrInBlock];
bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
U32 ovrMask = __ballot_sync(actMask, ovr);
if (ovr)
{
overIndex = __popc(ovrMask & getLaneMaskLt());
if (overIndex == 0)
s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
__syncwarp(ovrMask);
overIndex += s_broadcast[threadIdx.y + 16];
s_overIndex[thrInBlock] = overIndex;
}
}
// sync after overTotal is ready
__syncthreads();
// at least one segment overflowed => allocate segments
U32 overTotal = s_overTotal;
U32 allocBase = 0;
if (overTotal > 0)
{
// allocate memory
if (thrInBlock == 0)
{
U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
}
__syncthreads();
allocBase = s_allocBase;
// did my bin overflow?
if (overIndex != -1)
{
// calculate new segment index
int segIdx = allocBase + overIndex;
// add to linked list
if (s_outOfs[thrInBlock] < 0)
binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
else
binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
// defaults
binSegNext [segIdx] = -1;
binSegCount[segIdx] = CR_BIN_SEG_SIZE;
}
}
// concurrent emission -- each warp handles its own triangle
if (thrInBlock < bufCount)
{
int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
int currBin = lox + loy * p.widthBins;
int skipBin = (hix + 1) + loy * p.widthBins;
int endBin = lox + (hiy + 1) * p.widthBins;
int binYInc = p.widthBins - (hix - lox + 1);
// loop over triangle's bins
do
{
U32 outMask = s_outMask[threadIdx.y][currBin];
if (outMask & (1<<threadIdx.x))
{
int idx = __popc(outMask & getLaneMaskLt());
if (threadIdx.y > 0)
idx += s_outCount[threadIdx.y-1][currBin];
int base = s_outOfs[currBin];
int free = (-base) & (CR_BIN_SEG_SIZE - 1);
if (idx >= free)
idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
else
idx += base;
binSegData[idx] = s_triBuf[triPos];
}
currBin++;
if (currBin == skipBin)
currBin += binYInc, skipBin += p.widthBins;
}
while (currBin != endBin);
}
// wait for all triangles to finish, then replace overflowed segment offsets
__syncthreads();
if (thrInBlock < p.numBins)
{
U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
U32 oldOfs = s_outOfs[thrInBlock];
if (overIndex == -1)
s_outOfs[thrInBlock] = oldOfs + total;
else
{
int addr = oldOfs + total;
addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
s_outOfs[thrInBlock] = addr;
}
s_outTotal[thrInBlock] += total;
}
// these triangles are now done
int count = ::min(bufCount, CR_BIN_WARPS * 32);
bufCount -= count;
bufIndex += count;
bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
}
while (bufCount > 0 || batchPos < batchEnd);
// flush all bins
if (thrInBlock < p.numBins)
{
int ofs = s_outOfs[thrInBlock];
if (ofs & (CR_BIN_SEG_SIZE-1))
{
int seg = ofs >> CR_BIN_SEG_LOG2;
binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
}
}
}
// output totals
if (thrInBlock < p.numBins)
binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
}
//------------------------------------------------------------------------
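The bin rasterizer above builds its per-warp subtriangle offsets with a bit-sliced prefix sum: each bit of the 0..7 subtriangle count is scanned separately with __ballot_sync() and __popc() over the mask of lower lanes. A self-contained sketch of the same idea, assuming a full warp and replacing the getLaneMaskLt() helper with an explicit lane-mask computation:

//------------------------------------------------------------------------
// Illustrative sketch of the bit-sliced warp-exclusive prefix sum used in
// binRasterImpl() above. Assumes all 32 lanes are active and that each
// per-thread count fits in 3 bits (0..7), as triSubtris does.
//------------------------------------------------------------------------
__device__ __inline__ unsigned int warpExclusiveScan3Bit(int num)
{
    unsigned int lane       = threadIdx.x & 31;
    unsigned int laneMaskLt = (1u << lane) - 1u;  // lanes below the current one
    // Scan each bit plane separately and weight by the bit's value.
    unsigned int idx = __popc(__ballot_sync(~0u, num & 1) & laneMaskLt);
    idx += __popc(__ballot_sync(~0u, num & 2) & laneMaskLt) * 2;
    idx += __popc(__ballot_sync(~0u, num & 4) & laneMaskLt) * 4;
    return idx;  // sum of num over all lower lanes
}

The kernel additionally skips the two higher bit planes when no thread in the warp has more than one subtriangle; the sketch evaluates them unconditionally for clarity.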
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "../../framework.h"
#include "Buffer.hpp"
using namespace CR;
//------------------------------------------------------------------------
Buffer::Buffer(void)
: m_gpuPtr(NULL),
m_bytes (0)
{
// empty
}
Buffer::~Buffer(void)
{
if (m_gpuPtr)
NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
}
//------------------------------------------------------------------------
void Buffer::reset(size_t bytes)
{
if (bytes == m_bytes)
return;
if (m_gpuPtr)
{
NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
m_gpuPtr = NULL;
}
if (bytes > 0)
NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
m_bytes = bytes;
}
//------------------------------------------------------------------------
void Buffer::grow(size_t bytes)
{
if (bytes > m_bytes)
reset(bytes);
}
//------------------------------------------------------------------------
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "Defs.hpp"
namespace CR
{
//------------------------------------------------------------------------
class Buffer
{
public:
Buffer (void);
~Buffer (void);
void reset (size_t bytes);
void grow (size_t bytes);
void* getPtr (void) { return m_gpuPtr; }
size_t getSize (void) const { return m_bytes; }
void setPtr (void* ptr) { m_gpuPtr = ptr; }
private:
void* m_gpuPtr;
size_t m_bytes;
};
//------------------------------------------------------------------------
}
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
#define CR_MAXVIEWPORT_LOG2 11 // ViewportSize / PixelSize.
#define CR_SUBPIXEL_LOG2 4 // PixelSize / SubpixelSize.
#define CR_MAXBINS_LOG2 4 // ViewportSize / BinSize.
#define CR_BIN_LOG2 4 // BinSize / TileSize.
#define CR_TILE_LOG2 3 // TileSize / PixelSize.
#define CR_COVER8X8_LUT_SIZE 768 // 64-bit entries.
#define CR_FLIPBIT_FLIP_Y 2
#define CR_FLIPBIT_FLIP_X 3
#define CR_FLIPBIT_SWAP_XY 4
#define CR_FLIPBIT_COMPL 5
#define CR_BIN_STREAMS_LOG2 4
#define CR_BIN_SEG_LOG2 9 // 32-bit entries.
#define CR_TILE_SEG_LOG2 5 // 32-bit entries.
#define CR_MAXSUBTRIS_LOG2 24 // Triangle structs. Dictated by CoarseRaster.
#define CR_COARSE_QUEUE_LOG2 10 // Triangles.
#define CR_SETUP_WARPS 2
#define CR_SETUP_OPT_BLOCKS 8
#define CR_BIN_WARPS 16
#define CR_COARSE_WARPS 16 // Must be a power of two.
#define CR_FINE_MAX_WARPS 20
#define CR_EMBED_IMAGE_PARAMS 32 // Number of per-image parameter structs embedded in kernel launch parameter block.
//------------------------------------------------------------------------
#define CR_MAXVIEWPORT_SIZE (1 << CR_MAXVIEWPORT_LOG2)
#define CR_SUBPIXEL_SIZE (1 << CR_SUBPIXEL_LOG2)
#define CR_SUBPIXEL_SQR (1 << (CR_SUBPIXEL_LOG2 * 2))
#define CR_MAXBINS_SIZE (1 << CR_MAXBINS_LOG2)
#define CR_MAXBINS_SQR (1 << (CR_MAXBINS_LOG2 * 2))
#define CR_BIN_SIZE (1 << CR_BIN_LOG2)
#define CR_BIN_SQR (1 << (CR_BIN_LOG2 * 2))
#define CR_MAXTILES_LOG2 (CR_MAXBINS_LOG2 + CR_BIN_LOG2)
#define CR_MAXTILES_SIZE (1 << CR_MAXTILES_LOG2)
#define CR_MAXTILES_SQR (1 << (CR_MAXTILES_LOG2 * 2))
#define CR_TILE_SIZE (1 << CR_TILE_LOG2)
#define CR_TILE_SQR (1 << (CR_TILE_LOG2 * 2))
#define CR_BIN_STREAMS_SIZE (1 << CR_BIN_STREAMS_LOG2)
#define CR_BIN_SEG_SIZE (1 << CR_BIN_SEG_LOG2)
#define CR_TILE_SEG_SIZE (1 << CR_TILE_SEG_LOG2)
#define CR_MAXSUBTRIS_SIZE (1 << CR_MAXSUBTRIS_LOG2)
#define CR_COARSE_QUEUE_SIZE (1 << CR_COARSE_QUEUE_LOG2)
//------------------------------------------------------------------------
// When evaluating interpolated Z at pixel centers, we may introduce an error
// of (+-CR_LERP_ERROR) ULPs.
#define CR_LERP_ERROR(SAMPLES_LOG2) (2200u << (SAMPLES_LOG2))
#define CR_DEPTH_MIN CR_LERP_ERROR(3)
#define CR_DEPTH_MAX (CR_U32_MAX - CR_LERP_ERROR(3))
//------------------------------------------------------------------------
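With the defaults above, a tile is 2^3 = 8 pixels, a bin is 2^4 = 16 tiles = 128 pixels, and the maximum viewport is 2^4 = 16 bins = 2048 pixels per axis, matching CR_MAXVIEWPORT_LOG2 = 11 (4 + 4 + 3). A few illustrative compile-time checks of these relations (not present in the header itself):

//------------------------------------------------------------------------
// Illustrative compile-time checks relating the derived constants under
// the default values above.
//------------------------------------------------------------------------
static_assert(CR_TILE_SIZE == 8,  "tile is 8x8 pixels");
static_assert(CR_BIN_SIZE  == 16, "bin is 16x16 tiles, i.e. 128x128 pixels");
static_assert(CR_MAXBINS_SIZE * CR_BIN_SIZE * CR_TILE_SIZE == CR_MAXVIEWPORT_SIZE,
              "16 bins x 16 tiles x 8 pixels = 2048-pixel max viewport per axis");
static_assert(CR_MAXTILES_LOG2 == CR_MAXBINS_LOG2 + CR_BIN_LOG2, "tiles per axis");
static_assert(CR_BIN_SEG_SIZE  == 512, "a bin segment holds 512 32-bit entries (2 KB)");
static_assert(CR_TILE_SEG_SIZE == 32,  "a tile segment holds 32 32-bit entries (128 B)");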
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "Defs.hpp"
#include "../CudaRaster.hpp"
#include "RasterImpl.hpp"
using namespace CR;
//------------------------------------------------------------------------
// Stub interface implementation.
//------------------------------------------------------------------------
CudaRaster::CudaRaster()
{
m_impl = new RasterImpl();
}
CudaRaster::~CudaRaster()
{
delete m_impl;
}
void CudaRaster::setViewportSize(int width, int height, int numImages)
{
m_impl->setViewportSize(Vec3i(width, height, numImages));
}
void CudaRaster::setRenderModeFlags(U32 flags)
{
m_impl->setRenderModeFlags(flags);
}
void CudaRaster::deferredClear(U32 clearColor)
{
m_impl->deferredClear(clearColor);
}
void CudaRaster::setVertexBuffer(void* vertices, int numVertices)
{
m_impl->setVertexBuffer(vertices, numVertices);
}
void CudaRaster::setIndexBuffer(void* indices, int numTriangles)
{
m_impl->setIndexBuffer(indices, numTriangles);
}
bool CudaRaster::drawTriangles(const int* ranges, cudaStream_t stream)
{
return m_impl->drawTriangles((const Vec2i*)ranges, stream);
}
void* CudaRaster::getColorBuffer(void)
{
return m_impl->getColorBuffer();
}
void* CudaRaster::getDepthBuffer(void)
{
return m_impl->getDepthBuffer();
}
void CudaRaster::swapDepthAndPeel(void)
{
m_impl->swapDepthAndPeel();
}
//------------------------------------------------------------------------
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include <cuda_runtime.h>
#include <cstdint>
namespace CR
{
//------------------------------------------------------------------------
#ifndef NULL
# define NULL 0
#endif
#ifdef __CUDACC__
# define CR_CUDA 1
#else
# define CR_CUDA 0
#endif
#if CR_CUDA
# define CR_CUDA_FUNC __device__ __inline__
# define CR_CUDA_CONST __constant__
#else
# define CR_CUDA_FUNC inline
# define CR_CUDA_CONST static const
#endif
#define CR_UNREF(X) ((void)(X))
#define CR_ARRAY_SIZE(X) ((int)(sizeof(X) / sizeof((X)[0])))
//------------------------------------------------------------------------
typedef uint8_t U8;
typedef uint16_t U16;
typedef uint32_t U32;
typedef uint64_t U64;
typedef int8_t S8;
typedef int16_t S16;
typedef int32_t S32;
typedef int64_t S64;
typedef float F32;
typedef double F64;
typedef void (*FuncPtr)(void);
//------------------------------------------------------------------------
#define CR_U32_MAX (0xFFFFFFFFu)
#define CR_S32_MIN (~0x7FFFFFFF)
#define CR_S32_MAX (0x7FFFFFFF)
#define CR_U64_MAX ((U64)(S64)-1)
#define CR_S64_MIN ((S64)-1 << 63)
#define CR_S64_MAX (~((S64)-1 << 63))
#define CR_F32_MIN (1.175494351e-38f)
#define CR_F32_MAX (3.402823466e+38f)
#define CR_F64_MIN (2.2250738585072014e-308)
#define CR_F64_MAX (1.7976931348623158e+308)
//------------------------------------------------------------------------
// Misc types.
class Vec2i
{
public:
Vec2i(int x_, int y_) : x(x_), y(y_) {}
int x, y;
};
class Vec3i
{
public:
Vec3i(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
int x, y, z;
};
//------------------------------------------------------------------------
// CUDA utilities.
#if CR_CUDA
# define globalThreadIdx (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y)))
#endif
//------------------------------------------------------------------------
} // namespace CR
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// Utility funcs.
//------------------------------------------------------------------------
__device__ __inline__ void initTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth)
{
tileZMax = CR_DEPTH_MAX;
tileZUpd = (::min(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]) < tileZMax);
}
__device__ __inline__ void updateTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth, volatile U32* temp)
{
// Entry is warp-coherent.
if (__any_sync(~0u, tileZUpd))
{
U32 z = ::max(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]); __syncwarp();
temp[threadIdx.x + 16] = z; __syncwarp();
z = ::max(z, temp[threadIdx.x + 16 - 1]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
z = ::max(z, temp[threadIdx.x + 16 - 2]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
z = ::max(z, temp[threadIdx.x + 16 - 4]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
z = ::max(z, temp[threadIdx.x + 16 - 8]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
z = ::max(z, temp[threadIdx.x + 16 - 16]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
tileZMax = temp[47];
tileZUpd = false;
}
}
//------------------------------------------------------------------------
__device__ __inline__ void getTriangle(const CRParams& p, S32& triIdx, S32& dataIdx, uint4& triHeader, S32& segment)
{
const CRTriangleHeader* triHeaderPtr = (const CRTriangleHeader*)p.triHeader + blockIdx.z * p.maxSubtris;
const S32* tileSegData = (const S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
const S32* tileSegNext = (const S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
const S32* tileSegCount = (const S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
if (threadIdx.x >= tileSegCount[segment])
{
triIdx = -1;
dataIdx = -1;
}
else
{
int subtriIdx = tileSegData[segment * CR_TILE_SEG_SIZE + threadIdx.x];
triIdx = subtriIdx >> 3;
dataIdx = triIdx;
subtriIdx &= 7;
if (subtriIdx != 7)
dataIdx = triHeaderPtr[triIdx].misc + subtriIdx;
triHeader = *((uint4*)triHeaderPtr + dataIdx);
}
// advance to next segment
segment = tileSegNext[segment];
}
//------------------------------------------------------------------------
__device__ __inline__ bool earlyZCull(uint4 triHeader, U32 tileZMax)
{
U32 zmin = triHeader.w & 0xFFFFF000;
return (zmin > tileZMax);
}
//------------------------------------------------------------------------
__device__ __inline__ U64 trianglePixelCoverage(const CRParams& p, const uint4& triHeader, int tileX, int tileY, volatile U64* s_cover8x8_lut)
{
int baseX = (tileX << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.widthPixels - 1) << (CR_SUBPIXEL_LOG2 - 1));
int baseY = (tileY << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.heightPixels - 1) << (CR_SUBPIXEL_LOG2 - 1));
// extract S16 vertex positions while subtracting tile coordinates
S32 v0x = sub_s16lo_s16lo(triHeader.x, baseX);
S32 v0y = sub_s16hi_s16lo(triHeader.x, baseY);
S32 v01x = sub_s16lo_s16lo(triHeader.y, triHeader.x);
S32 v01y = sub_s16hi_s16hi(triHeader.y, triHeader.x);
S32 v20x = sub_s16lo_s16lo(triHeader.x, triHeader.z);
S32 v20y = sub_s16hi_s16hi(triHeader.x, triHeader.z);
// extract flipbits
U32 f01 = (triHeader.w >> 6) & 0x3C;
U32 f12 = (triHeader.w >> 2) & 0x3C;
U32 f20 = (triHeader.w << 2) & 0x3C;
// compute per-edge coverage masks
U64 c01, c12, c20;
c01 = cover8x8_exact_fast(v0x, v0y, v01x, v01y, f01, s_cover8x8_lut);
c12 = cover8x8_exact_fast(v0x + v01x, v0y + v01y, -v01x - v20x, -v01y - v20y, f12, s_cover8x8_lut);
c20 = cover8x8_exact_fast(v0x, v0y, v20x, v20y, f20, s_cover8x8_lut);
// combine masks
return c01 & c12 & c20;
}
//------------------------------------------------------------------------
__device__ __inline__ U32 scan32_value(U32 value, volatile U32* temp)
{
// Warp-wide inclusive prefix sum in shared memory. The first 16 elements of
// temp must be zero; the warp total ends up in temp[47] (see scan32_total).
__syncwarp();
temp[threadIdx.x + 16] = value; __syncwarp();
value += temp[threadIdx.x + 16 - 1]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
value += temp[threadIdx.x + 16 - 2]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
value += temp[threadIdx.x + 16 - 4]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
value += temp[threadIdx.x + 16 - 8]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
value += temp[threadIdx.x + 16 - 16]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
return value;
}
__device__ __inline__ volatile const U32& scan32_total(volatile U32* temp)
{
return temp[47];
}
//------------------------------------------------------------------------
__device__ __inline__ S32 findBit(U64 mask, int idx)
{
U32 x = getLo(mask);
int pop = __popc(x);
bool p = (pop <= idx);
if (p) x = getHi(mask);
if (p) idx -= pop;
int bit = p ? 32 : 0;
pop = __popc(x & 0x0000ffffu);
p = (pop <= idx);
if (p) x >>= 16;
if (p) bit += 16;
if (p) idx -= pop;
U32 tmp = x & 0x000000ffu;
pop = __popc(tmp);
p = (pop <= idx);
if (p) tmp = x & 0x0000ff00u;
if (p) idx -= pop;
return findLeadingOne(tmp) + bit - idx;
}
//------------------------------------------------------------------------
// Single-sample implementation.
//------------------------------------------------------------------------
__device__ __inline__ void executeROP(U32 color, U32 depth, volatile U32* pColor, volatile U32* pDepth, U32 ropMask)
{
// Lock-free depth test and color write: take the min depth atomically, then
// use atomicMax on the lane index to pick a single winner among the threads
// whose depth equals the post-update minimum, so exactly one lane writes color.
atomicMin((U32*)pDepth, depth);
__syncwarp(ropMask);
bool act = (depth == *pDepth);
__syncwarp(ropMask);
U32 actMask = __ballot_sync(ropMask, act);
if (act)
{
*pDepth = 0;
__syncwarp(actMask);
atomicMax((U32*)pDepth, threadIdx.x);
__syncwarp(actMask);
if (*pDepth == threadIdx.x)
{
*pDepth = depth;
*pColor = color;
}
__syncwarp(actMask);
}
}
//------------------------------------------------------------------------
__device__ __inline__ void fineRasterImpl(const CRParams p)
{
// for 20 warps:
__shared__ volatile U64 s_cover8x8_lut[CR_COVER8X8_LUT_SIZE]; // 6KB
__shared__ volatile U32 s_tileColor [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
__shared__ volatile U32 s_tileDepth [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
__shared__ volatile U32 s_tilePeel [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
__shared__ volatile U32 s_triDataIdx [CR_FINE_MAX_WARPS][64]; // 5KB CRTriangleData index
__shared__ volatile U64 s_triangleCov [CR_FINE_MAX_WARPS][64]; // 10KB coverage mask
__shared__ volatile U32 s_triangleFrag[CR_FINE_MAX_WARPS][64]; // 5KB fragment index
__shared__ volatile U32 s_temp [CR_FINE_MAX_WARPS][80]; // 6.25KB
// = 47.25KB total
CRAtomics& atomics = p.atomics[blockIdx.z];
const CRTriangleData* triData = (const CRTriangleData*)p.triData + blockIdx.z * p.maxSubtris;
const S32* activeTiles = (const S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
const S32* tileFirstSeg = (const S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
volatile U32* tileColor = s_tileColor[threadIdx.y];
volatile U32* tileDepth = s_tileDepth[threadIdx.y];
volatile U32* tilePeel = s_tilePeel[threadIdx.y];
volatile U32* triDataIdx = s_triDataIdx[threadIdx.y];
volatile U64* triangleCov = s_triangleCov[threadIdx.y];
volatile U32* triangleFrag = s_triangleFrag[threadIdx.y];
volatile U32* temp = s_temp[threadIdx.y];
if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs || atomics.numTileSegs > p.maxTileSegs)
return;
temp[threadIdx.x] = 0; // first 16 elements of temp are always zero
cover8x8_setupLUT(s_cover8x8_lut);
__syncthreads();
// loop over tiles
for (;;)
{
// pick a tile
if (threadIdx.x == 0)
temp[16] = atomicAdd(&atomics.fineCounter, 1);
__syncwarp();
int activeIdx = temp[16];
if (activeIdx >= atomics.numActiveTiles)
break;
int tileIdx = activeTiles[activeIdx];
S32 segment = tileFirstSeg[tileIdx];
int tileY = tileIdx / p.widthTiles;
int tileX = tileIdx - tileY * p.widthTiles;
int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
// initialize per-tile state
int triRead = 0, triWrite = 0;
int fragRead = 0, fragWrite = 0;
if (threadIdx.x == 0)
triangleFrag[63] = 0; // "previous triangle"
// deferred clear => clear tile
if (p.deferredClear)
{
tileColor[threadIdx.x] = p.clearColor;
tileDepth[threadIdx.x] = p.clearDepth;
tileColor[threadIdx.x + 32] = p.clearColor;
tileDepth[threadIdx.x + 32] = p.clearDepth;
}
else // otherwise => read tile from framebuffer
{
U32* pColor = (U32*)p.colorBuffer + p.widthPixels * p.heightPixels * blockIdx.z;
U32* pDepth = (U32*)p.depthBuffer + p.widthPixels * p.heightPixels * blockIdx.z;
tileColor[threadIdx.x] = pColor[px + p.widthPixels * py];
tileDepth[threadIdx.x] = pDepth[px + p.widthPixels * py];
tileColor[threadIdx.x + 32] = pColor[px + p.widthPixels * (py + 4)];
tileDepth[threadIdx.x + 32] = pDepth[px + p.widthPixels * (py + 4)];
}
// read peeling inputs if enabled
if (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling)
{
U32* pPeel = (U32*)p.peelBuffer + p.widthPixels * p.heightPixels * blockIdx.z;
tilePeel[threadIdx.x] = pPeel[px + p.widthPixels * py];
tilePeel[threadIdx.x + 32] = pPeel[px + p.widthPixels * (py + 4)];
}
U32 tileZMax;
bool tileZUpd;
initTileZMax(tileZMax, tileZUpd, tileDepth);
// process fragments
for(;;)
{
// need to queue more fragments?
if (fragWrite - fragRead < 32 && segment >= 0)
{
// update tile z - coherent over warp
updateTileZMax(tileZMax, tileZUpd, tileDepth, temp);
// read triangles
do
{
// read triangle index and data, advance to next segment
S32 triIdx, dataIdx;
uint4 triHeader;
getTriangle(p, triIdx, dataIdx, triHeader, segment);
// early z cull
if (triIdx >= 0 && earlyZCull(triHeader, tileZMax))
triIdx = -1;
// determine coverage
U64 coverage = trianglePixelCoverage(p, triHeader, tileX, tileY, s_cover8x8_lut);
S32 pop = (triIdx == -1) ? 0 : __popcll(coverage);
// fragment count scan
U32 frag = scan32_value(pop, temp);
frag += fragWrite; // frag now holds cumulative fragment count
fragWrite += scan32_total(temp);
// queue non-empty triangles
U32 goodMask = __ballot_sync(~0u, pop != 0);
if (pop != 0)
{
int idx = (triWrite + __popc(goodMask & getLaneMaskLt())) & 63;
triDataIdx [idx] = dataIdx;
triangleFrag[idx] = frag;
triangleCov [idx] = coverage;
}
triWrite += __popc(goodMask);
}
while (fragWrite - fragRead < 32 && segment >= 0);
}
__syncwarp();
// end of segment?
if (fragRead == fragWrite)
break;
// clear triangle boundaries
temp[threadIdx.x + 16] = 0;
__syncwarp();
// tag triangle boundaries
if (triRead + threadIdx.x < triWrite)
{
int idx = triangleFrag[(triRead + threadIdx.x) & 63] - fragRead;
if (idx <= 32)
temp[idx + 16 - 1] = 1;
}
__syncwarp();
int ropLaneIdx = threadIdx.x;
U32 boundaryMask = __ballot_sync(~0u, temp[ropLaneIdx + 16]);
// distribute fragments
bool hasFragment = (ropLaneIdx < fragWrite - fragRead);
U32 fragmentMask = __ballot_sync(~0u, hasFragment);
if (hasFragment)
{
int triBufIdx = (triRead + __popc(boundaryMask & getLaneMaskLt())) & 63;
int fragIdx = add_sub(fragRead, ropLaneIdx, triangleFrag[(triBufIdx - 1) & 63]);
U64 coverage = triangleCov[triBufIdx];
int pixelInTile = findBit(coverage, fragIdx);
int dataIdx = triDataIdx[triBufIdx];
// determine pixel position
U32 pixelX = (tileX << CR_TILE_LOG2) + (pixelInTile & 7);
U32 pixelY = (tileY << CR_TILE_LOG2) + (pixelInTile >> 3);
// depth test
U32 depth = 0;
uint4 td = *((uint4*)triData + dataIdx * (sizeof(CRTriangleData) >> 4));
depth = td.x * pixelX + td.y * pixelY + td.z;
bool zkill = (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) && (depth <= tilePeel[pixelInTile]);
if (!zkill)
{
U32 oldDepth = tileDepth[pixelInTile];
if (depth > oldDepth)
zkill = true;
else if (oldDepth == tileZMax)
tileZUpd = true; // we are replacing previous zmax => need to update
}
U32 ropMask = __ballot_sync(fragmentMask, !zkill);
if (!zkill)
executeROP(td.w, depth, &tileColor[pixelInTile], &tileDepth[pixelInTile], ropMask);
}
// no need to sync, as next up is updateTileZMax that does internal warp sync
// update counters
fragRead = ::min(fragRead + 32, fragWrite);
triRead += __popc(boundaryMask);
}
// Write tile back to the framebuffer.
if (true)
{
int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
U32* pColor = (U32*)p.colorBuffer + p.widthPixels * p.heightPixels * blockIdx.z;
U32* pDepth = (U32*)p.depthBuffer + p.widthPixels * p.heightPixels * blockIdx.z;
pColor[px + p.widthPixels * py] = tileColor[threadIdx.x];
pDepth[px + p.widthPixels * py] = tileDepth[threadIdx.x];
pColor[px + p.widthPixels * (py + 4)] = tileColor[threadIdx.x + 32];
pDepth[px + p.widthPixels * (py + 4)] = tileDepth[threadIdx.x + 32];
}
}
}
//------------------------------------------------------------------------
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "Defs.hpp"
#include "Constants.hpp"
namespace CR
{
//------------------------------------------------------------------------
// Projected triangle.
//------------------------------------------------------------------------
struct CRTriangleHeader
{
S16 v0x; // Subpixels relative to viewport center. Valid if triSubtris = 1.
S16 v0y;
S16 v1x;
S16 v1y;
S16 v2x;
S16 v2y;
U32 misc; // triSubtris=1: (zmin:20, f01:4, f12:4, f20:4), triSubtris>=2: (subtriBase)
};
//------------------------------------------------------------------------
struct CRTriangleData
{
U32 zx; // zx * sampleX + zy * sampleY + zb = lerp(CR_DEPTH_MIN, CR_DEPTH_MAX, (clipZ / clipW + 1) / 2)
U32 zy;
U32 zb;
U32 id; // Triangle id.
};
//------------------------------------------------------------------------
// Device-side structures.
//------------------------------------------------------------------------
struct CRAtomics
{
// Setup.
S32 numSubtris; // = numTris
// Bin.
S32 binCounter; // = 0
S32 numBinSegs; // = 0
// Coarse.
S32 coarseCounter; // = 0
S32 numTileSegs; // = 0
S32 numActiveTiles; // = 0
// Fine.
S32 fineCounter; // = 0
};
//------------------------------------------------------------------------
struct CRImageParams
{
S32 triOffset; // First triangle index to draw.
S32 triCount; // Number of triangles to draw.
S32 binBatchSize; // Number of triangles per batch.
};
//------------------------------------------------------------------------
struct CRParams
{
// Common.
CRAtomics* atomics; // Work counters. Per-image.
S32 numImages; // Batch size.
S32 totalCount; // In range mode, total number of triangles to render.
S32 instanceMode; // 0 = range mode, 1 = instance mode.
S32 numVertices; // Number of vertices in input buffer, not counting multiples in instance mode.
S32 numTriangles; // Number of triangles in input buffer.
void* vertexBuffer; // numVertices * float4(x, y, z, w)
void* indexBuffer; // numTriangles * int3(vi0, vi1, vi2)
S32 widthPixels; // Viewport size in pixels. Must be multiple of tile size (8x8).
S32 heightPixels;
S32 widthBins; // widthPixels / CR_BIN_SIZE
S32 heightBins; // heightPixels / CR_BIN_SIZE
S32 numBins; // widthBins * heightBins
S32 widthTiles; // widthPixels / CR_TILE_SIZE
S32 heightTiles; // heightPixels / CR_TILE_SIZE
S32 numTiles; // widthTiles * heightTiles
U32 renderModeFlags;
S32 deferredClear; // 1 = Clear framebuffer before rendering triangles.
U32 clearColor;
U32 clearDepth;
// These are uniform across batch.
S32 maxSubtris;
S32 maxBinSegs;
S32 maxTileSegs;
// Setup output / bin input.
void* triSubtris; // maxSubtris * U8
void* triHeader; // maxSubtris * CRTriangleHeader
void* triData; // maxSubtris * CRTriangleData
// Bin output / coarse input.
void* binSegData; // maxBinSegs * CR_BIN_SEG_SIZE * S32
void* binSegNext; // maxBinSegs * S32
void* binSegCount; // maxBinSegs * S32
void* binFirstSeg; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 segIdx), -1 = none
void* binTotal; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 numTris)
// Coarse output / fine input.
void* tileSegData; // maxTileSegs * CR_TILE_SEG_SIZE * S32
void* tileSegNext; // maxTileSegs * S32
void* tileSegCount; // maxTileSegs * S32
void* activeTiles; // CR_MAXTILES_SQR * (S32 tileIdx)
void* tileFirstSeg; // CR_MAXTILES_SQR * (S32 segIdx), -1 = none
// Surface buffers.
void* colorBuffer; // sizePixels.x * sizePixels.y * numImages * U32
void* depthBuffer; // sizePixels.x * sizePixels.y * numImages * U32
void* peelBuffer; // sizePixels.x * sizePixels.y * numImages * U32, only if peeling enabled.
// Per-image parameters for the first images are embedded here to avoid an extra memcpy for small batches.
CRImageParams imageParamsFirst[CR_EMBED_IMAGE_PARAMS];
const CRImageParams* imageParamsExtra; // After CR_EMBED_IMAGE_PARAMS.
};
//------------------------------------------------------------------------
}
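The zx/zy/zb fields of CRTriangleData define a fixed-point depth plane: as noted in the struct comment, the interpolated depth at a sample is zx * sampleX + zy * sampleY + zb, with clip-space z/w in [-1, 1] remapped to [CR_DEPTH_MIN, CR_DEPTH_MAX]. A hedged sketch of evaluating it at a pixel, mirroring the expression used in the fine rasterizer's inner loop:

//------------------------------------------------------------------------
// Illustrative sketch (not part of the headers above): evaluating the
// fixed-point depth plane stored in CRTriangleData at a given pixel.
//------------------------------------------------------------------------
CR_CUDA_FUNC CR::U32 evalTriangleDepth(const CR::CRTriangleData& td, CR::U32 pixelX, CR::U32 pixelY)
{
    // Unsigned wrap-around arithmetic is intentional; inside the triangle the
    // result stays within [CR_DEPTH_MIN, CR_DEPTH_MAX] up to CR_LERP_ERROR ULPs.
    return td.zx * pixelX + td.zy * pixelY + td.zb;
}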
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "../../framework.h"
#include "PrivateDefs.hpp"
#include "Constants.hpp"
#include "RasterImpl.hpp"
#include <cuda_runtime.h>
using namespace CR;
using std::min;
using std::max;
//------------------------------------------------------------------------
// Kernel prototypes and variables.
void triangleSetupKernel (const CRParams p);
void binRasterKernel (const CRParams p);
void coarseRasterKernel (const CRParams p);
void fineRasterKernel (const CRParams p);
//------------------------------------------------------------------------
RasterImpl::RasterImpl(void)
: m_renderModeFlags (0),
m_deferredClear (false),
m_clearColor (0),
m_vertexPtr (NULL),
m_indexPtr (NULL),
m_numVertices (0),
m_numTriangles (0),
m_bufferSizesReported (0),
m_numImages (0),
m_sizePixels (0, 0),
m_sizeBins (0, 0),
m_numBins (0),
m_sizeTiles (0, 0),
m_numTiles (0),
m_numSMs (1),
m_numCoarseBlocksPerSM (1),
m_numFineBlocksPerSM (1),
m_numFineWarpsPerBlock (1),
m_maxSubtris (1),
m_maxBinSegs (1),
m_maxTileSegs (1)
{
// Query relevant device attributes.
int currentDevice = 0;
NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&currentDevice));
NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&m_numSMs, cudaDevAttrMultiProcessorCount, currentDevice));
cudaFuncAttributes attr;
NVDR_CHECK_CUDA_ERROR(cudaFuncGetAttributes(&attr, (void*)fineRasterKernel));
m_numFineWarpsPerBlock = min(attr.maxThreadsPerBlock / 32, CR_FINE_MAX_WARPS);
NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numCoarseBlocksPerSM, (void*)coarseRasterKernel, 32 * CR_COARSE_WARPS, 0));
NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numFineBlocksPerSM, (void*)fineRasterKernel, 32 * m_numFineWarpsPerBlock, 0));
// Setup functions.
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)triangleSetupKernel, cudaFuncCachePreferShared));
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)binRasterKernel, cudaFuncCachePreferShared));
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)coarseRasterKernel, cudaFuncCachePreferShared));
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)fineRasterKernel, cudaFuncCachePreferShared));
}
//------------------------------------------------------------------------
RasterImpl::~RasterImpl(void)
{
// Empty.
}
//------------------------------------------------------------------------
void RasterImpl::setViewportSize(Vec3i size)
{
if ((size.x | size.y) & (CR_TILE_SIZE - 1))
return; // Invalid size.
m_numImages = size.z;
m_sizePixels = Vec2i(size.x, size.y);
m_sizeTiles.x = m_sizePixels.x >> CR_TILE_LOG2;
m_sizeTiles.y = m_sizePixels.y >> CR_TILE_LOG2;
m_numTiles = m_sizeTiles.x * m_sizeTiles.y;
m_sizeBins.x = (m_sizeTiles.x + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
m_sizeBins.y = (m_sizeTiles.y + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
m_numBins = m_sizeBins.x * m_sizeBins.y;
m_colorBuffer.reset(m_sizePixels.x * m_sizePixels.y * m_numImages * sizeof(U32));
m_depthBuffer.reset(m_sizePixels.x * m_sizePixels.y * m_numImages * sizeof(U32));
}
void RasterImpl::swapDepthAndPeel(void)
{
m_peelBuffer.reset(m_depthBuffer.getSize()); // Ensure equal size and valid pointer.
void* tmp = m_depthBuffer.getPtr();
m_depthBuffer.setPtr(m_peelBuffer.getPtr());
m_peelBuffer.setPtr(tmp);
}
//------------------------------------------------------------------------
bool RasterImpl::drawTriangles(const Vec2i* ranges, cudaStream_t stream)
{
bool instanceMode = (!ranges);
int maxSubtrisSlack = 4096; // x 81B = 324KB
int maxBinSegsSlack = 256; // x 2137B = 534KB
int maxTileSegsSlack = 4096; // x 136B = 544KB
// Resize atomics as needed.
m_crAtomics .grow(m_numImages * sizeof(CRAtomics));
// Size of these buffers doesn't depend on input.
m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
m_binTotal .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
m_activeTiles .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
m_tileFirstSeg .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
// Construct per-image parameters and determine worst-case buffer sizes.
std::vector<CRImageParams> imageParams(m_numImages);
for (int i=0; i < m_numImages; i++)
{
CRImageParams& ip = imageParams[i];
int roundSize = CR_BIN_WARPS * 32;
int minBatches = CR_BIN_STREAMS_SIZE * 2;
int maxRounds = 32;
ip.triOffset = instanceMode ? 0 : ranges[i].x;
ip.triCount = instanceMode ? m_numTriangles : ranges[i].y;
ip.binBatchSize = min(max(ip.triCount / (roundSize * minBatches), 1), maxRounds) * roundSize;
m_maxSubtris = max(m_maxSubtris, min(ip.triCount + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
m_maxBinSegs = max(m_maxBinSegs, max(m_numBins * CR_BIN_STREAMS_SIZE, (ip.triCount - 1) / CR_BIN_SEG_SIZE + 1) + maxBinSegsSlack);
m_maxTileSegs = max(m_maxTileSegs, max(m_numTiles, (ip.triCount - 1) / CR_TILE_SEG_SIZE + 1) + maxTileSegsSlack);
}
// Retry until successful.
for (;;)
{
// Allocate buffers.
m_triSubtris.reset(m_numImages * m_maxSubtris * sizeof(U8));
m_triHeader .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleHeader));
m_triData .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleData));
m_binSegData .reset(m_numImages * m_maxBinSegs * CR_BIN_SEG_SIZE * sizeof(S32));
m_binSegNext .reset(m_numImages * m_maxBinSegs * sizeof(S32));
m_binSegCount.reset(m_numImages * m_maxBinSegs * sizeof(S32));
m_tileSegData .reset(m_numImages * m_maxTileSegs * CR_TILE_SEG_SIZE * sizeof(S32));
m_tileSegNext .reset(m_numImages * m_maxTileSegs * sizeof(S32));
m_tileSegCount.reset(m_numImages * m_maxTileSegs * sizeof(S32));
// Report if buffers grow from last time.
size_t sizesTotal = getTotalBufferSizes();
if (sizesTotal > m_bufferSizesReported)
{
size_t sizesMB = ((sizesTotal - 1) >> 20) + 1; // Round up.
sizesMB = ((sizesMB + 9) / 10) * 10; // 10MB granularity enough in this day and age.
LOG(INFO) << "Internal buffers grown to " << sizesMB << " MB";
m_bufferSizesReported = sizesMB << 20;
}
// Launch stages.
launchStages(&imageParams[0], instanceMode, stream);
// Get atomics.
std::vector<CRAtomics> atomics(m_numImages);
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(&atomics[0], m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
// Success?
bool failed = false;
for (int i=0; i < m_numImages; i++)
{
const CRAtomics& a = atomics[i];
failed = failed || (a.numSubtris > m_maxSubtris) || (a.numBinSegs > m_maxBinSegs) || (a.numTileSegs > m_maxTileSegs);
}
if (!failed)
break; // Success!
// If we were already at maximum capacity, no can do.
if (m_maxSubtris == CR_MAXSUBTRIS_SIZE)
return false;
// Enlarge buffers and try again.
for (int i=0; i < m_numImages; i++)
{
const CRAtomics& a = atomics[i];
m_maxSubtris = max(m_maxSubtris, min(a.numSubtris + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
m_maxBinSegs = max(m_maxBinSegs, a.numBinSegs + maxBinSegsSlack);
m_maxTileSegs = max(m_maxTileSegs, a.numTileSegs + maxTileSegsSlack);
}
}
m_deferredClear = false;
return true; // Success.
}
//------------------------------------------------------------------------
size_t RasterImpl::getTotalBufferSizes(void) const
{
return
m_colorBuffer.getSize() + m_depthBuffer.getSize() + // Don't include atomics and image params.
m_triSubtris.getSize() + m_triHeader.getSize() + m_triData.getSize() +
m_binFirstSeg.getSize() + m_binTotal.getSize() + m_binSegData.getSize() + m_binSegNext.getSize() + m_binSegCount.getSize() +
m_activeTiles.getSize() + m_tileFirstSeg.getSize() + m_tileSegData.getSize() + m_tileSegNext.getSize() + m_tileSegCount.getSize();
}
//------------------------------------------------------------------------
void RasterImpl::launchStages(const CRImageParams* imageParams, bool instanceMode, cudaStream_t stream)
{
// Initialize atomics to mostly zero.
{
std::vector<CRAtomics> atomics(m_numImages);
memset(&atomics[0], 0, m_numImages * sizeof(CRAtomics));
for (int i=0; i < m_numImages; i++)
atomics[i].numSubtris = imageParams[i].triCount;
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), &atomics[0], m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
}
// Copy per-image parameters if more than fit in the kernel launch parameter block.
if (m_numImages > CR_EMBED_IMAGE_PARAMS)
{
int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS;
m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams));
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crImageParamsExtra.getPtr(), imageParams + CR_EMBED_IMAGE_PARAMS, numImageParamsExtra * sizeof(CRImageParams), cudaMemcpyHostToDevice, stream));
}
// Set global parameters.
CRParams p;
{
p.atomics = (CRAtomics*)m_crAtomics.getPtr();
p.numImages = m_numImages;
p.totalCount = 0; // Only relevant in range mode.
p.instanceMode = instanceMode ? 1 : 0;
p.numVertices = m_numVertices;
p.numTriangles = m_numTriangles;
p.vertexBuffer = m_vertexPtr;
p.indexBuffer = m_indexPtr;
p.widthPixels = m_sizePixels.x;
p.heightPixels = m_sizePixels.y;
p.widthBins = m_sizeBins.x;
p.heightBins = m_sizeBins.y;
p.numBins = m_numBins;
p.widthTiles = m_sizeTiles.x;
p.heightTiles = m_sizeTiles.y;
p.numTiles = m_numTiles;
p.renderModeFlags = m_renderModeFlags;
p.deferredClear = m_deferredClear ? 1 : 0;
p.clearColor = m_clearColor;
p.clearDepth = CR_DEPTH_MAX;
p.maxSubtris = m_maxSubtris;
p.maxBinSegs = m_maxBinSegs;
p.maxTileSegs = m_maxTileSegs;
p.triSubtris = m_triSubtris.getPtr();
p.triHeader = m_triHeader.getPtr();
p.triData = m_triData.getPtr();
p.binSegData = m_binSegData.getPtr();
p.binSegNext = m_binSegNext.getPtr();
p.binSegCount = m_binSegCount.getPtr();
p.binFirstSeg = m_binFirstSeg.getPtr();
p.binTotal = m_binTotal.getPtr();
p.tileSegData = m_tileSegData.getPtr();
p.tileSegNext = m_tileSegNext.getPtr();
p.tileSegCount = m_tileSegCount.getPtr();
p.activeTiles = m_activeTiles.getPtr();
p.tileFirstSeg = m_tileFirstSeg.getPtr();
p.colorBuffer = m_colorBuffer.getPtr();
p.depthBuffer = m_depthBuffer.getPtr();
p.peelBuffer = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? m_peelBuffer.getPtr() : 0;
memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams));
p.imageParamsExtra = (CRImageParams*)m_crImageParamsExtra.getPtr();
}
// Setup block sizes.
dim3 brBlock(32, CR_BIN_WARPS);
dim3 crBlock(32, CR_COARSE_WARPS);
dim3 frBlock(32, m_numFineWarpsPerBlock);
// Launch stages.
void* args[] = {&p};
if (instanceMode)
{
int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
}
else
{
for (int i=0; i < m_numImages; i++)
p.totalCount += imageParams[i].triCount;
int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
}
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream));
}
//------------------------------------------------------------------------
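As a concrete instance of the binBatchSize heuristic in RasterImpl::drawTriangles() above, using the defaults from Constants.hpp (CR_BIN_WARPS = 16 and CR_BIN_STREAMS_SIZE = 16, so roundSize = 512 and minBatches = 32): for an image with 100,000 triangles, triCount / (roundSize * minBatches) = 100000 / 16384 = 6, which clamped to [1, 32] and multiplied by roundSize gives a batch of 3072 triangles per atomic grab in the bin rasterizer. Tiny inputs bottom out at a single round of 512 triangles, and very large ones cap at 32 rounds, i.e. 16,384 triangles per batch.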
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "../CudaRaster.hpp"
#include "PrivateDefs.hpp"
#include "Constants.hpp"
#include "Util.inl"
namespace CR
{
//------------------------------------------------------------------------
// Stage implementations.
//------------------------------------------------------------------------
#include "TriangleSetup.inl"
#include "BinRaster.inl"
#include "CoarseRaster.inl"
#include "FineRaster.inl"
}
//------------------------------------------------------------------------
// Stage entry points.
//------------------------------------------------------------------------
__global__ void __launch_bounds__(CR_SETUP_WARPS * 32, CR_SETUP_OPT_BLOCKS) triangleSetupKernel (const CR::CRParams p) { CR::triangleSetupImpl(p); }
__global__ void __launch_bounds__(CR_BIN_WARPS * 32, 1) binRasterKernel (const CR::CRParams p) { CR::binRasterImpl(p); }
__global__ void __launch_bounds__(CR_COARSE_WARPS * 32, 1) coarseRasterKernel (const CR::CRParams p) { CR::coarseRasterImpl(p); }
__global__ void __launch_bounds__(CR_FINE_MAX_WARPS * 32, 1) fineRasterKernel (const CR::CRParams p) { CR::fineRasterImpl(p); }
//------------------------------------------------------------------------
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "PrivateDefs.hpp"
#include "Buffer.hpp"
#include "../CudaRaster.hpp"
namespace CR
{
//------------------------------------------------------------------------
class RasterImpl
{
public:
RasterImpl (void);
~RasterImpl (void);
void setViewportSize (Vec3i size); // Must be multiple of tile size (8x8).
void setRenderModeFlags (U32 flags) { m_renderModeFlags = flags; }
void deferredClear (U32 color) { m_deferredClear = true; m_clearColor = color; }
void setVertexBuffer (void* ptr, int numVertices) { m_vertexPtr = ptr; m_numVertices = numVertices; } // GPU pointer.
void setIndexBuffer (void* ptr, int numTriangles) { m_indexPtr = ptr; m_numTriangles = numTriangles; } // GPU pointer.
bool drawTriangles (const Vec2i* ranges, cudaStream_t stream);
void* getColorBuffer (void) { return m_colorBuffer.getPtr(); } // GPU pointer.
void* getDepthBuffer (void) { return m_depthBuffer.getPtr(); } // GPU pointer.
void swapDepthAndPeel (void);
size_t getTotalBufferSizes (void) const;
private:
void launchStages (const CRImageParams* imageParams, bool instanceMode, cudaStream_t stream);
// State.
unsigned int m_renderModeFlags;
bool m_deferredClear;
unsigned int m_clearColor;
void* m_vertexPtr;
void* m_indexPtr;
int m_numVertices; // Input buffer size.
int m_numTriangles; // Input buffer size.
size_t m_bufferSizesReported; // Previously reported buffer sizes.
// Surfaces.
Buffer m_colorBuffer;
Buffer m_depthBuffer;
Buffer m_peelBuffer;
int m_numImages;
Vec2i m_sizePixels;
Vec2i m_sizeBins;
S32 m_numBins;
Vec2i m_sizeTiles;
S32 m_numTiles;
// Launch sizes etc.
S32 m_numSMs;
S32 m_numCoarseBlocksPerSM;
S32 m_numFineBlocksPerSM;
S32 m_numFineWarpsPerBlock;
// Global intermediate buffers. Individual images have offsets to these.
Buffer m_crAtomics;
Buffer m_crImageParamsExtra;
Buffer m_triSubtris;
Buffer m_triHeader;
Buffer m_triData;
Buffer m_binFirstSeg;
Buffer m_binTotal;
Buffer m_binSegData;
Buffer m_binSegNext;
Buffer m_binSegCount;
Buffer m_activeTiles;
Buffer m_tileFirstSeg;
Buffer m_tileSegData;
Buffer m_tileSegNext;
Buffer m_tileSegCount;
// Actual buffer sizes.
S32 m_maxSubtris;
S32 m_maxBinSegs;
S32 m_maxTileSegs;
};
//------------------------------------------------------------------------
} // namespace CR
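//------------------------------------------------------------------------
// Illustrative sketch only (not part of the library): how a caller might drive
// RasterImpl based solely on the interface above. The device pointers, sizes,
// the stream, the assumption that Vec3i is constructible from three ints
// (width, height, numImages), and the assumption that a NULL range pointer
// means "draw all triangles" come from the caller, not from code shown here.
static inline bool exampleDrawOnePass(CR::RasterImpl& rast,
                                      void* d_vertices, int numVertices,  // float4 clip-space positions (GPU).
                                      void* d_indices, int numTriangles,  // Triangle index buffer (GPU).
                                      cudaStream_t stream)
{
    rast.setViewportSize(CR::Vec3i(1024, 768, 1)); // Width/height chosen as multiples of the 8x8 tile size.
    rast.setVertexBuffer(d_vertices, numVertices);
    rast.setIndexBuffer(d_indices, numTriangles);
    rast.deferredClear(0u);                        // Clear color and depth at the start of the draw.
    return rast.drawTriangles(nullptr, stream);    // false is assumed to signal an internal buffer overflow.
}
//------------------------------------------------------------------------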
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
__device__ __inline__ void snapTriangle(
const CRParams& p,
float4 v0, float4 v1, float4 v2,
int2& p0, int2& p1, int2& p2, float3& rcpW, int2& lo, int2& hi)
{
F32 viewScaleX = (F32)(p.widthPixels << (CR_SUBPIXEL_LOG2 - 1));
F32 viewScaleY = (F32)(p.heightPixels << (CR_SUBPIXEL_LOG2 - 1));
rcpW = make_float3(1.0f / v0.w, 1.0f / v1.w, 1.0f / v2.w);
p0 = make_int2(f32_to_s32_sat(v0.x * rcpW.x * viewScaleX), f32_to_s32_sat(v0.y * rcpW.x * viewScaleY));
p1 = make_int2(f32_to_s32_sat(v1.x * rcpW.y * viewScaleX), f32_to_s32_sat(v1.y * rcpW.y * viewScaleY));
p2 = make_int2(f32_to_s32_sat(v2.x * rcpW.z * viewScaleX), f32_to_s32_sat(v2.y * rcpW.z * viewScaleY));
lo = make_int2(min_min(p0.x, p1.x, p2.x), min_min(p0.y, p1.y, p2.y));
hi = make_int2(max_max(p0.x, p1.x, p2.x), max_max(p0.y, p1.y, p2.y));
}
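//------------------------------------------------------------------------
// Note (editorial): snapTriangle() converts clip-space positions to signed
// fixed-point subpixel coordinates: x/w is scaled by widthPixels/2, expressed
// in units of 1/2^CR_SUBPIXEL_LOG2 of a pixel, with the origin at the viewport
// center. The shift back to corner-origin coordinates happens later via the
// biasX/biasY terms in prepareTriangle() and setupTriangle().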
//------------------------------------------------------------------------
__device__ __inline__ U32 cover8x8_selectFlips(S32 dx, S32 dy) // 10 instr
{
U32 flips = 0;
if (dy > 0 || (dy == 0 && dx <= 0))
flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y) ^ (1 << CR_FLIPBIT_COMPL);
if (dx > 0)
flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y);
if (::abs(dx) < ::abs(dy))
flips ^= (1 << CR_FLIPBIT_SWAP_XY) ^ (1 << CR_FLIPBIT_FLIP_Y);
return flips;
}
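//------------------------------------------------------------------------
// Note (editorial, hedged): the flip bits appear to mirror/swap each edge into
// a canonical orientation (|dx| >= |dy| with fixed sign conventions) so that
// the 8x8 coverage-mask computation only has to handle one octant; the same
// bits presumably let the consumer undo the transform when applying the
// resulting mask.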
//------------------------------------------------------------------------
__device__ __inline__ bool prepareTriangle(
const CRParams& p,
int2 p0, int2 p1, int2 p2, int2 lo, int2 hi,
int2& d1, int2& d2, S32& area)
{
// Backfacing or degenerate => cull.
d1 = make_int2(p1.x - p0.x, p1.y - p0.y);
d2 = make_int2(p2.x - p0.x, p2.y - p0.y);
area = d1.x * d2.y - d1.y * d2.x;
if (area == 0)
return false; // Degenerate.
if (area < 0 && (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableBackfaceCulling) != 0)
return false; // Backfacing.
// AABB falls between samples => cull.
int sampleSize = 1 << CR_SUBPIXEL_LOG2;
int biasX = (p.widthPixels << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1);
int biasY = (p.heightPixels << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1);
int lox = (int)add_add(lo.x, sampleSize - 1, biasX) & -sampleSize;
int loy = (int)add_add(lo.y, sampleSize - 1, biasY) & -sampleSize;
int hix = (hi.x + biasX) & -sampleSize;
int hiy = (hi.y + biasY) & -sampleSize;
if (lox > hix || loy > hiy)
return false; // Between pixels.
// AABB covers 1 or 2 samples => cull if they are not covered.
int diff = add_sub(hix, hiy, lox) - loy;
if (diff <= sampleSize)
{
int2 t0 = make_int2(add_sub(p0.x, biasX, lox), add_sub(p0.y, biasY, loy));
int2 t1 = make_int2(add_sub(p1.x, biasX, lox), add_sub(p1.y, biasY, loy));
int2 t2 = make_int2(add_sub(p2.x, biasX, lox), add_sub(p2.y, biasY, loy));
S32 e0 = t0.x * t1.y - t0.y * t1.x;
S32 e1 = t1.x * t2.y - t1.y * t2.x;
S32 e2 = t2.x * t0.y - t2.y * t0.x;
if (e0 < 0 || e1 < 0 || e2 < 0)
{
if (diff == 0)
return false; // Between pixels.
t0 = make_int2(add_sub(p0.x, biasX, hix), add_sub(p0.y, biasY, hiy));
t1 = make_int2(add_sub(p1.x, biasX, hix), add_sub(p1.y, biasY, hiy));
t2 = make_int2(add_sub(p2.x, biasX, hix), add_sub(p2.y, biasY, hiy));
e0 = t0.x * t1.y - t0.y * t1.x;
e1 = t1.x * t2.y - t1.y * t2.x;
e2 = t2.x * t0.y - t2.y * t0.x;
if (e0 < 0 || e1 < 0 || e2 < 0)
return false; // Between pixels.
}
}
// Otherwise => proceed to output the triangle.
return true; // Visible.
}
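//------------------------------------------------------------------------
// Note (editorial): the AABB test above rounds the lower bound up and the
// upper bound down to the nearest sample positions (the '& -sampleSize'
// masks); if the rounded bounds cross, no sample center can lie inside the
// box. When the box spans only one or two candidate samples, exact
// edge-function tests against those sample positions decide the cull instead.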
//------------------------------------------------------------------------
__device__ __inline__ void setupTriangle(
const CRParams& p,
CRTriangleHeader* th, CRTriangleData* td, int triId,
float v0z, float v1z, float v2z,
int2 p0, int2 p1, int2 p2, float3 rcpW,
int2 d1, int2 d2, S32 area)
{
// Swap vertices 1 and 2 if area is negative. Only executed if backface culling is
// disabled (if it is enabled, we never come here with area < 0).
if (area < 0)
{
swap(d1, d2);
swap(p1, p2);
swap(v1z, v2z);
swap(rcpW.y, rcpW.z);
area = -area;
}
int2 wv0;
wv0.x = p0.x + (p.widthPixels << (CR_SUBPIXEL_LOG2 - 1));
wv0.y = p0.y + (p.heightPixels << (CR_SUBPIXEL_LOG2 - 1));
// Setup depth plane equation.
F32 zcoef = (F32)(CR_DEPTH_MAX - CR_DEPTH_MIN) * 0.5f;
F32 zbias = (F32)(CR_DEPTH_MAX + CR_DEPTH_MIN) * 0.5f;
float3 zvert = make_float3(
(v0z * zcoef) * rcpW.x + zbias,
(v1z * zcoef) * rcpW.y + zbias,
(v2z * zcoef) * rcpW.z + zbias
);
int2 zv0 = make_int2(
wv0.x - (1 << (CR_SUBPIXEL_LOG2 - 1)),
wv0.y - (1 << (CR_SUBPIXEL_LOG2 - 1))
);
uint3 zpleq = setupPleq(zvert, zv0, d1, d2, 1.0f / (F32)area);
U32 zmin = f32_to_u32_sat(fminf(fminf(zvert.x, zvert.y), zvert.z) - (F32)CR_LERP_ERROR(0));
// Write CRTriangleData.
*(uint4*)td = make_uint4(zpleq.x, zpleq.y, zpleq.z, triId);
// Determine flipbits.
U32 f01 = cover8x8_selectFlips(d1.x, d1.y);
U32 f12 = cover8x8_selectFlips(d2.x - d1.x, d2.y - d1.y);
U32 f20 = cover8x8_selectFlips(-d2.x, -d2.y);
// Write CRTriangleHeader.
*(uint4*)th = make_uint4(
prmt(p0.x, p0.y, 0x5410),
prmt(p1.x, p1.y, 0x5410),
prmt(p2.x, p2.y, 0x5410),
(zmin & 0xfffff000u) | (f01 << 6) | (f12 << 2) | (f20 >> 2));
}
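//------------------------------------------------------------------------
// Note (editorial, hedged): setupPleq() (defined elsewhere in this codebase)
// is assumed to build a screen-space plane equation z(x,y) = A*x + B*y + C
// from the per-vertex depths in 'zvert', which are z/w values remapped into
// the fixed-point range [CR_DEPTH_MIN, CR_DEPTH_MAX]. 'zmin' stores a
// conservative minimum depth in the triangle header, presumably for early
// depth rejection in the later raster stages.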
//------------------------------------------------------------------------
__device__ __inline__ void triangleSetupImpl(const CRParams p)
{
__shared__ F32 s_bary[CR_SETUP_WARPS * 32][18];
F32* bary = s_bary[threadIdx.x + threadIdx.y * 32];
// Compute task and image indices.
int taskIdx = threadIdx.x + 32 * (threadIdx.y + CR_SETUP_WARPS * blockIdx.x);
int imageIdx = 0;
if (p.instanceMode)
{
imageIdx = blockIdx.z;
if (taskIdx >= p.numTriangles)
return;
}
else
{
while (imageIdx < p.numImages)
{
int count = getImageParams(p, imageIdx).triCount;
if (taskIdx < count)
break;
taskIdx -= count;
imageIdx += 1;
}
if (imageIdx == p.numImages)
return;
}
// Per-image data structures.
const CRImageParams& ip = getImageParams(p, imageIdx);
CRAtomics& atomics = p.atomics[imageIdx];
const int* indexBuffer = (const int*)p.indexBuffer;
U8* triSubtris = (U8*)p.triSubtris + imageIdx * p.maxSubtris;
CRTriangleHeader* triHeader = (CRTriangleHeader*)p.triHeader + imageIdx * p.maxSubtris;
CRTriangleData* triData = (CRTriangleData*)p.triData + imageIdx * p.maxSubtris;
// Determine triangle index.
int triIdx = taskIdx;
if (!p.instanceMode)
triIdx += ip.triOffset;
// Read vertex indices.
if ((U32)triIdx >= (U32)p.numTriangles)
{
// Bad triangle index.
triSubtris[taskIdx] = 0;
return;
}
uint4 vidx;
vidx.x = indexBuffer[triIdx * 3 + 0];
vidx.y = indexBuffer[triIdx * 3 + 1];
vidx.z = indexBuffer[triIdx * 3 + 2];
vidx.w = triIdx + 1; // Triangle index.
if (vidx.x >= (U32)p.numVertices ||
vidx.y >= (U32)p.numVertices ||
vidx.z >= (U32)p.numVertices)
{
// Bad vertex index.
triSubtris[taskIdx] = 0;
return;
}
// Read vertex positions.
const float4* vertexBuffer = (const float4*)p.vertexBuffer;
if (p.instanceMode)
vertexBuffer += p.numVertices * imageIdx; // Instance offset.
float4 v0 = vertexBuffer[vidx.x];
float4 v1 = vertexBuffer[vidx.y];
float4 v2 = vertexBuffer[vidx.z];
// Outside view frustum => cull.
if (v0.w < fabsf(v0.x) | v0.w < fabsf(v0.y) | v0.w < fabsf(v0.z))
{
if ((v0.w < +v0.x & v1.w < +v1.x & v2.w < +v2.x) |
(v0.w < -v0.x & v1.w < -v1.x & v2.w < -v2.x) |
(v0.w < +v0.y & v1.w < +v1.y & v2.w < +v2.y) |
(v0.w < -v0.y & v1.w < -v1.y & v2.w < -v2.y) |
(v0.w < +v0.z & v1.w < +v1.z & v2.w < +v2.z) |
(v0.w < -v0.z & v1.w < -v1.z & v2.w < -v2.z))
{
triSubtris[taskIdx] = 0;
return;
}
}
// Inside depth range => try to snap vertices.
if (v0.w >= fabsf(v0.z) & v1.w >= fabsf(v1.z) & v2.w >= fabsf(v2.z))
{
// Inside S16 range and small enough => fast path.
// Note: aabbLimit comes from the fact that cover8x8
// does not support guardband with maximal viewport.
int2 p0, p1, p2, lo, hi;
float3 rcpW;
snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
S32 loxy = ::min(lo.x, lo.y);
S32 hixy = ::max(hi.x, hi.y);
S32 aabbLimit = (1 << (CR_MAXVIEWPORT_LOG2 + CR_SUBPIXEL_LOG2)) - 1;
if (loxy >= -32768 && hixy <= 32767 && hixy - loxy <= aabbLimit)
{
int2 d1, d2;
S32 area;
bool res = prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area);
triSubtris[taskIdx] = res ? 1 : 0;
if (res)
setupTriangle(
p,
&triHeader[taskIdx], &triData[taskIdx], vidx.w,
v0.z, v1.z, v2.z,
p0, p1, p2, rcpW,
d1, d2, area);
return;
}
}
// Clip to view frustum.
float4 ov0 = v0;
float4 od1 = make_float4(v1.x - v0.x, v1.y - v0.y, v1.z - v0.z, v1.w - v0.w);
float4 od2 = make_float4(v2.x - v0.x, v2.y - v0.y, v2.z - v0.z, v2.w - v0.w);
int numVerts = clipTriangleWithFrustum(bary, &ov0.x, &v1.x, &v2.x, &od1.x, &od2.x);
// Count non-culled subtriangles.
v0.x = ov0.x + od1.x * bary[0] + od2.x * bary[1];
v0.y = ov0.y + od1.y * bary[0] + od2.y * bary[1];
v0.z = ov0.z + od1.z * bary[0] + od2.z * bary[1];
v0.w = ov0.w + od1.w * bary[0] + od2.w * bary[1];
v1.x = ov0.x + od1.x * bary[2] + od2.x * bary[3];
v1.y = ov0.y + od1.y * bary[2] + od2.y * bary[3];
v1.z = ov0.z + od1.z * bary[2] + od2.z * bary[3];
v1.w = ov0.w + od1.w * bary[2] + od2.w * bary[3];
float4 tv1 = v1;
int numSubtris = 0;
for (int i = 2; i < numVerts; i++)
{
v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];
int2 p0, p1, p2, lo, hi, d1, d2;
float3 rcpW;
S32 area;
snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
numSubtris++;
v1 = v2;
}
triSubtris[taskIdx] = numSubtris;
// Multiple subtriangles => allocate.
int subtriBase = taskIdx;
if (numSubtris > 1)
{
subtriBase = atomicAdd(&atomics.numSubtris, numSubtris);
triHeader[taskIdx].misc = subtriBase;
if (subtriBase + numSubtris > p.maxSubtris)
numVerts = 0;
}
// Setup subtriangles.
v1 = tv1;
for (int i = 2; i < numVerts; i++)
{
v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];
int2 p0, p1, p2, lo, hi, d1, d2;
float3 rcpW;
S32 area;
snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
{
setupTriangle(
p,
&triHeader[subtriBase], &triData[subtriBase], vidx.w,
v0.z, v1.z, v2.z,
p0, p1, p2, rcpW,
d1, d2, area);
subtriBase++;
}
v1 = v2;
}
}
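//------------------------------------------------------------------------
// Note (editorial): each setup thread handles one input triangle. Triangles
// that pass the quick frustum test and fit the fixed-point coordinate range
// take the fast path and write exactly one subtriangle in place at 'taskIdx'.
// Otherwise the triangle is clipped against the frustum into a fan; if more
// than one clipped piece survives, a contiguous block of output slots is
// claimed with atomicAdd() and its base offset is stored in the header's
// 'misc' field so the later stages can locate the extra subtriangles. If the
// allocation would exceed p.maxSubtris, numVerts is zeroed so nothing out of
// range is written.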
//------------------------------------------------------------------------
......@@ -9,6 +9,106 @@
#include "common.h"
#include "rasterize.h"
//------------------------------------------------------------------------
// Cuda forward rasterizer pixel shader kernel.
__global__ void RasterizeCudaFwdShaderKernel(const RasterizeCudaFwdShaderParams p)
{
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.width || py >= p.height || pz >= p.depth)
return;
// Pixel index.
int pidx = px + p.width * (py + p.height * pz);
// Fetch triangle idx.
int triIdx = p.in_idx[pidx] - 1;
if (triIdx < 0 || triIdx >= p.numTriangles)
{
// No or corrupt triangle.
((float4*)p.out)[pidx] = make_float4(0.0, 0.0, 0.0, 0.0); // Clear out.
((float4*)p.out_db)[pidx] = make_float4(0.0, 0.0, 0.0, 0.0); // Clear out_db.
return;
}
// Fetch vertex indices.
int vi0 = p.tri[triIdx * 3 + 0];
int vi1 = p.tri[triIdx * 3 + 1];
int vi2 = p.tri[triIdx * 3 + 2];
// Bail out if vertex indices are corrupt.
if (vi0 < 0 || vi0 >= p.numVertices ||
vi1 < 0 || vi1 >= p.numVertices ||
vi2 < 0 || vi2 >= p.numVertices)
return;
// In instance mode, adjust vertex indices by minibatch index.
if (p.instance_mode)
{
vi0 += pz * p.numVertices;
vi1 += pz * p.numVertices;
vi2 += pz * p.numVertices;
}
// Fetch vertex positions.
float4 p0 = ((float4*)p.pos)[vi0];
float4 p1 = ((float4*)p.pos)[vi1];
float4 p2 = ((float4*)p.pos)[vi2];
// Evaluate edge functions.
float fx = p.xs * (float)px + p.xo;
float fy = p.ys * (float)py + p.yo;
float p0x = p0.x - fx * p0.w;
float p0y = p0.y - fy * p0.w;
float p1x = p1.x - fx * p1.w;
float p1y = p1.y - fy * p1.w;
float p2x = p2.x - fx * p2.w;
float p2y = p2.y - fy * p2.w;
float a0 = p1x*p2y - p1y*p2x;
float a1 = p2x*p0y - p2y*p0x;
float a2 = p0x*p1y - p0y*p1x;
// Perspective correct, normalized barycentrics.
float iw = 1.f / (a0 + a1 + a2);
float b0 = a0 * iw;
float b1 = a1 * iw;
// Compute z/w for depth buffer.
float z = p0.z * a0 + p1.z * a1 + p2.z * a2;
float w = p0.w * a0 + p1.w * a1 + p2.w * a2;
float zw = z / w;
// Clamps to avoid NaNs.
b0 = __saturatef(b0); // Clamp to [+0.0, 1.0].
b1 = __saturatef(b1); // Clamp to [+0.0, 1.0].
zw = fmaxf(fminf(zw, 1.f), -1.f);
// Emit output.
((float4*)p.out)[pidx] = make_float4(b0, b1, zw, (float)(triIdx + 1));
// Calculate bary pixel differentials.
float dfxdx = p.xs * iw;
float dfydy = p.ys * iw;
float da0dx = p2.y*p1.w - p1.y*p2.w;
float da0dy = p1.x*p2.w - p2.x*p1.w;
float da1dx = p0.y*p2.w - p2.y*p0.w;
float da1dy = p2.x*p0.w - p0.x*p2.w;
float da2dx = p1.y*p0.w - p0.y*p1.w;
float da2dy = p0.x*p1.w - p1.x*p0.w;
float datdx = da0dx + da1dx + da2dx;
float datdy = da0dy + da1dy + da2dy;
float dudx = dfxdx * (b0 * datdx - da0dx);
float dudy = dfydy * (b0 * datdy - da0dy);
float dvdx = dfxdx * (b1 * datdx - da1dx);
float dvdy = dfydy * (b1 * datdy - da1dy);
// Emit bary pixel differentials.
((float4*)p.out_db)[pidx] = make_float4(dudx, dudy, dvdx, dvdy);
}
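//------------------------------------------------------------------------
// Hypothetical launch helper (editorial sketch, not nvdiffrast's actual
// dispatch code, which is not shown in this diff): the kernel is gridded over
// the image in blocks of RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH x
// RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT pixels, with one grid z-layer per
// minibatch image. The parameter struct is assumed to be fully populated by
// the caller.
static void exampleLaunchRasterizeCudaFwdShader(const RasterizeCudaFwdShaderParams& p, cudaStream_t stream)
{
    dim3 block(RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH, RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT, 1);
    dim3 grid((p.width  + block.x - 1) / block.x,
              (p.height + block.y - 1) / block.y,
              p.depth);
    RasterizeCudaFwdShaderKernel<<<grid, block, 0, stream>>>(p);
}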
//------------------------------------------------------------------------
// Gradient Cuda kernel.
......@@ -16,7 +116,7 @@ template <bool ENABLE_DB>
static __forceinline__ __device__ void RasterizeGradKernelTemplate(const RasterizeGradParams p)
{
// Temporary space for coalesced atomics.
    CA_DECLARE_TEMP(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH * RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -64,7 +164,7 @@ static __forceinline__ __device__ void RasterizeGradKernelTemplate(const Rasteri
// Initialize coalesced atomics.
CA_SET_GROUP(triIdx);
// Fetch vertex positions.
float4 p0 = ((float4*)p.pos)[vi0];
float4 p1 = ((float4*)p.pos)[vi1];
......
......@@ -11,9 +11,30 @@
//------------------------------------------------------------------------
// Constants and helpers.
#define RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH 8
#define RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT 8
#define RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
//------------------------------------------------------------------------
// CUDA forward rasterizer shader kernel params.
struct RasterizeCudaFwdShaderParams
{
const float* pos; // Vertex positions.
const int* tri; // Triangle indices.
const int* in_idx; // Triangle idx buffer from rasterizer.
float* out; // Main output buffer.
float* out_db; // Bary pixel gradient output buffer.
int numTriangles; // Number of triangles.
int numVertices; // Number of vertices.
int width; // Image width.
int height; // Image height.
int depth; // Size of minibatch.
int instance_mode; // 1 if in instance rendering mode.
float xs, xo, ys, yo; // Pixel position to clip-space x, y transform.
};
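//------------------------------------------------------------------------
// Illustrative only (editorial sketch): one consistent way to fill the
// pixel-to-clip transform so that fx = xs*px + xo evaluates to the NDC
// x-coordinate of the pixel center; the values nvdiffrast actually uses are
// set by host code not shown in this diff.
static inline void exampleFillPixelTransform(RasterizeCudaFwdShaderParams& p)
{
    p.xs = 2.f / (float)p.width;    p.xo = 1.f / (float)p.width  - 1.f;  // (px + 0.5) / width  * 2 - 1
    p.ys = 2.f / (float)p.height;   p.yo = 1.f / (float)p.height - 1.f;  // (py + 0.5) / height * 2 - 1
}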
//------------------------------------------------------------------------
// Gradient CUDA kernel params.
......@@ -35,52 +56,3 @@ struct RasterizeGradParams
};
//------------------------------------------------------------------------
// Do not try to include OpenGL stuff when compiling CUDA kernels for torch.
#if !(defined(NVDR_TORCH) && defined(__CUDACC__))
#include "framework.h"
#include "glutil.h"
//------------------------------------------------------------------------
// OpenGL-related persistent state for forward op.
struct RasterizeGLState // Must be initializable by memset to zero.
{
int width; // Allocated frame buffer width.
int height; // Allocated frame buffer height.
int depth; // Allocated frame buffer depth.
int posCount; // Allocated position buffer in floats.
int triCount; // Allocated triangle buffer in ints.
GLContext glctx;
GLuint glFBO;
GLuint glColorBuffer[2];
GLuint glPrevOutBuffer;
GLuint glDepthStencilBuffer;
GLuint glVAO;
GLuint glTriBuffer;
GLuint glPosBuffer;
GLuint glProgram;
GLuint glProgramDP;
GLuint glVertexShader;
GLuint glGeometryShader;
GLuint glFragmentShader;
GLuint glFragmentShaderDP;
cudaGraphicsResource_t cudaColorBuffer[2];
cudaGraphicsResource_t cudaPrevOutBuffer;
cudaGraphicsResource_t cudaPosBuffer;
cudaGraphicsResource_t cudaTriBuffer;
int enableDB;
int enableZModify; // Modify depth in shader, workaround for a rasterization issue on A100.
};
//------------------------------------------------------------------------
// Shared C++ code prototypes.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);
bool rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth);
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx);
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s);
//------------------------------------------------------------------------
#endif // !(defined(NVDR_TORCH) && defined(__CUDACC__))
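//------------------------------------------------------------------------
// Note (editorial, hedged): the prototypes above presumably follow the life
// cycle their names suggest: create the GL context once (rasterizeInitGLContext),
// grow buffers as needed before each draw (rasterizeResizeBuffers), render
// (rasterizeRender), copy the results into CUDA memory (rasterizeCopyResults),
// and release the interop buffers on shutdown (rasterizeReleaseBuffers).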