Commit 1f95925c authored by Samuli Laine

Initial commit

// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "rasterize.h"
#include "glutil.inl"
#include <vector>
#define STRINGIFY_SHADER_SOURCE(x) #x
//------------------------------------------------------------------------
// Helpers.
#define ROUND_UP(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
static int ROUND_UP_BITS(uint32_t x, uint32_t y)
{
// Round x up so that it has at most y bits of mantissa.
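// For example, with y = 2 the values 65..96 round up to 96 (0b1100000) and 97..128 round
// up to 128, i.e., allocation sizes are kept to numbers with at most two significant bits.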
if (x < (1u << y))
return x;
uint32_t m = 0;
while (x & ~m)
m = (m << 1) | 1u;
m >>= y;
if (!(x & m))
return x;
return (x | m) + 1u;
}
//------------------------------------------------------------------------
// GL helpers.
static void compileGLShader(NVDR_CTX_ARGS, GLuint* pShader, GLenum shaderType, const char* src)
{
const char* srcPtr = src;
GLint srcLength = (GLint)strlen(src);
*pShader = 0;
NVDR_CHECK_GL_ERROR(*pShader = glCreateShader(shaderType));
NVDR_CHECK_GL_ERROR(glShaderSource(*pShader, 1, &srcPtr, &srcLength));
NVDR_CHECK_GL_ERROR(glCompileShader(*pShader));
}
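// Note: compile status is not checked here; a failed shader compile typically surfaces
// at link time in constructGLProgram() below, via the program info log.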
static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexShader, GLuint glGeometryShader, GLuint glFragmentShader)
{
*pProgram = 0;
GLuint glProgram = 0;
NVDR_CHECK_GL_ERROR(glProgram = glCreateProgram());
NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glVertexShader));
NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glGeometryShader));
NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glFragmentShader));
NVDR_CHECK_GL_ERROR(glLinkProgram(glProgram));
GLint linkStatus = 0;
NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_LINK_STATUS, &linkStatus));
if (!linkStatus)
{
GLint infoLen = 0;
NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_INFO_LOG_LENGTH, &infoLen));
if (infoLen)
{
const char* hdr = "glLinkProgram() failed:\n";
std::vector<char> info(strlen(hdr) + infoLen);
strcpy(&info[0], hdr);
NVDR_CHECK_GL_ERROR(glGetProgramInfoLog(glProgram, infoLen, &infoLen, &info[strlen(hdr)]));
NVDR_CHECK(0, &info[0]);
}
NVDR_CHECK(0, "glLinkProgram() failed");
}
*pProgram = glProgram;
}
//------------------------------------------------------------------------
// Shared C++ functions.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s)
{
// Create GL context and set it current.
s.glctx = createGLContext();
setGLContext(s.glctx);
// Version check.
GLint vMajor = 0;
GLint vMinor = 0;
glGetIntegerv(GL_MAJOR_VERSION, &vMajor);
glGetIntegerv(GL_MINOR_VERSION, &vMinor);
glGetError(); // Clear possible GL_INVALID_ENUM error in version query.
LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor;
NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required");
// Number of output buffers.
int num_outputs = s.enableDB ? 2 : 1;
// Set up vertex shader.
compileGLShader(NVDR_CTX_PARAMS, &s.glVertexShader, GL_VERTEX_SHADER,
"#version 330\n"
"#extension GL_ARB_shader_draw_parameters : enable\n"
STRINGIFY_SHADER_SOURCE(
layout(location = 0) in vec4 in_pos;
out int v_layer;
out int v_offset;
void main()
{
int layer = gl_DrawIDARB;
gl_Position = in_pos;
v_layer = layer;
v_offset = gl_BaseInstanceARB; // Sneak in TriID offset here.
}
)
);
// Geometry and fragment shaders depend on whether bary differential output is enabled.
if (s.enableDB)
{
// Set up geometry shader. Calculation of per-pixel bary differentials is based on:
// u = (u/w) / (1/w)
// --> du/dX = d((u/w) / (1/w))/dX
// --> du/dX = [d(u/w)/dX - u*d(1/w)/dX] * w
// and we know both d(u/w)/dX and d(1/w)/dX are constant over the triangle.
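// In more detail, with a = u/w and b = 1/w the quotient rule gives
//   du/dX = d(a/b)/dX = [b*da/dX - a*db/dX] / b^2 = [d(u/w)/dX - u*d(1/w)/dX] * w,
// so the per-pixel differentials follow from the (constant) plane equation coefficients.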
compileGLShader(NVDR_CTX_PARAMS, &s.glGeometryShader, GL_GEOMETRY_SHADER,
"#version 430\n"
STRINGIFY_SHADER_SOURCE(
layout(triangles) in;
layout(triangle_strip, max_vertices=3) out;
layout(location = 0) uniform vec2 vp_scale;
in int v_layer[];
in int v_offset[];
out vec4 var_uvzw;
out vec4 var_db;
void main()
{
// Plane equations for bary differentials.
float w0 = gl_in[0].gl_Position.w;
float w1 = gl_in[1].gl_Position.w;
float w2 = gl_in[2].gl_Position.w;
vec2 p0 = gl_in[0].gl_Position.xy;
vec2 p1 = gl_in[1].gl_Position.xy;
vec2 p2 = gl_in[2].gl_Position.xy;
vec2 e0 = p0*w2 - p2*w0;
vec2 e1 = p1*w2 - p2*w1;
float a = e0.x*e1.y - e0.y*e1.x;
// Clamp area to an epsilon to avoid arbitrarily high bary differentials.
float eps = 1e-6f; // ~1 pixel in 1k x 1k image.
float ca = (abs(a) >= eps) ? a : (a < 0.f) ? -eps : eps; // Clamp with sign.
float ia = 1.f / ca; // Inverse area.
vec2 ascl = ia * vp_scale;
float dudx = e1.y * ascl.x;
float dudy = -e1.x * ascl.y;
float dvdx = -e0.y * ascl.x;
float dvdy = e0.x * ascl.y;
float duwdx = w2 * dudx;
float dvwdx = w2 * dvdx;
float duvdx = w0 * dudx + w1 * dvdx;
float duwdy = w2 * dudy;
float dvwdy = w2 * dvdy;
float duvdy = w0 * dudy + w1 * dvdy;
vec4 db0 = vec4(duvdx - dvwdx, duvdy - dvwdy, dvwdx, dvwdy);
vec4 db1 = vec4(duwdx, duwdy, duvdx - duwdx, duvdy - duwdy);
vec4 db2 = vec4(duwdx, duwdy, dvwdx, dvwdy);
int layer_id = v_layer[0];
int prim_id = gl_PrimitiveIDIn + v_offset[0];
// Flip z before hw depth test because depth is cleared to zero.
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, -gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex();
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, -gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex();
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, -gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex();
}
)
);
// Set up fragment shader.
compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShader, GL_FRAGMENT_SHADER,
"#version 330\n"
STRINGIFY_SHADER_SOURCE(
in vec4 var_uvzw;
in vec4 var_db;
in int gl_PrimitiveID;
layout(location = 0) out vec4 out_raster;
layout(location = 1) out vec4 out_db;
void main()
{
out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, float(gl_PrimitiveID + 1));
out_db = var_db * var_uvzw.w;
}
)
);
}
else
{
// Geometry shader without bary differential output.
compileGLShader(NVDR_CTX_PARAMS, &s.glGeometryShader, GL_GEOMETRY_SHADER,
"#version 330\n"
STRINGIFY_SHADER_SOURCE(
layout(triangles) in;
layout(triangle_strip, max_vertices=3) out;
in int v_layer[];
in int v_offset[];
out vec4 var_uvzw;
void main()
{
int layer_id = v_layer[0];
int prim_id = gl_PrimitiveIDIn + v_offset[0];
// Flip z before hw depth test because depth is cleared to zero.
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, -gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex();
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, -gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex();
gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, -gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex();
}
)
);
// Fragment shader without bary differential output.
compileGLShader(NVDR_CTX_PARAMS, &s.glFragmentShader, GL_FRAGMENT_SHADER,
"#version 330\n"
STRINGIFY_SHADER_SOURCE(
in vec4 var_uvzw;
in int gl_PrimitiveID;
layout(location = 0) out vec4 out_raster;
void main()
{
out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, float(gl_PrimitiveID + 1));
}
)
);
}
// Finalize program.
constructGLProgram(NVDR_CTX_PARAMS, &s.glProgram, s.glVertexShader, s.glGeometryShader, s.glFragmentShader);
// Construct main fbo and bind permanently.
NVDR_CHECK_GL_ERROR(glGenFramebuffers(1, &s.glFBO));
NVDR_CHECK_GL_ERROR(glBindFramebuffer(GL_FRAMEBUFFER, s.glFBO));
// Enable color attachments: one, or two when bary differentials are output.
GLenum draw_buffers[2] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 };
NVDR_CHECK_GL_ERROR(glDrawBuffers(num_outputs, draw_buffers));
// Construct vertex array object.
NVDR_CHECK_GL_ERROR(glGenVertexArrays(1, &s.glVAO));
NVDR_CHECK_GL_ERROR(glBindVertexArray(s.glVAO));
// Construct position buffer, bind permanently, enable, set ptr.
NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glPosBuffer));
NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ARRAY_BUFFER, s.glPosBuffer));
NVDR_CHECK_GL_ERROR(glEnableVertexAttribArray(0));
NVDR_CHECK_GL_ERROR(glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, 0));
// Construct index buffer and bind permanently.
NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glTriBuffer));
NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, s.glTriBuffer));
// Bind color outputs and activate program.
NVDR_CHECK_GL_ERROR(glBindFragDataLocation(s.glProgram, 0, "out_raster"));
NVDR_CHECK_GL_ERROR(glBindFragDataLocation(s.glProgram, 1, "out_db"));
NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram));
// Set up rendering mode. Inverted depth so that all buffers can be cleared to zero.
NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST));
NVDR_CHECK_GL_ERROR(glDepthFunc(GL_GEQUAL));
// Create and bind output buffers. Storage is allocated later.
NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer));
for (int i=0; i < num_outputs; i++)
{
NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, s.glColorBuffer[i], 0));
}
// Create and bind depth/stencil buffer. Storage is allocated later.
NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glDepthStencilBuffer));
NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, s.glDepthStencilBuffer, 0));
}
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth)
{
// Resize vertex buffer?
if (posCount > s.posCount)
{
if (s.cudaPosBuffer)
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer));
s.posCount = (posCount > 64) ? ROUND_UP_BITS(posCount, 2) : 64;
LOG(INFO) << "Increasing position buffer size to " << s.posCount << " float32";
NVDR_CHECK_GL_ERROR(glBufferData(GL_ARRAY_BUFFER, s.posCount * sizeof(float), NULL, GL_DYNAMIC_DRAW));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaPosBuffer, s.glPosBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
}
// Resize triangle buffer?
if (triCount > s.triCount)
{
if (s.cudaTriBuffer)
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer));
s.triCount = (triCount > 64) ? ROUND_UP_BITS(triCount, 2) : 64;
LOG(INFO) << "Increasing triangle buffer size to " << s.triCount << " int32";
NVDR_CHECK_GL_ERROR(glBufferData(GL_ELEMENT_ARRAY_BUFFER, s.triCount * sizeof(int32_t), NULL, GL_DYNAMIC_DRAW));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaTriBuffer, s.glTriBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
}
// Resize framebuffer?
if (width > s.width || height > s.height || depth > s.depth)
{
int num_outputs = s.enableDB ? 2 : 1;
if (s.cudaColorBuffer[0])
for (int i=0; i < num_outputs; i++)
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i]));
// New framebuffer size.
s.width = (width > s.width) ? width : s.width;
s.height = (height > s.height) ? height : s.height;
s.depth = (depth > s.depth) ? depth : s.depth;
s.width = ROUND_UP(s.width, 32);
s.height = ROUND_UP(s.height, 32);
LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")";
// Allocate color buffers.
for (int i=0; i < num_outputs; i++)
{
NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0));
}
// Allocate depth/stencil buffer.
NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_DEPTH24_STENCIL8, s.width, s.height, s.depth, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0));
// (Re-)register all GL buffers into Cuda.
for (int i=0; i < num_outputs; i++)
NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaColorBuffer[i], s.glColorBuffer[i], GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly));
}
// Resize range arrays?
if ((unsigned int)depth > s.drawCmdBuffer.size())
{
int newSize = (depth > 64) ? ROUND_UP_BITS(depth, 1) : 64;
LOG(INFO) << "Increasing range array size to " << newSize << " elements";
s.drawCmdBuffer.resize(newSize);
}
}
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth)
{
if (triPtr)
{
// Copy both position and triangle buffers.
void* glPosPtr = NULL;
void* glTriPtr = NULL;
size_t posBytes = 0;
size_t triBytes = 0;
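// Note: mapping two resources starting from &s.cudaPosBuffer relies on cudaPosBuffer
// and cudaTriBuffer being declared consecutively in the RasterizeGLState struct.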
NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(2, &s.cudaPosBuffer, stream));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glTriPtr, &triBytes, s.cudaTriBuffer));
NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
NVDR_CHECK(triBytes >= triCount * sizeof(int32_t), "mapped GL triangle buffer size mismatch");
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glTriPtr, triPtr, triCount * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(2, &s.cudaPosBuffer, stream));
}
else
{
// Copy position buffer only. Triangles are already copied and known to be constant.
void* glPosPtr = NULL;
size_t posBytes = 0;
NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(1, &s.cudaPosBuffer, stream));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream));
}
// Set viewport, clear color and depth/stencil buffers.
NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height));
NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glDepthStencilBuffer, 0, 0, 0, 0, width, height, depth, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0));
NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glColorBuffer[0], 0, 0, 0, 0, width, height, depth, GL_RGBA, GL_FLOAT, 0));
// If outputting bary differentials, clear second output buffer and set resolution uniform
if (s.enableDB)
{
NVDR_CHECK_GL_ERROR(glClearTexSubImage(s.glColorBuffer[1], 0, 0, 0, 0, width, height, depth, GL_RGBA, GL_FLOAT, 0));
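// Location 0 is the vp_scale uniform of the geometry shader; the values 2/width and
// 2/height convert NDC extents to per-pixel scale for the bary differentials.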
NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height));
}
// Render the meshes.
if (depth == 1 && !rangesPtr)
{
// Trivial case.
NVDR_CHECK_GL_ERROR(glDrawElements(GL_TRIANGLES, triCount, GL_UNSIGNED_INT, 0));
}
else
{
if (!rangesPtr)
{
// Fill in the range array to instantiate the same triangles for each output layer.
// Triangle IDs start at zero for each layer (the shader outputs them offset by one),
// so they correspond to the first dimension when addressing the triangle array.
for (int i=0; i < depth; i++)
{
GLDrawCmd& cmd = s.drawCmdBuffer[i];
cmd.firstIndex = 0;
cmd.count = triCount;
cmd.baseVertex = vtxPerInstance * i;
cmd.baseInstance = 0;
cmd.instanceCount = 1;
}
}
else
{
// Fill in the range array according to the user-given ranges. Triangle IDs point
// into the input triangle array, NOT to the index within the range, so they
// correspond to the first dimension when addressing the triangle array.
for (int i=0, j=0; i < depth; i++)
{
GLDrawCmd& cmd = s.drawCmdBuffer[i];
int first = rangesPtr[j++];
int count = rangesPtr[j++];
NVDR_CHECK(first >= 0 && count >= 0, "range contains negative values");
NVDR_CHECK((first + count) * 3 <= triCount, "range extends beyond end of triangle buffer");
cmd.firstIndex = first * 3;
cmd.count = count * 3;
cmd.baseVertex = 0;
cmd.baseInstance = first;
cmd.instanceCount = 1;
}
}
// Draw!
NVDR_CHECK_GL_ERROR(glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &s.drawCmdBuffer[0], depth, sizeof(GLDrawCmd)));
}
}
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth)
{
// Copy color buffers to output tensors.
cudaArray_t array = 0;
cudaChannelFormatDesc arrayDesc = {}; // For error checking.
cudaExtent arrayExt = {}; // For error checking.
int num_outputs = s.enableDB ? 2 : 1;
NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(num_outputs, s.cudaColorBuffer, stream));
for (int i=0; i < num_outputs; i++)
{
NVDR_CHECK_CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, s.cudaColorBuffer[i], 0, 0));
NVDR_CHECK_CUDA_ERROR(cudaArrayGetInfo(&arrayDesc, &arrayExt, NULL, array));
NVDR_CHECK(arrayDesc.f == cudaChannelFormatKindFloat, "CUDA mapped array data kind mismatch");
NVDR_CHECK(arrayDesc.x == 32 && arrayDesc.y == 32 && arrayDesc.z == 32 && arrayDesc.w == 32, "CUDA mapped array data width mismatch");
NVDR_CHECK(arrayExt.width >= width && arrayExt.height >= height && arrayExt.depth >= depth, "CUDA mapped array extent mismatch");
cudaMemcpy3DParms p = {0};
p.srcArray = array;
p.dstPtr.ptr = outputPtr[i];
p.dstPtr.pitch = width * 4 * sizeof(float);
p.dstPtr.xsize = width;
p.dstPtr.ysize = height;
p.extent.width = width;
p.extent.height = height;
p.extent.depth = depth;
p.kind = cudaMemcpyDeviceToDevice;
NVDR_CHECK_CUDA_ERROR(cudaMemcpy3DAsync(&p, stream));
}
NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(num_outputs, s.cudaColorBuffer, stream));
}
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "common.h"
#include "rasterize.h"
//------------------------------------------------------------------------
// Gradient Cuda kernel.
template <bool ENABLE_DB>
static __forceinline__ __device__ void RasterizeGradKernelTemplate(const RasterizeGradParams p)
{
// Temporary space for coalesced atomics.
CA_DECLARE_TEMP(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH * RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.width || py >= p.height || pz >= p.depth)
return;
// Pixel index.
int pidx = px + p.width * (py + p.height * pz);
// Read triangle idx and dy.
float2 dy = ((float2*)p.dy)[pidx * 2];
float4 ddb = ENABLE_DB ? ((float4*)p.ddb)[pidx] : make_float4(0.f, 0.f, 0.f, 0.f);
int triIdx = (int)(((float*)p.out)[pidx * 4 + 3]) - 1;
// Exit if nothing to do.
if (triIdx < 0 || triIdx >= p.numTriangles)
return; // No or corrupt triangle.
int grad_all_dy = __float_as_int(dy.x) | __float_as_int(dy.y); // Bitwise OR of all incoming gradients.
int grad_all_ddb = 0;
if (ENABLE_DB)
grad_all_ddb = __float_as_int(ddb.x) | __float_as_int(ddb.y) | __float_as_int(ddb.z) | __float_as_int(ddb.w);
if (((grad_all_dy | grad_all_ddb) << 1) == 0)
return; // All incoming gradients are +0/-0.
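// (Shifting out the sign bit above makes -0.0f compare equal to +0.0f.)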
// Fetch vertex indices.
int vi0 = p.tri[triIdx * 3 + 0];
int vi1 = p.tri[triIdx * 3 + 1];
int vi2 = p.tri[triIdx * 3 + 2];
// Bail out if vertex indices are corrupt.
if (vi0 < 0 || vi0 >= p.numVertices ||
vi1 < 0 || vi1 >= p.numVertices ||
vi2 < 0 || vi2 >= p.numVertices)
return;
// In instance mode, adjust vertex indices by minibatch index.
if (p.instance_mode)
{
vi0 += pz * p.numVertices;
vi1 += pz * p.numVertices;
vi2 += pz * p.numVertices;
}
// Initialize coalesced atomics.
CA_SET_GROUP(triIdx);
// Fetch vertex positions.
float4 p0 = ((float4*)p.pos)[vi0];
float4 p1 = ((float4*)p.pos)[vi1];
float4 p2 = ((float4*)p.pos)[vi2];
// Evaluate edge functions.
float fx = p.xs * (float)px + p.xo;
float fy = p.ys * (float)py + p.yo;
float p0x = p0.x - fx * p0.w;
float p0y = p0.y - fy * p0.w;
float p1x = p1.x - fx * p1.w;
float p1y = p1.y - fy * p1.w;
float p2x = p2.x - fx * p2.w;
float p2y = p2.y - fy * p2.w;
float a0 = p1x*p2y - p1y*p2x;
float a1 = p2x*p0y - p2y*p0x;
float a2 = p0x*p1y - p0y*p1x;
// Compute inverse area with epsilon.
float at = a0 + a1 + a2;
float ep = copysignf(1e-6f, at); // ~1 pixel in 1k x 1k image.
float iw = 1.f / (at + ep);
// Perspective correct, normalized barycentrics.
float b0 = a0 * iw;
float b1 = a1 * iw;
// Position gradients.
float gb0 = dy.x * iw;
float gb1 = dy.y * iw;
float gbb = gb0 * b0 + gb1 * b1;
float gp0x = gbb * (p2y - p1y) - gb1 * p2y;
float gp1x = gbb * (p0y - p2y) + gb0 * p2y;
float gp2x = gbb * (p1y - p0y) - gb0 * p1y + gb1 * p0y;
float gp0y = gbb * (p1x - p2x) + gb1 * p2x;
float gp1y = gbb * (p2x - p0x) - gb0 * p2x;
float gp2y = gbb * (p0x - p1x) + gb0 * p1x - gb1 * p0x;
float gp0w = -fx * gp0x - fy * gp0y;
float gp1w = -fx * gp1x - fy * gp1y;
float gp2w = -fx * gp2x - fy * gp2y;
// Bary differential gradients.
if (ENABLE_DB && ((grad_all_ddb) << 1) != 0)
{
float dfxdX = p.xs * iw;
float dfydY = p.ys * iw;
ddb.x *= dfxdX;
ddb.y *= dfydY;
ddb.z *= dfxdX;
ddb.w *= dfydY;
float da0dX = p1.y * p2.w - p2.y * p1.w;
float da1dX = p2.y * p0.w - p0.y * p2.w;
float da2dX = p0.y * p1.w - p1.y * p0.w;
float da0dY = p2.x * p1.w - p1.x * p2.w;
float da1dY = p0.x * p2.w - p2.x * p0.w;
float da2dY = p1.x * p0.w - p0.x * p1.w;
float datdX = da0dX + da1dX + da2dX;
float datdY = da0dY + da1dY + da2dY;
float x01 = p0.x - p1.x;
float x12 = p1.x - p2.x;
float x20 = p2.x - p0.x;
float y01 = p0.y - p1.y;
float y12 = p1.y - p2.y;
float y20 = p2.y - p0.y;
float w01 = p0.w - p1.w;
float w12 = p1.w - p2.w;
float w20 = p2.w - p0.w;
float a0p1 = fy * p2.x - fx * p2.y;
float a0p2 = fx * p1.y - fy * p1.x;
float a1p0 = fx * p2.y - fy * p2.x;
float a1p2 = fy * p0.x - fx * p0.y;
float wdudX = 2.f * b0 * datdX - da0dX;
float wdudY = 2.f * b0 * datdY - da0dY;
float wdvdX = 2.f * b1 * datdX - da1dX;
float wdvdY = 2.f * b1 * datdY - da1dY;
float c0 = iw * (ddb.x * wdudX + ddb.y * wdudY + ddb.z * wdvdX + ddb.w * wdvdY);
float cx = c0 * fx - ddb.x * b0 - ddb.z * b1;
float cy = c0 * fy - ddb.y * b0 - ddb.w * b1;
float cxy = iw * (ddb.x * datdX + ddb.y * datdY);
float czw = iw * (ddb.z * datdX + ddb.w * datdY);
gp0x += c0 * y12 - cy * w12 + czw * p2y + ddb.w * p2.w;
gp1x += c0 * y20 - cy * w20 - cxy * p2y - ddb.y * p2.w;
gp2x += c0 * y01 - cy * w01 + cxy * p1y - czw * p0y + ddb.y * p1.w - ddb.w * p0.w;
gp0y += cx * w12 - c0 * x12 - czw * p2x - ddb.z * p2.w;
gp1y += cx * w20 - c0 * x20 + cxy * p2x + ddb.x * p2.w;
gp2y += cx * w01 - c0 * x01 - cxy * p1x + czw * p0x - ddb.x * p1.w + ddb.z * p0.w;
gp0w += cy * x12 - cx * y12 - czw * a1p0 + ddb.z * p2.y - ddb.w * p2.x;
gp1w += cy * x20 - cx * y20 - cxy * a0p1 - ddb.x * p2.y + ddb.y * p2.x;
gp2w += cy * x01 - cx * y01 - cxy * a0p2 - czw * a1p2 + ddb.x * p1.y - ddb.y * p1.x - ddb.z * p0.y + ddb.w * p0.x;
}
// Accumulate using coalesced atomics.
caAtomicAdd3_xyw(p.grad + 4 * vi0, gp0x, gp0y, gp0w);
caAtomicAdd3_xyw(p.grad + 4 * vi1, gp1x, gp1y, gp1w);
caAtomicAdd3_xyw(p.grad + 4 * vi2, gp2x, gp2y, gp2w);
}
// Template specializations.
__global__ void RasterizeGradKernel (const RasterizeGradParams p) { RasterizeGradKernelTemplate<false>(p); }
__global__ void RasterizeGradKernelDb(const RasterizeGradParams p) { RasterizeGradKernelTemplate<true>(p); }
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
// Constants and helpers.
#define RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
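// These cap the 2D thread block dimensions used when launching RasterizeGradKernel(Db)
// and size the per-block temporary storage for coalesced atomics (CA_DECLARE_TEMP);
// the launch configuration itself is set up by the framework-specific bindings, which
// are not part of this file.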
//------------------------------------------------------------------------
// Gradient CUDA kernel params.
struct RasterizeGradParams
{
const float* pos; // Incoming position buffer.
const int* tri; // Incoming triangle buffer.
const float* out; // Rasterizer output buffer.
const float* dy; // Incoming gradients of rasterizer output buffer.
const float* ddb; // Incoming gradients of bary diff output buffer.
float* grad; // Outgoing position gradients.
int numTriangles; // Number of triangles.
int numVertices; // Number of vertices.
int width; // Image width.
int height; // Image height.
int depth; // Size of minibatch.
int instance_mode; // 1 if in instance rendering mode.
float xs, xo, ys, yo; // Pixel position to clip-space x, y transform.
};
//------------------------------------------------------------------------
// The rest is for the C++ side only and is skipped when compiling CUDA kernels in torch builds.
#if !defined(NVDR_TORCH) || !defined(__CUDACC__)
#include "glutil.inl"
//------------------------------------------------------------------------
// Draw command struct used by rasterizer.
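// Field order matches OpenGL's DrawElementsIndirectCommand, so an array of these can be
// passed directly to glMultiDrawElementsIndirect().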
struct GLDrawCmd
{
uint32_t count;
uint32_t instanceCount;
uint32_t firstIndex;
uint32_t baseVertex;
uint32_t baseInstance;
};
//------------------------------------------------------------------------
// OpenGL-related persistent state for forward op.
struct RasterizeGLState
{
int width; // Allocated frame buffer width.
int height; // Allocated frame buffer height.
int depth; // Allocated frame buffer depth.
int posCount; // Allocated position buffer in floats.
int triCount; // Allocated triangle buffer in ints.
GLContext glctx;
GLuint glFBO;
GLuint glColorBuffer[2];
GLuint glDepthStencilBuffer;
GLuint glVAO;
GLuint glTriBuffer;
GLuint glPosBuffer;
GLuint glProgram;
GLuint glVertexShader;
GLuint glGeometryShader;
GLuint glFragmentShader;
cudaGraphicsResource_t cudaColorBuffer[2];
cudaGraphicsResource_t cudaPosBuffer;
cudaGraphicsResource_t cudaTriBuffer;
std::vector<GLDrawCmd> drawCmdBuffer;
int enableDB;
};
//------------------------------------------------------------------------
// Shared C++ code prototypes.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s);
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth);
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth);
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
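// Intended call order, roughly (a sketch; the framework bindings supply the actual
// context, stream, and pointers):
//   rasterizeInitGLContext(...)   once per GL state,
//   rasterizeResizeBuffers(...)   whenever inputs may exceed current allocations,
//   rasterizeRender(...)          to rasterize a minibatch,
//   rasterizeCopyResults(...)     to copy the color buffers into the output tensors.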
//------------------------------------------------------------------------
#endif // !defined(NVDR_TORCH) || !defined(__CUDACC__)
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "framework.h"
#include "texture.h"
//------------------------------------------------------------------------
// Mip stack construction and access helpers.
void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p)
{
char buf[1024];
int bufsz = 1024;
std::string msg = "Mip-map size error - cannot downsample an odd extent greater than 1. Resize the texture so that both spatial extents are powers of two, or limit the number of mip maps using the max_mip_level argument.\n";
int w = p.texWidth;
int h = p.texHeight;
bool ew = false;
bool eh = false;
msg += "Attempted mip stack construction:\n";
msg += "level width height\n";
msg += "----- ----- ------\n";
snprintf(buf, bufsz, "base %5d %5d\n", w, h);
msg += buf;
int level = 0;
while ((w|h) > 1 && !(ew || eh)) // Stop at first impossible size.
{
// Current level.
level += 1;
// Determine if downsampling fails.
ew = ew || (w > 1 && (w & 1));
eh = eh || (h > 1 && (h & 1));
// Downsample.
if (w > 1) w >>= 1;
if (h > 1) h >>= 1;
// Append level size to error message.
snprintf(buf, bufsz, "mip %-2d ", level);
msg += buf;
if (ew) snprintf(buf, bufsz, " err ");
else snprintf(buf, bufsz, "%5d ", w);
msg += buf;
if (eh) snprintf(buf, bufsz, " err\n");
else snprintf(buf, bufsz, "%5d\n", h);
msg += buf;
}
NVDR_CHECK(0, msg);
}
int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p)
{
// No levels at all?
if (p.mipLevelLimit == 0)
{
p.mipOffset[0] = 0;
p.mipLevelMax = 0;
return 0;
}
// Current level size.
int w = p.texWidth;
int h = p.texHeight;
p.mipOffset[0] = 0;
int mipTotal = 0;
int level = 0;
int c = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE) ? (p.channels * 6) : p.channels;
while ((w|h) > 1)
{
// Current level.
level += 1;
// Quit if cannot downsample.
if ((w > 1 && (w & 1)) || (h > 1 && (h & 1)))
raiseMipSizeError(NVDR_CTX_PARAMS, p);
// Downsample.
if (w > 1) w >>= 1;
if (h > 1) h >>= 1;
p.mipOffset[level] = mipTotal;
mipTotal += w * h * p.texDepth * c;
// Hit the level limit?
if (p.mipLevelLimit >= 0 && level == p.mipLevelLimit)
break;
}
p.mipLevelMax = level;
return mipTotal;
}
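// Example: for a 4x4, single-channel, depth-1, non-cube texture with no level limit, the
// loop above yields mipOffset[1] = 0 (2x2 level), mipOffset[2] = 4 (1x1 level), sets
// mipLevelMax = 2, and returns mipTotal = 5 floats. Level 0 is the base texture and is
// not stored in the mip stack.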
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "common.h"
#include "texture.h"
//------------------------------------------------------------------------
// Memory access and math helpers.
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float b, float c) { a[0] += b * c; }
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float2 b, float c) { a[0] += b.x * c; a[s] += b.y * c; }
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float4 b, float c) { a[0] += b.x * c; a[s] += b.y * c; a[2*s] += b.z * c; a[3*s] += b.w * c; }
static __device__ __forceinline__ void accum_to_mem(float& a, float* b, int s) { a += b[0]; }
static __device__ __forceinline__ void accum_to_mem(float2& a, float* b, int s) { float2 v = a; v.x += b[0]; v.y += b[s]; a = v; }
static __device__ __forceinline__ void accum_to_mem(float4& a, float* b, int s) { float4 v = a; v.x += b[0]; v.y += b[s]; v.z += b[2*s]; v.w += b[3*s]; a = v; }
template<class T> static __device__ __forceinline__ T lerp (const T& a, const T& b, float c) { return a + c * (b - a); }
template<class T> static __device__ __forceinline__ T bilerp(const T& a, const T& b, const T& c, const T& d, const float2& e) { return lerp(lerp(a, b, e.x), lerp(c, d, e.x), e.y); }
//------------------------------------------------------------------------
// Cube map wrapping for smooth filtering across edges and corners. At corners,
// one of the texture coordinates will be negative. For correct interpolation,
// the missing texel must take the average color of the other three.
static __constant__ uint32_t c_cubeWrapMask1[48] =
{
0x1530a440, 0x1133a550, 0x6103a110, 0x1515aa44, 0x6161aa11, 0x40154a04, 0x44115a05, 0x04611a01,
0x2630a440, 0x2233a550, 0x5203a110, 0x2626aa44, 0x5252aa11, 0x40264a04, 0x44225a05, 0x04521a01,
0x32608064, 0x3366a055, 0x13062091, 0x32328866, 0x13132299, 0x50320846, 0x55330a55, 0x05130219,
0x42508064, 0x4455a055, 0x14052091, 0x42428866, 0x14142299, 0x60420846, 0x66440a55, 0x06140219,
0x5230a044, 0x5533a055, 0x1503a011, 0x5252aa44, 0x1515aa11, 0x40520a44, 0x44550a55, 0x04150a11,
0x6130a044, 0x6633a055, 0x2603a011, 0x6161aa44, 0x2626aa11, 0x40610a44, 0x44660a55, 0x04260a11,
};
static __constant__ uint8_t c_cubeWrapMask2[48] =
{
0x26, 0x33, 0x11, 0x05, 0x00, 0x09, 0x0c, 0x04, 0x04, 0x00, 0x00, 0x05, 0x00, 0x81, 0xc0, 0x40,
0x02, 0x03, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x02, 0x64, 0x30, 0x90, 0x55, 0xa0, 0x99, 0xcc, 0x64,
0x24, 0x30, 0x10, 0x05, 0x00, 0x01, 0x00, 0x00, 0x06, 0x03, 0x01, 0x05, 0x00, 0x89, 0xcc, 0x44,
};
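// Each entry of c_cubeWrapMask1 packs eight 2-bit selectors (bits 0..15) that choose
// between 0, ix and iy for the four x and four y texel coordinates, plus four 4-bit
// fields (bits 16..31) holding destination face index + 1. c_cubeWrapMask2 holds the
// corresponding per-coordinate flip flags, decoded in wrapCubeMap() below.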
static __device__ __forceinline__ int4 wrapCubeMap(int face, int ix0, int ix1, int iy0, int iy1, int w)
{
// Calculate case number.
int cx = (ix0 < 0) ? 0 : (ix1 >= w) ? 2 : 1;
int cy = (iy0 < 0) ? 0 : (iy1 >= w) ? 6 : 3;
int c = cx + cy;
if (c >= 5)
c--;
c = (face << 3) + c;
// Compute coordinates and faces.
unsigned int m = c_cubeWrapMask1[c];
int x0 = (m >> 0) & 3; x0 = (x0 == 0) ? 0 : (x0 == 1) ? ix0 : iy0;
int x1 = (m >> 2) & 3; x1 = (x1 == 0) ? 0 : (x1 == 1) ? ix1 : iy0;
int x2 = (m >> 4) & 3; x2 = (x2 == 0) ? 0 : (x2 == 1) ? ix0 : iy1;
int x3 = (m >> 6) & 3; x3 = (x3 == 0) ? 0 : (x3 == 1) ? ix1 : iy1;
int y0 = (m >> 8) & 3; y0 = (y0 == 0) ? 0 : (y0 == 1) ? ix0 : iy0;
int y1 = (m >> 10) & 3; y1 = (y1 == 0) ? 0 : (y1 == 1) ? ix1 : iy0;
int y2 = (m >> 12) & 3; y2 = (y2 == 0) ? 0 : (y2 == 1) ? ix0 : iy1;
int y3 = (m >> 14) & 3; y3 = (y3 == 0) ? 0 : (y3 == 1) ? ix1 : iy1;
int f0 = ((m >> 16) & 15) - 1;
int f1 = ((m >> 20) & 15) - 1;
int f2 = ((m >> 24) & 15) - 1;
int f3 = ((m >> 28) ) - 1;
// Flips.
unsigned int f = c_cubeWrapMask2[c];
int w1 = w - 1;
if (f & 0x01) x0 = w1 - x0;
if (f & 0x02) x1 = w1 - x1;
if (f & 0x04) x2 = w1 - x2;
if (f & 0x08) x3 = w1 - x3;
if (f & 0x10) y0 = w1 - y0;
if (f & 0x20) y1 = w1 - y1;
if (f & 0x40) y2 = w1 - y2;
if (f & 0x80) y3 = w1 - y3;
// Done.
int4 tcOut;
tcOut.x = x0 + (y0 + f0 * w) * w;
tcOut.y = x1 + (y1 + f1 * w) * w;
tcOut.z = x2 + (y2 + f2 * w) * w;
tcOut.w = x3 + (y3 + f3 * w) * w;
return tcOut;
}
//------------------------------------------------------------------------
// Cube map indexing and gradient functions.
// Map a 3D lookup vector into (s,t) face coordinates (returned in the first
// two parameters) and a face index.
static __device__ __forceinline__ int indexCubeMap(float& x, float& y, float z)
{
float ax = fabsf(x);
float ay = fabsf(y);
float az = fabsf(z);
int idx;
float c;
if (az > fmaxf(ax, ay)) { idx = 4; c = z; }
else if (ay > ax) { idx = 2; c = y; y = z; }
else { idx = 0; c = x; x = z; }
if (c < 0.f) idx += 1;
float m = __frcp_rz(fabsf(c)) * .5;
float m0 = __uint_as_float(__float_as_uint(m) ^ ((0x21u >> idx) << 31));
float m1 = (idx != 2) ? -m : m;
x = x * m0 + .5;
y = y * m1 + .5;
x = fminf(fmaxf(x, 0.f), 1.f);
y = fminf(fmaxf(y, 0.f), 1.f);
return idx;
}
// Based on dL/d{s,t}, compute dL/d{x,y,z} at a given 3D lookup vector.
static __device__ __forceinline__ float3 indexCubeMapGrad(float3 uv, float gu, float gv)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c;
float c0 = gu;
float c1 = gv;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 *= uv.x; c1 *= uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; c0 *= uv.x; c1 *= uv.z; }
else { idx = 0x01; c = uv.x; c0 *= uv.z; c1 *= uv.y; }
if (c < 0.f) idx += idx;
float m = __frcp_rz(fabsf(c));
c0 = (idx & 0x34) ? -c0 : c0;
c1 = (idx & 0x2e) ? -c1 : c1;
float gl = (c0 + c1) * m;
float gx = (idx & 0x03) ? gl : (idx & 0x20) ? -gu : gu;
float gy = (idx & 0x0c) ? gl : -gv;
float gz = (idx & 0x30) ? gl : (idx & 0x03) ? gu : gv;
gz = (idx & 0x09) ? -gz : gz;
return make_float3(gx, gy, gz) * (m * .5f);
}
// Based on dL/d(d{s,t}/d{X,Y}), compute dL/d(d{x,y,z}/d{X,Y}). This is just two
// indexCubeMapGrad() functions rolled together.
static __device__ __forceinline__ void indexCubeMapGrad4(float3 uv, float4 dw, float3& g0, float3& g1)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c, c0, c1;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; }
else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; }
if (c < 0.f) idx += idx;
float m = __frcp_rz(fabsf(c));
c0 = (idx & 0x34) ? -c0 : c0;
c1 = (idx & 0x2e) ? -c1 : c1;
float gl0 = (dw.x * c0 + dw.z * c1) * m;
float gl1 = (dw.y * c0 + dw.w * c1) * m;
float gx0 = (idx & 0x03) ? gl0 : (idx & 0x20) ? -dw.x : dw.x;
float gx1 = (idx & 0x03) ? gl1 : (idx & 0x20) ? -dw.y : dw.y;
float gy0 = (idx & 0x0c) ? gl0 : -dw.z;
float gy1 = (idx & 0x0c) ? gl1 : -dw.w;
float gz0 = (idx & 0x30) ? gl0 : (idx & 0x03) ? dw.x : dw.z;
float gz1 = (idx & 0x30) ? gl1 : (idx & 0x03) ? dw.y : dw.w;
if (idx & 0x09)
{
gz0 = -gz0;
gz1 = -gz1;
}
g0 = make_float3(gx0, gy0, gz0) * (m * .5f);
g1 = make_float3(gx1, gy1, gz1) * (m * .5f);
}
// Compute d{s,t}/d{X,Y} based on d{x,y,z}/d{X,Y} at a given 3D lookup vector.
// Result is (ds/dX, ds/dY, dt/dX, dt/dY).
static __device__ __forceinline__ float4 indexCubeMapGradST(float3 uv, float3 dvdX, float3 dvdY)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c, gu, gv;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; }
else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; }
if (c < 0.f) idx += idx;
if (idx & 0x09)
{
dvdX.z = -dvdX.z;
dvdY.z = -dvdY.z;
}
float m = __frcp_rz(fabsf(c));
float dm = m * .5f;
float mm = m * dm;
gu *= (idx & 0x34) ? -mm : mm;
gv *= (idx & 0x2e) ? -mm : mm;
if (idx & 0x03)
{
return make_float4(gu * dvdX.x + dm * dvdX.z,
gu * dvdY.x + dm * dvdY.z,
gv * dvdX.x - dm * dvdX.y,
gv * dvdY.x - dm * dvdY.y);
}
else if (idx & 0x0c)
{
return make_float4(gu * dvdX.y + dm * dvdX.x,
gu * dvdY.y + dm * dvdY.x,
gv * dvdX.y + dm * dvdX.z,
gv * dvdY.y + dm * dvdY.z);
}
else // (idx & 0x30)
{
return make_float4(gu * dvdX.z + copysignf(dm, c) * dvdX.x,
gu * dvdY.z + copysignf(dm, c) * dvdY.x,
gv * dvdX.z - dm * dvdX.y,
gv * dvdY.z - dm * dvdY.y);
}
}
// Compute d(d{s,t}/d{X,Y})/d{x,y,z}, i.e., how the pixel derivatives of 2D face
// coordinates change w.r.t. 3D texture coordinate vector, returned as follows:
// | d(ds/dX)/dx d(ds/dY)/dx d(dt/dX)/dx d(dt/dY)/dx |
// | d(ds/dX)/dy d(ds/dY)/dy d(dt/dX)/dy d(dt/dY)/dy |
// | d(ds/dX)/dz d(ds/dY)/dz d(dt/dX)/dz d(dt/dY)/dz |
static __device__ __forceinline__ void indexCubeMapGrad2(float3 uv, float3 dvdX, float3 dvdY, float4& dx, float4& dy, float4& dz)
{
float ax = fabsf(uv.x);
float ay = fabsf(uv.y);
float az = fabsf(uv.z);
int idx;
float c, gu, gv;
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; }
else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; }
else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; }
if (c < 0.f) idx += idx;
if (idx & 0x09)
{
dvdX.z = -dvdX.z;
dvdY.z = -dvdY.z;
}
float m = __frcp_rz(c);
float dm = -m * fabsf(m) * .5;
float mm = m * m * .5;
float mu = (idx & 0x34) ? -mm : mm;
float mv = (idx & 0x2e) ? -mm : mm;
gu *= -2.0 * m * mu;
gv *= -2.0 * m * mv;
if (idx & 0x03)
{
dx.x = gu * dvdX.x + dm * dvdX.z;
dx.y = gu * dvdY.x + dm * dvdY.z;
dx.z = gv * dvdX.x - dm * dvdX.y;
dx.w = gv * dvdY.x - dm * dvdY.y;
dy.x = 0.f;
dy.y = 0.f;
dy.z = mv * dvdX.x;
dy.w = mv * dvdY.x;
dz.x = mu * dvdX.x;
dz.y = mu * dvdY.x;
dz.z = 0.f;
dz.w = 0.f;
}
else if (idx & 0x0c)
{
dx.x = mu * dvdX.y;
dx.y = mu * dvdY.y;
dx.z = 0.f;
dx.w = 0.f;
dy.x = gu * dvdX.y + dm * dvdX.x;
dy.y = gu * dvdY.y + dm * dvdY.x;
dy.z = gv * dvdX.y + dm * dvdX.z;
dy.w = gv * dvdY.y + dm * dvdY.z;
dz.x = 0.f;
dz.y = 0.f;
dz.z = mv * dvdX.y;
dz.w = mv * dvdY.y;
}
else // (idx & 0x30)
{
dx.x = mu * dvdX.z;
dx.y = mu * dvdY.z;
dx.z = 0.f;
dx.w = 0.f;
dy.x = 0.f;
dy.y = 0.f;
dy.z = mv * dvdX.z;
dy.w = mv * dvdY.z;
dz.x = gu * dvdX.z - fabsf(dm) * dvdX.x;
dz.y = gu * dvdY.z - fabsf(dm) * dvdY.x;
dz.z = gv * dvdX.z - dm * dvdX.y;
dz.w = gv * dvdY.z - dm * dvdY.y;
}
}
//------------------------------------------------------------------------
// General texture indexing.
template <bool CUBE_MODE>
static __device__ __forceinline__ int indexTextureNearest(const TextureKernelParams& p, float3 uv, int tz)
{
int w = p.texWidth;
int h = p.texHeight;
float u = uv.x;
float v = uv.y;
// Cube map indexing.
if (CUBE_MODE)
{
// No wrap. Fold face index into tz right away.
tz = 6 * tz + indexCubeMap(u, v, uv.z); // Rewrites u, v.
}
else
{
// Handle boundary.
if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
{
u = u - (float)__float2int_rd(u);
v = v - (float)__float2int_rd(v);
}
}
u = u * (float)w;
v = v * (float)h;
int iu = __float2int_rd(u);
int iv = __float2int_rd(v);
// In zero boundary mode, return texture address -1.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
{
if (iu < 0 || iu >= w || iv < 0 || iv >= h)
return -1;
}
// Otherwise clamp and calculate the coordinate properly.
iu = min(max(iu, 0), w-1);
iv = min(max(iv, 0), h-1);
return iu + w * (iv + tz * h);
}
template <bool CUBE_MODE>
static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelParams& p, float3 uv, int tz, int4& tcOut, int level)
{
// Mip level size.
int2 sz = mipLevelSize(p, level);
int w = sz.x;
int h = sz.y;
// Compute texture-space u, v.
float u = uv.x;
float v = uv.y;
bool clampU = false;
bool clampV = false;
// Cube map indexing.
int face = 0;
if (CUBE_MODE)
{
// Neither clamp nor wrap.
face = indexCubeMap(u, v, uv.z); // Rewrites u, v.
u = u * (float)w - 0.5f;
v = v * (float)h - 0.5f;
}
else
{
if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
{
// Wrap.
u = u - (float)__float2int_rd(u);
v = v - (float)__float2int_rd(v);
}
// Move to texel space.
u = u * (float)w - 0.5f;
v = v * (float)h - 0.5f;
if (p.boundaryMode == TEX_BOUNDARY_MODE_CLAMP)
{
// Clamp to center of edge texels.
u = fminf(fmaxf(u, 0.f), w - 1.f);
v = fminf(fmaxf(v, 0.f), h - 1.f);
clampU = (u == 0.f || u == w - 1.f);
clampV = (v == 0.f || v == h - 1.f);
}
}
// Compute texel coordinates and weights.
int iu0 = __float2int_rd(u);
int iv0 = __float2int_rd(v);
int iu1 = iu0 + (clampU ? 0 : 1); // Ensure zero u/v gradients when clamped.
int iv1 = iv0 + (clampV ? 0 : 1);
u -= (float)iu0;
v -= (float)iv0;
// Cube map wrapping.
bool cubeWrap = CUBE_MODE && (iu0 < 0 || iv0 < 0 || iu1 >= w || iv1 >= h);
if (cubeWrap)
{
tcOut = wrapCubeMap(face, iu0, iu1, iv0, iv1, w);
tcOut += 6 * tz * w * h; // Bring in tz.
return make_float2(u, v); // Done.
}
// Fold cube map face into tz.
if (CUBE_MODE)
tz = 6 * tz + face;
// Wrap overflowing texel indices.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
{
if (iu0 < 0) iu0 += w;
if (iv0 < 0) iv0 += h;
if (iu1 >= w) iu1 -= w;
if (iv1 >= h) iv1 -= h;
}
// Coordinates with tz folded in. Keep iu0/iu1 unshifted for the zero-mode range checks below.
int iu0z = iu0 + tz * w * h;
int iu1z = iu1 + tz * w * h;
tcOut.x = iu0z + w * iv0;
tcOut.y = iu1z + w * iv0;
tcOut.z = iu0z + w * iv1;
tcOut.w = iu1z + w * iv1;
// Invalidate texture addresses outside unit square if we are in zero mode.
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
{
bool iu0_out = (iu0 < 0 || iu0 >= w);
bool iu1_out = (iu1 < 0 || iu1 >= w);
bool iv0_out = (iv0 < 0 || iv0 >= h);
bool iv1_out = (iv1 < 0 || iv1 >= h);
if (iu0_out || iv0_out) tcOut.x = -1;
if (iu1_out || iv0_out) tcOut.y = -1;
if (iu0_out || iv1_out) tcOut.z = -1;
if (iu1_out || iv1_out) tcOut.w = -1;
}
// All done.
return make_float2(u, v);
}
//------------------------------------------------------------------------
// Mip level calculation.
template <bool CUBE_MODE, int FILTER_MODE>
static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv)
{
// Do nothing if mips not in use.
if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR)
return;
// Get pixel derivatives of texture coordinates.
float4 uvDA;
float3 dvdX, dvdY; // Gradients use these later.
if (CUBE_MODE)
{
// Fetch.
float2 d0 = ((const float2*)p.uvDA)[3 * pidx + 0];
float2 d1 = ((const float2*)p.uvDA)[3 * pidx + 1];
float2 d2 = ((const float2*)p.uvDA)[3 * pidx + 2];
// Map d{x,y,z}/d{X,Y} into d{s,t}/d{X,Y}.
dvdX = make_float3(d0.x, d1.x, d2.x); // d{x,y,z}/dX
dvdY = make_float3(d0.y, d1.y, d2.y); // d{x,y,z}/dY
uvDA = indexCubeMapGradST(uv, dvdX, dvdY); // d{s,t}/d{X,Y}
}
else
{
// Fetch.
uvDA = ((const float4*)p.uvDA)[pidx];
}
// Scaling factors.
float uscl = p.texWidth;
float vscl = p.texHeight;
// d[s,t]/d[X,Y].
float dsdx = uvDA.x * uscl;
float dsdy = uvDA.y * uscl;
float dtdx = uvDA.z * vscl;
float dtdy = uvDA.w * vscl;
// Calculate footprint axis lengths.
float A = dsdx*dsdx + dtdx*dtdx;
float B = dsdy*dsdy + dtdy*dtdy;
float C = dsdx*dsdy + dtdx*dtdy;
float l2b = 0.5 * (A + B);
float l2n = 0.25 * (A-B)*(A-B) + C*C;
float l2a = sqrt(l2n);
float lenMinorSqr = fmaxf(0.0, l2b - l2a);
float lenMajorSqr = l2b + l2a;
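// A, B and C are the entries of the Gram matrix [[A, C], [C, B]] of the texel-space
// pixel footprint Jacobian; its eigenvalues l2b +/- l2a are the squared lengths of the
// footprint's major and minor axes.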
// Footprint vs. mip level gradient.
if (pdw && FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
{
float dw = 0.72134752f / (l2n + l2a * l2b); // Constant is 0.5/ln(2).
float AB = dw * .5f * (A - B);
float Cw = dw * C;
float l2aw = dw * l2a;
float d_f_ddsdX = uscl * (dsdx * (l2aw + AB) + dsdy * Cw);
float d_f_ddsdY = uscl * (dsdy * (l2aw - AB) + dsdx * Cw);
float d_f_ddtdX = vscl * (dtdx * (l2aw + AB) + dtdy * Cw);
float d_f_ddtdY = vscl * (dtdy * (l2aw - AB) + dtdx * Cw);
*pdw = make_float4(d_f_ddsdX, d_f_ddsdY, d_f_ddtdX, d_f_ddtdY);
// In cube maps, there is also a texture coordinate vs. mip level gradient.
if (CUBE_MODE)
{
float4 dx, dy, dz;
indexCubeMapGrad2(uv, dvdX, dvdY, dx, dy, dz);
float3 d_dsdX_dv = make_float3(dx.x, dy.x, dz.x);
float3 d_dsdY_dv = make_float3(dx.y, dy.y, dz.y);
float3 d_dtdX_dv = make_float3(dx.z, dy.z, dz.z);
float3 d_dtdY_dv = make_float3(dx.w, dy.w, dz.w);
float3 d_f_dv = make_float3(0.f, 0.f, 0.f);
d_f_dv += d_dsdX_dv * d_f_ddsdX;
d_f_dv += d_dsdY_dv * d_f_ddsdY;
d_f_dv += d_dtdX_dv * d_f_ddtdX;
d_f_dv += d_dtdY_dv * d_f_ddtdY;
*pdfdv = d_f_dv;
}
}
// Calculate true mip level and clamp.
flevel = .5f * __log2f(lenMajorSqr);
flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax);
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
{
// Linear-mipmap-nearest.
level0 = __float2int_rn(flevel);
}
else
{
// Linear-mipmap-linear.
if (flevel > 0.f) // Leave everything at zero if flevel == 0 (magnification)
{
level0 = __float2int_rd(flevel);
level1 = min(level0 + 1, p.mipLevelMax);
flevel -= level0; // Fractional part. Zero if clamped on last level.
}
}
}
//------------------------------------------------------------------------
// Texel fetch and accumulator helpers that understand cube map corners.
template<class T>
static __device__ __forceinline__ void fetchQuad(T& a00, T& a10, T& a01, T& a11, const float* pIn, int4 tc, bool corner)
{
if (corner)
{
T avg = zero_value<T>();
if (tc.x >= 0) avg += (a00 = *((const T*)&pIn[tc.x]));
if (tc.y >= 0) avg += (a10 = *((const T*)&pIn[tc.y]));
if (tc.z >= 0) avg += (a01 = *((const T*)&pIn[tc.z]));
if (tc.w >= 0) avg += (a11 = *((const T*)&pIn[tc.w]));
avg *= 0.33333333f;
if (tc.x < 0) a00 = avg;
if (tc.y < 0) a10 = avg;
if (tc.z < 0) a01 = avg;
if (tc.w < 0) a11 = avg;
}
else
{
a00 = (tc.x >= 0) ? *((const T*)&pIn[tc.x]) : zero_value<T>();
a10 = (tc.y >= 0) ? *((const T*)&pIn[tc.y]) : zero_value<T>();
a01 = (tc.z >= 0) ? *((const T*)&pIn[tc.z]) : zero_value<T>();
a11 = (tc.w >= 0) ? *((const T*)&pIn[tc.w]) : zero_value<T>();
}
}
static __device__ __forceinline__ void accumQuad(float4 c, float* pOut, int level, int4 tc, bool corner, CA_TEMP_PARAM)
{
if (corner)
{
float cb;
if (tc.x < 0) cb = c.x;
if (tc.y < 0) cb = c.y;
if (tc.z < 0) cb = c.z;
if (tc.w < 0) cb = c.w;
cb *= 0.33333333f;
if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x + cb);
if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y + cb);
if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z + cb);
if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w + cb);
}
else
{
if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x);
if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y);
if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z);
if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w);
}
}
//------------------------------------------------------------------------
// Mip builder kernel.
template<class T, int C>
static __forceinline__ __device__ void MipBuildKernelTemplate(const TextureKernelParams p)
{
// Sizes.
int2 sz_in = mipLevelSize(p, p.mipLevelOut - 1);
int2 sz_out = mipLevelSize(p, p.mipLevelOut);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= sz_out.x || py >= sz_out.y)
return;
// Pixel indices.
int pidx_in0 = p.channels * (((px + sz_in.x * py) << 1) + (pz * sz_in.x * sz_in.y));
int pidx_in1 = pidx_in0 + p.channels * sz_in.x; // Next pixel down.
int pidx_out = p.channels * (px + sz_out.x * (py + sz_out.y * pz));
// Input and output pointers.
const float* pin = (p.mipLevelOut > 1) ? (p.mip + p.mipOffset[p.mipLevelOut - 1]) : p.tex;
float* pout = p.mip + p.mipOffset[p.mipLevelOut];
// Special case: Input texture height or width is 1.
if (sz_in.x == 1 || sz_in.y == 1)
{
if (sz_in.y == 1)
pidx_in1 = pidx_in0 + p.channels; // Next pixel on the right.
for (int i=0; i < p.channels; i += C)
{
T v0 = *((const T*)&pin[pidx_in0 + i]);
T v1 = *((const T*)&pin[pidx_in1 + i]);
T avg = .5f * (v0 + v1);
#if TEX_DEBUG_MIP_RETAIN_VARIANCE
avg = (avg - .5f) * 1.41421356f + .5f;
#endif
*((T*)&pout[pidx_out + i]) = avg;
}
return;
}
for (int i=0; i < p.channels; i += C)
{
T v0 = *((const T*)&pin[pidx_in0 + i]);
T v1 = *((const T*)&pin[pidx_in0 + i + p.channels]);
T v2 = *((const T*)&pin[pidx_in1 + i]);
T v3 = *((const T*)&pin[pidx_in1 + i + p.channels]);
T avg = .25f * (v0 + v1 + v2 + v3);
#if TEX_DEBUG_MIP_RETAIN_VARIANCE
avg = (avg - .5f) * 2.f + .5f;
#endif
*((T*)&pout[pidx_out + i]) = avg;
}
}
// Template specializations.
__global__ void MipBuildKernel1(const TextureKernelParams p) { MipBuildKernelTemplate<float, 1>(p); }
__global__ void MipBuildKernel2(const TextureKernelParams p) { MipBuildKernelTemplate<float2, 2>(p); }
__global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTemplate<float4, 4>(p); }
//------------------------------------------------------------------------
// Forward kernel.
template <class T, int C, bool CUBE_MODE, int FILTER_MODE>
static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p)
{
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
int tz = (p.texDepth == 1) ? 0 : pz;
if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n)
return;
// Pixel index.
int pidx = px + p.imgWidth * (py + p.imgHeight * pz);
// Output ptr.
float* pOut = p.out + pidx * p.channels;
// Get UV.
float3 uv;
if (CUBE_MODE)
uv = ((const float3*)p.uv)[pidx];
else
uv = make_float3(((const float2*)p.uv)[pidx], 0.f);
// Nearest mode.
if (FILTER_MODE == TEX_MODE_NEAREST)
{
int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz);
tc *= p.channels;
const float* pIn = p.tex;
// Copy if valid tc, otherwise output zero.
for (int i=0; i < p.channels; i += C)
*((T*)&pOut[i]) = (tc >= 0) ? *((const T*)&pIn[tc + i]) : zero_value<T>();
return; // Exit.
}
// Calculate mip level. In 'linear' mode these will all stay zero.
float flevel = 0.f; // Fractional level.
int level0 = 0; // Discrete level 0.
int level1 = 0; // Discrete level 1.
calculateMipLevel<CUBE_MODE, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, 0, 0);
// Get texel indices and pointer for level 0.
int4 tc0 = make_int4(0, 0, 0, 0);
float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0);
const float* pIn0 = level0 ? (p.mip + p.mipOffset[level0]) : p.tex;
bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0);
tc0 *= p.channels;
// Bilinear fetch.
if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
{
// Interpolate.
for (int i=0; i < p.channels; i += C, tc0 += C)
{
T a00, a10, a01, a11;
fetchQuad<T>(a00, a10, a01, a11, pIn0, tc0, corner0);
*((T*)&pOut[i]) = bilerp(a00, a10, a01, a11, uv0);
}
return; // Exit.
}
// Get texel indices and pointer for level 1.
int4 tc1 = make_int4(0, 0, 0, 0);
float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1);
const float* pIn1 = level1 ? (p.mip + p.mipOffset[level1]) : p.tex;
bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0);
tc1 *= p.channels;
// Trilinear fetch.
for (int i=0; i < p.channels; i += C, tc0 += C, tc1 += C)
{
// First level.
T a00, a10, a01, a11;
fetchQuad<T>(a00, a10, a01, a11, pIn0, tc0, corner0);
T a = bilerp(a00, a10, a01, a11, uv0);
// Second level unless in magnification mode.
if (flevel > 0.f)
{
T b00, b10, b01, b11;
fetchQuad<T>(b00, b10, b01, b11, pIn1, tc1, corner1);
T b = bilerp(b00, b10, b01, b11, uv1);
a = lerp(a, b, flevel); // Interpolate between levels.
}
// Write.
*((T*)&pOut[i]) = a;
}
}
// Template specializations.
__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
//------------------------------------------------------------------------
// Gradient mip puller kernel.
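// For each base-level texel, gathers the gradients accumulated in gradTexMip across all
// coarser mip levels, weighted by the mip construction weights, and adds the sum into the
// main texture gradient buffer gradTex.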
template<class T, int C>
static __forceinline__ __device__ void MipGradKernelTemplate(const TextureKernelParams p)
{
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.texWidth || py >= p.texHeight)
return;
// Number of wide elements.
int c = p.channels;
if (C == 2) c >>= 1;
if (C == 4) c >>= 2;
// Dynamically allocated shared memory for holding a texel.
extern __shared__ float s_texelAccum[];
int sharedOfs = threadIdx.x + threadIdx.y * blockDim.x;
int sharedStride = blockDim.x * blockDim.y;
# define TEXEL_ACCUM(_i) (s_texelAccum + (sharedOfs + (_i) * sharedStride))
// Clear the texel.
for (int i=0; i < p.channels; i++)
*TEXEL_ACCUM(i) = 0.f;
// Track texel position and accumulation weight over the mip stack.
int x = px;
int y = py;
float w = 1.f;
// Pull gradients from all levels.
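// The accumulation weight halves per non-degenerate axis at each level, so a texel that was
// averaged into a coarser level receives the corresponding fraction of that level's gradient.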
int2 sz = mipLevelSize(p, 0); // Previous level size.
for (int level=1; level <= p.mipLevelMax; level++)
{
// Weight decay depends on previous level size.
if (sz.x > 1) w *= .5f;
if (sz.y > 1) w *= .5f;
// Current level size and coordinates.
sz = mipLevelSize(p, level);
x >>= 1;
y >>= 1;
T* pIn = (T*)(p.gradTexMip + p.mipOffset[level] + (x + sz.x * (y + sz.y * pz)) * p.channels);
for (int i=0; i < c; i++)
accum_from_mem(TEXEL_ACCUM(i * C), sharedStride, pIn[i], w);
}
// Add to main texture gradients.
T* pOut = (T*)(p.gradTex + (px + p.texWidth * (py + p.texHeight * pz)) * p.channels);
for (int i=0; i < c; i++)
accum_to_mem(pOut[i], TEXEL_ACCUM(i * C), sharedStride);
}
// Template specializations.
__global__ void MipGradKernel1(const TextureKernelParams p) { MipGradKernelTemplate<float, 1>(p); }
__global__ void MipGradKernel2(const TextureKernelParams p) { MipGradKernelTemplate<float2, 2>(p); }
__global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTemplate<float4, 4>(p); }
//------------------------------------------------------------------------
// Gradient kernel.
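// Computes, for one output pixel, gradients w.r.t. the texture (and mip stack), the uv
// coordinates, and, in trilinear mode, the uv pixel differentials.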
template <bool CUBE_MODE, int FILTER_MODE>
static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p)
{
// Temporary space for coalesced atomics.
CA_DECLARE_TEMP(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH * TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
// Calculate pixel position.
int px = blockIdx.x * blockDim.x + threadIdx.x;
int py = blockIdx.y * blockDim.y + threadIdx.y;
int pz = blockIdx.z;
int tz = (p.texDepth == 1) ? 0 : pz;
if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n)
return;
// Pixel index.
int pidx = px + p.imgWidth * (py + p.imgHeight * pz);
// Early exit if output gradients are zero.
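// The bitwise OR of the incoming gradient bit patterns compares equal to 0.0f only if every
// component is +/-0, so a single float comparison detects an all-zero gradient.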
const float* pDy = p.dy + pidx * p.channels;
unsigned int dmax = 0u;
if ((p.channels & 3) == 0)
{
for (int i=0; i < p.channels; i += 4)
{
uint4 dy = *((const uint4*)&pDy[i]);
dmax |= (dy.x | dy.y | dy.z | dy.w);
}
}
else
{
for (int i=0; i < p.channels; i++)
dmax |= __float_as_uint(pDy[i]);
}
// Store zeros and exit.
if (__uint_as_float(dmax) == 0.f)
{
if (CUBE_MODE)
{
if (FILTER_MODE != TEX_MODE_NEAREST)
((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f);
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
{
((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f);
((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f);
((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f);
}
}
else
{
if (FILTER_MODE != TEX_MODE_NEAREST)
((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f);
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
}
return;
}
// Get UV.
float3 uv;
if (CUBE_MODE)
uv = ((const float3*)p.uv)[pidx];
else
uv = make_float3(((const float2*)p.uv)[pidx], 0.f);
// Nearest mode - texture gradients only.
if (FILTER_MODE == TEX_MODE_NEAREST)
{
int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz);
if (tc < 0)
return; // Outside texture.
tc *= p.channels;
float* pOut = p.gradTex;
// Accumulate texture gradients.
for (int i=0; i < p.channels; i++)
caAtomicAddTexture(pOut, 0, tc + i, pDy[i]);
return; // Exit.
}
// Calculate mip level. In 'linear' mode these will all stay zero.
float4 dw = make_float4(0.f, 0.f, 0.f, 0.f);
float3 dfdv = make_float3(0.f, 0.f, 0.f);
float flevel = 0.f; // Fractional level.
int level0 = 0; // Discrete level 0.
int level1 = 0; // Discrete level 1.
calculateMipLevel<CUBE_MODE, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, &dw, &dfdv);
// UV gradient accumulators.
float gu = 0.f;
float gv = 0.f;
// Get texel indices and pointers for level 0.
int4 tc0 = make_int4(0, 0, 0, 0);
float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0);
const float* pIn0 = level0 ? (p.mip + p.mipOffset[level0]) : p.tex;
float* pOut0 = level0 ? (p.gradTexMip + p.mipOffset[level0]) : p.gradTex;
bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0);
tc0 *= p.channels;
// Texel weights.
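// Bilinear corner weights in factored form: uv000 = (1-u)(1-v), uv010 = u(1-v),
// uv001 = (1-u)v, uv011 = u*v, with (u, v) = uv0.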
float uv011 = uv0.x * uv0.y;
float uv010 = uv0.x - uv011;
float uv001 = uv0.y - uv011;
float uv000 = 1.f - uv0.x - uv001;
float4 tw0 = make_float4(uv000, uv010, uv001, uv011);
// Attribute weights.
int2 sz0 = mipLevelSize(p, level0);
float sclu0 = (float)sz0.x;
float sclv0 = (float)sz0.y;
// Bilinear mode - texture and uv gradients.
if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
{
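// The analytic bilerp derivative is d/du = (a10 - a00) + v * (a11 + a00 - a10 - a01)
// (and symmetrically for v); sclu0/sclv0 rescale it from per-texel to normalized uv units.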
for (int i=0; i < p.channels; i++, tc0 += 1)
{
float dy = pDy[i];
accumQuad(tw0 * dy, pOut0, level0, tc0, corner0, CA_TEMP);
float a00, a10, a01, a11;
fetchQuad<float>(a00, a10, a01, a11, pIn0, tc0, corner0);
float ad = (a11 + a00 - a10 - a01);
gu += dy * ((a10 - a00) + uv0.y * ad) * sclu0;
gv += dy * ((a01 - a00) + uv0.x * ad) * sclv0;
}
// Store UV gradients and exit.
if (CUBE_MODE)
((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv);
else
((float2*)p.gradUV)[pidx] = make_float2(gu, gv);
return;
}
// Accumulate fractional mip level gradient.
float df = 0; // dL/df.
// Get texel indices and pointers for level 1.
int4 tc1 = make_int4(0, 0, 0, 0);
float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1);
const float* pIn1 = level1 ? (p.mip + p.mipOffset[level1]) : p.tex;
float* pOut1 = level1 ? (p.gradTexMip + p.mipOffset[level1]) : p.gradTex;
bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0);
tc1 *= p.channels;
// Texel weights.
float uv111 = uv1.x * uv1.y;
float uv110 = uv1.x - uv111;
float uv101 = uv1.y - uv111;
float uv100 = 1.f - uv1.x - uv101;
float4 tw1 = make_float4(uv100, uv110, uv101, uv111);
// Attribute weights.
int2 sz1 = mipLevelSize(p, level1);
float sclu1 = (float)sz1.x;
float sclv1 = (float)sz1.y;
// Trilinear mode.
for (int i=0; i < p.channels; i++, tc0 += 1, tc1 += 1)
{
float dy = pDy[i];
float dy0 = (1.f - flevel) * dy;
accumQuad(tw0 * dy0, pOut0, level0, tc0, corner0, CA_TEMP);
// UV gradients for first level.
float a00, a10, a01, a11;
fetchQuad<float>(a00, a10, a01, a11, pIn0, tc0, corner0);
float ad = (a11 + a00 - a10 - a01);
gu += dy0 * ((a10 - a00) + uv0.y * ad) * sclu0;
gv += dy0 * ((a01 - a00) + uv0.x * ad) * sclv0;
// Second level unless in magnification mode.
if (flevel > 0.f)
{
// Texture gradients for second level.
float dy1 = flevel * dy;
accumQuad(tw1 * dy1, pOut1, level1, tc1, corner1, CA_TEMP);
// UV gradients for second level.
float b00, b10, b01, b11;
fetchQuad<float>(b00, b10, b01, b11, pIn1, tc1, corner1);
float bd = (b11 + b00 - b10 - b01);
gu += dy1 * ((b10 - b00) + uv1.y * bd) * sclu1;
gv += dy1 * ((b01 - b00) + uv1.x * bd) * sclv1;
// Mip level gradient.
float a = bilerp(a00, a10, a01, a11, uv0);
float b = bilerp(b00, b10, b01, b11, uv1);
df += (b-a) * dy;
}
}
// Store UV gradients.
if (CUBE_MODE)
((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv) + (dfdv * df);
else
((float2*)p.gradUV)[pidx] = make_float2(gu, gv);
// Final UV pixel differential gradients.
dw *= df; // dL/(d{s,t}/d{X,Y}) = df/(d{s,t}/d{X,Y}) * dL/df.
// Store them.
if (CUBE_MODE)
{
// Remap from dL/(d{s,t}/d{X,Y}) to dL/(d{x,y,z}/d{X,Y}).
float3 g0, g1;
indexCubeMapGrad4(uv, dw, g0, g1);
((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(g0.x, g1.x);
((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(g0.y, g1.y);
((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(g0.z, g1.z);
}
else
((float4*)p.gradUVDA)[pidx] = dw;
}
// Template specializations.
__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_NEAREST>(p); }
__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR>(p); }
__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_NEAREST>(p); }
__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR>(p); }
__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "framework.h"
//------------------------------------------------------------------------
// Constants.
#define TEX_DEBUG_MIP_RETAIN_VARIANCE 0 // For debugging
#define TEX_FWD_MAX_KERNEL_BLOCK_WIDTH 8
#define TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT 8
#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH 8
#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8
#define TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH 8
#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8
#define TEX_MAX_MIP_LEVEL 14 // Currently a texture cannot be larger than 2 GB because we use 32-bit indices everywhere.
#define TEX_MODE_NEAREST 0 // Nearest on base level.
#define TEX_MODE_LINEAR 1 // Bilinear on base level.
#define TEX_MODE_LINEAR_MIPMAP_NEAREST 2 // Bilinear on nearest mip level.
#define TEX_MODE_LINEAR_MIPMAP_LINEAR 3 // Trilinear.
#define TEX_MODE_COUNT 4
#define TEX_BOUNDARY_MODE_CUBE 0 // Cube map mode.
#define TEX_BOUNDARY_MODE_WRAP 1 // Wrap (u, v).
#define TEX_BOUNDARY_MODE_CLAMP 2 // Clamp (u, v).
#define TEX_BOUNDARY_MODE_ZERO 3 // Pad with zeros.
#define TEX_BOUNDARY_MODE_COUNT 4
//------------------------------------------------------------------------
// CUDA kernel params.
struct TextureKernelParams
{
const float* tex; // Incoming texture buffer.
const float* uv; // Incoming texcoord buffer.
const float* uvDA; // Incoming uv pixel diffs. NULL if mips disabled.
const float* dy; // Incoming output gradient.
float* mip; // Mip data buffer.
float* out; // Outgoing texture data.
float* gradTex; // Outgoing texture gradient.
float* gradTexMip; // Temporary texture gradients for mip levels > 0.
float* gradUV; // Outgoing texcoord gradient.
float* gradUVDA; // Outgoing texcoord pixel differential gradient.
int enableMip; // If true, we have uv_da input and mip output tensor.
int filterMode; // One of the TEX_MODE_ constants.
int boundaryMode; // One of the TEX_BOUNDARY_MODE_ constants.
int texConst; // If true, texture is known to be constant.
int mipLevelLimit; // Mip level limit coming from the op.
int channels; // Number of texture channels.
int imgWidth; // Image width.
int imgHeight; // Image height.
int texWidth; // Texture width.
int texHeight; // Texture height.
int texDepth; // Texture depth.
int n; // Minibatch size.
int mipLevelMax; // Maximum mip level index. Zero if mips disabled.
int mipOffset[TEX_MAX_MIP_LEVEL]; // Offsets in mip data. 0: unused, 1+: offset to mip.
int mipLevelOut; // Mip level being calculated in builder kernel.
};
//------------------------------------------------------------------------
// C++ helper function prototypes.
void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p);
int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p);
//------------------------------------------------------------------------
// Macros.
#define mipLevelSize(p, i) make_int2(((p).texWidth >> (i)) > 1 ? ((p).texWidth >> (i)) : 1, ((p).texHeight >> (i)) > 1 ? ((p).texHeight >> (i)) : 1)
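// Example: for a 256x64 texture, mipLevelSize(p, 3) is (32, 8); each dimension clamps at 1.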
//------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
from .ops import rasterize, interpolate, texture, antialias
from .plugin_loader import set_cache_dir
__all__ = ["rasterize", "interpolate", "texture", "antialias", "set_cache_dir"]
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import tensorflow as tf
import numpy as np
import os
from . import plugin_loader
#----------------------------------------------------------------------------
# Helpers.
#----------------------------------------------------------------------------
# OpenGL-related linker options depending on platform.
def _get_gl_opts():
libs = {
'posix': ['GL', 'GLEW'],
'nt': ['gdi32', 'glew32s', 'opengl32', 'user32'],
}
return ['-l' + x for x in libs[os.name]]
# Load the cpp plugin.
def _get_plugin():
fn = os.path.join(os.path.dirname(__file__), 'tf_all.cu')
return plugin_loader.get_plugin(fn, extra_nvcc_options=_get_gl_opts() + ['-DNVDR_TENSORFLOW'])
# Convert parameter to a numpy array if possible.
def _get_constant(x, dtype):
try:
return np.asarray(x, dtype=dtype)
except (TypeError, ValueError):
return None
# Tests for construction-time constantness instead of a tf.constant node because
# the latter can be overridden in Session.run() feed_dict at evaluation time.
def _is_constant(x, dtype):
if isinstance(x, np.ndarray):
return np.can_cast(x.dtype, dtype, 'unsafe')
else:
return _get_constant(x, dtype) is not None
#----------------------------------------------------------------------------
# Rasterize.
#----------------------------------------------------------------------------
def rasterize(pos, tri, resolution, ranges=None, tri_const=False, output_db=True, grad_db=True):
assert tri_const is True or tri_const is False
assert output_db is True or output_db is False
# Known constant resolution?
resolution_c = _get_constant(resolution, np.int32)
# Known constant triangles?
tri_const = tri_const or _is_constant(tri, np.int32)
# Convert all inputs to tensors / base types.
tri_const = 1 if tri_const else 0
tri = tf.convert_to_tensor(tri, dtype=tf.int32)
pos = tf.convert_to_tensor(pos, dtype=tf.float32)
resolution = tf.convert_to_tensor(resolution, dtype=tf.int32)
if ranges is None:
ranges = tf.convert_to_tensor(np.zeros(shape=[0, 2], dtype=np.int32)) # Empty tensor.
else:
ranges = tf.convert_to_tensor(ranges, dtype=tf.int32) # Convert input to tensor.
# Infer as much about the output shape as possible.
out_shape = [None, None, None, 4]
if pos.shape.rank == 3: # Instanced mode.
out_shape[0] = pos.shape[0].value
elif pos.shape.rank == 2: # Range mode.
if ranges.shape.rank not in [None, 0]:
out_shape[0] = ranges.shape[0].value
if resolution_c is not None:
assert resolution_c.shape == (2,)
out_shape[1], out_shape[2] = resolution_c
# Output pixel differentials.
@tf.custom_gradient
def func_db(pos):
out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 1, tri_const)
out.set_shape(out_shape)
out_db.set_shape(out_shape)
def grad(dy, ddb):
if grad_db:
return _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb)
else:
return _get_plugin().rasterize_grad(pos, tri, out, dy)
return (out, out_db), grad
# Do not output pixel differentials.
@tf.custom_gradient
def func(pos):
out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 0, tri_const)
out.set_shape(out_shape)
out_db.set_shape(out_shape[:-1] + [0]) # Zero channels in out_db.
def grad(dy, _):
return _get_plugin().rasterize_grad(pos, tri, out, dy)
return (out, out_db), grad
# Choose stub.
if output_db:
return func_db(pos)
else:
return func(pos)
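# Illustrative call (hypothetical shapes), instanced mode with a batch of 4 meshes:
#   pos: [4, num_vertices, 4] clip-space positions, tri: [num_triangles, 3] int32 indices.
#     rast, rast_db = rasterize(pos, tri, resolution=[256, 256])
#   Both outputs have shape [4, 256, 256, 4]; with output_db=False, rast_db has zero channels.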
#----------------------------------------------------------------------------
# Interpolate.
#----------------------------------------------------------------------------
def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None):
# Sanitize the list of pixel differential attributes.
if diff_attrs is None:
diff_attrs = []
elif diff_attrs != 'all':
diff_attrs = _get_constant(diff_attrs, np.int32)
assert (diff_attrs is not None) and len(diff_attrs.shape) == 1
diff_attrs = diff_attrs.tolist()
# Convert all inputs to tensors.
attr = tf.convert_to_tensor(attr, dtype=tf.float32)
rast = tf.convert_to_tensor(rast, dtype=tf.float32)
tri = tf.convert_to_tensor(tri, dtype=tf.int32)
if diff_attrs:
rast_db = tf.convert_to_tensor(rast_db, dtype=tf.float32)
# Infer output shape.
out_shape = [None, None, None, None]
if rast.shape.rank is not None:
out_shape = [rast.shape[0].value, rast.shape[1].value, rast.shape[2].value, None]
if attr.shape.rank in [2, 3]:
out_shape[3] = attr.shape[-1].value
# Output pixel differentials for at least some attributes.
@tf.custom_gradient
def func_da(attr, rast, rast_db):
diff_attrs_all = int(diff_attrs == 'all')
diff_attrs_list = [] if diff_attrs_all else diff_attrs
out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list)
# Infer number of channels in out_da.
if not diff_attrs_all:
da_channels = 2 * len(diff_attrs)
if (attr.shape.rank in [2, 3]) and (attr.shape[-1].value is not None):
da_channels = 2 * attr.shape[-1].value
else:
da_channels = None
# Set output shapes.
out.set_shape(out_shape)
out_da.set_shape([out_shape[0], out_shape[1], out_shape[2], da_channels])
def grad(dy, dda):
return _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list)
return (out, out_da), grad
# No pixel differentials for any attribute.
@tf.custom_gradient
def func(attr, rast):
out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri)
out.set_shape(out_shape)
out_da.set_shape(out_shape[:-1] + [0]) # Zero channels in out_da.
def grad(dy, _):
return _get_plugin().interpolate_grad(attr, rast, tri, dy)
return (out, out_da), grad
# Choose stub.
if diff_attrs:
return func_da(attr, rast, rast_db)
else:
return func(attr, rast)
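# Illustrative call (hypothetical shapes): interpolate per-vertex colors over the rasterizer output,
#   attr: [minibatch, num_vertices, 3], rast and tri from rasterize().
#     color, _ = interpolate(attr, rast, tri)
#   Pass rast_db and diff_attrs='all' (or a list of attribute indices) to also get pixel
#   differentials of the selected attributes in the second output.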
#----------------------------------------------------------------------------
# Texture.
#----------------------------------------------------------------------------
def texture(tex, uv, uv_da=None, filter_mode='auto', boundary_mode='wrap', tex_const=False, max_mip_level=None):
assert tex_const is True or tex_const is False
# Default filter mode.
if filter_mode == 'auto':
filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear'
# Known constant texture?
tex_const = tex_const or _is_constant(tex, np.float32)
# Sanitize inputs.
tex_const = 1 if tex_const else 0
if max_mip_level is None:
max_mip_level = -1
else:
max_mip_level = int(max_mip_level)
assert max_mip_level >= 0
# Convert inputs to tensors.
tex = tf.convert_to_tensor(tex, dtype=tf.float32)
uv = tf.convert_to_tensor(uv, dtype=tf.float32)
if 'mipmap' in filter_mode:
uv_da = tf.convert_to_tensor(uv_da, dtype=tf.float32)
# Infer output shape.
out_shape = [None, None, None, None]
if uv.shape.rank is not None:
assert uv.shape.rank == 4
out_shape = [uv.shape[0].value, uv.shape[1].value, uv.shape[2].value, None]
if tex.shape.rank is not None:
assert tex.shape.rank == (5 if boundary_mode == 'cube' else 4)
out_shape[-1] = tex.shape[-1].value
# If mipmapping is disabled via max_mip_level=0, we may as well use simpler filtering internally.
if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']:
filter_mode = 'linear'
# Convert filter mode to internal enumeration.
filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3}
filter_mode_enum = filter_mode_dict[filter_mode]
# Convert boundary mode to internal enumeration.
boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3}
boundary_mode_enum = boundary_mode_dict[boundary_mode]
# Linear-mipmap-linear: Mipmaps enabled, all gradients active.
@tf.custom_gradient
def func_linear_mipmap_linear(tex, uv, uv_da):
out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level)
out.set_shape(out_shape)
def grad(dy):
return _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level)
return out, grad
# Linear-mipmap-nearest: Mipmaps enabled, no gradients to uv_da.
@tf.custom_gradient
def func_linear_mipmap_nearest(tex, uv):
out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level)
out.set_shape(out_shape)
def grad(dy):
return _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level)
return out, grad
# Linear: Mipmaps disabled, no uv_da, no gradients to uv_da.
@tf.custom_gradient
def func_linear(tex, uv):
out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum)
out.set_shape(out_shape)
def grad(dy):
return _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum)
return out, grad
# Nearest: Mipmaps disabled, no uv_da, no gradients to uv_da or uv.
@tf.custom_gradient
def func_nearest(tex):
out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum)
out.set_shape(out_shape)
def grad(dy):
return _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum)
return out, grad
# Choose stub.
if filter_mode == 'linear-mipmap-linear':
return func_linear_mipmap_linear(tex, uv, uv_da)
elif filter_mode == 'linear-mipmap-nearest':
return func_linear_mipmap_nearest(tex, uv)
elif filter_mode == 'linear':
return func_linear(tex, uv)
elif filter_mode == 'nearest':
return func_nearest(tex)
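# Illustrative call (hypothetical shapes): trilinear sampling of a 2D texture,
#   tex: [minibatch, tex_height, tex_width, channels], uv: [minibatch, height, width, 2],
#   uv_da: [minibatch, height, width, 4] pixel differentials from interpolate().
#     color = texture(tex, uv, uv_da)  # filter_mode='auto' picks 'linear-mipmap-linear' here
#   Without uv_da, filter_mode='auto' falls back to non-mipmapped 'linear' filtering.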
#----------------------------------------------------------------------------
# Antialias.
#----------------------------------------------------------------------------
def antialias(color, rast, pos, tri, tri_const=False, pos_gradient_boost=1.0):
assert tri_const is True or tri_const is False
# Known constant triangles?
tri_const = tri_const or _is_constant(tri, np.int32)
# Convert inputs to tensors.
color = tf.convert_to_tensor(color, dtype=tf.float32)
rast = tf.convert_to_tensor(rast, dtype=tf.float32)
pos = tf.convert_to_tensor(pos, dtype=tf.float32)
tri = tf.convert_to_tensor(tri, dtype=tf.int32)
# Sanitize inputs.
tri_const = 1 if tri_const else 0
@tf.custom_gradient
def func(color, pos):
color_out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, tri_const)
color_out.set_shape(color.shape)
def grad(dy):
grad_color, grad_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer)
if pos_gradient_boost != 1.0:
grad_pos = grad_pos * pos_gradient_boost
return grad_color, grad_pos
return color_out, grad
return func(color, pos)
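# Illustrative call: color and rast from the forward pipeline, pos and tri as passed to rasterize().
#     color_aa = antialias(color, rast, pos, tri)
#   pos_gradient_boost only scales the position gradient returned by the backward pass.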
#----------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import glob
import os
import re
import uuid
import hashlib
import tempfile
import shutil
import tensorflow as tf
from tensorflow.python.client import device_lib # pylint: disable=no-name-in-module
#----------------------------------------------------------------------------
# Global options.
_nvdiffrast_cache_dir = None
def set_cache_dir(path: str) -> None:
'''Set CUDA kernel compilation temp dir.
If `set_cache_dir` is not called, the cache directory will default to
one of the below:
- Value of NVDIFFRAST_CACHE_DIR env var, if set
- $HOME/.cache/nvdiffrast if HOME env var is set
- $USERPROFILE/.cache/nvdiffrast if USERPROFILE is set
- A .cache/nvdiffrast directory under the system temp dir otherwise
Args:
path: Where to save CUDA kernel build temporaries
'''
global _nvdiffrast_cache_dir
_nvdiffrast_cache_dir = path
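# Example (hypothetical path): call before the first op is used so the compiled plugin lands there.
#   set_cache_dir('/raid/tmp/nvdiffrast_cache')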
def make_cache_dir_path(*paths: str) -> str:
if _nvdiffrast_cache_dir is not None:
return os.path.join(_nvdiffrast_cache_dir, *paths)
if 'NVDIFFRAST_CACHE_DIR' in os.environ:
return os.path.join(os.environ['NVDIFFRAST_CACHE_DIR'], *paths)
if 'HOME' in os.environ:
return os.path.join(os.environ['HOME'], '.cache', 'nvdiffrast', *paths)
if 'USERPROFILE' in os.environ:
return os.path.join(os.environ['USERPROFILE'], '.cache', 'nvdiffrast', *paths)
return os.path.join(tempfile.gettempdir(), '.cache', 'nvdiffrast', *paths)
cuda_cache_version_tag = 'v1'
do_not_hash_included_headers = False # Speed up compilation by assuming that headers included by the CUDA code never change. Unsafe!
verbose = True # Print status messages to stdout.
#----------------------------------------------------------------------------
# Internal helper funcs.
def _find_compiler_bindir():
hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
if hostx64_paths != []:
return hostx64_paths[0]
vc_bin_dir = 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/vc/bin'
if os.path.isdir(vc_bin_dir):
return vc_bin_dir
return None
def _get_compute_cap(device):
caps_str = device.physical_device_desc
m = re.search('compute capability: (\\d+)\\.(\\d+)', caps_str)
major = m.group(1)
minor = m.group(2)
return (major, minor)
def _get_cuda_gpu_arch_string():
gpus = [x for x in device_lib.list_local_devices() if x.device_type == 'GPU']
if len(gpus) == 0:
raise RuntimeError('No GPU devices found')
(major, minor) = _get_compute_cap(gpus[0])
return 'sm_%s%s' % (major, minor)
def _run_cmd(cmd):
with os.popen(cmd) as pipe:
output = pipe.read()
status = pipe.close()
if status is not None:
raise RuntimeError('NVCC returned an error. See below for full command line and output log:\n\n%s\n\n%s' % (cmd, output))
def _prepare_nvcc_cli(opts):
cmd = 'nvcc ' + opts.strip()
cmd += ' --disable-warnings'
cmd += ' --include-path "%s"' % tf.sysconfig.get_include()
cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'protobuf_archive', 'src')
cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'com_google_absl')
cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'eigen_archive')
compiler_bindir = _find_compiler_bindir()
if compiler_bindir is None:
# Require that _find_compiler_bindir succeeds on Windows. Allow
# nvcc to use whatever is the default on Linux.
if os.name == 'nt':
raise RuntimeError('Could not find MSVC/GCC/CLANG installation on this computer. Check the search paths in _find_compiler_bindir() in "%s".' % __file__)
else:
cmd += ' --compiler-bindir "%s"' % compiler_bindir
cmd += ' 2>&1'
return cmd
#----------------------------------------------------------------------------
# Main entry point.
_plugin_cache = dict()
def get_plugin(cuda_file, extra_nvcc_options=[]):
cuda_file_base = os.path.basename(cuda_file)
cuda_file_name, cuda_file_ext = os.path.splitext(cuda_file_base)
# Already in cache?
if cuda_file in _plugin_cache:
return _plugin_cache[cuda_file]
# Setup plugin.
if verbose:
print('Setting up TensorFlow plugin "%s": ' % cuda_file_base, end='', flush=True)
try:
# Hash CUDA source.
md5 = hashlib.md5()
with open(cuda_file, 'rb') as f:
md5.update(f.read())
md5.update(b'\n')
# Hash headers included by the CUDA code by running it through the preprocessor.
if not do_not_hash_included_headers:
if verbose:
print('Preprocessing... ', end='', flush=True)
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + cuda_file_ext)
_run_cmd(_prepare_nvcc_cli('"%s" --preprocess -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir)))
with open(tmp_file, 'rb') as f:
bad_file_str = ('"' + cuda_file.replace('\\', '/') + '"').encode('utf-8') # __FILE__ in error check macros
good_file_str = ('"' + cuda_file_base + '"').encode('utf-8')
for ln in f:
if not ln.startswith(b'# ') and not ln.startswith(b'#line '): # ignore line number pragmas
ln = ln.replace(bad_file_str, good_file_str)
md5.update(ln)
md5.update(b'\n')
# Select compiler options.
compile_opts = ''
if os.name == 'nt':
compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.lib')
compile_opts += ' --library-path="%s"' % (os.path.dirname(__file__) + r"\..\lib") # Find glew32s.lib during compilation.
elif os.name == 'posix':
compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.so')
compile_opts += ' --compiler-options \'-fPIC -D_GLIBCXX_USE_CXX11_ABI=0\''
else:
assert False # not Windows or Linux, w00t?
compile_opts += ' --gpu-architecture=%s' % _get_cuda_gpu_arch_string()
compile_opts += ' --use_fast_math'
for opt in extra_nvcc_options:
compile_opts += ' ' + opt
nvcc_cmd = _prepare_nvcc_cli(compile_opts)
# Hash build configuration.
md5.update(('nvcc_cmd: ' + nvcc_cmd).encode('utf-8') + b'\n')
md5.update(('tf.VERSION: ' + tf.VERSION).encode('utf-8') + b'\n')
md5.update(('cuda_cache_version_tag: ' + cuda_cache_version_tag).encode('utf-8') + b'\n')
# Compile if not already compiled.
bin_file_ext = '.dll' if os.name == 'nt' else '.so'
cuda_cache_path = make_cache_dir_path()
bin_file = os.path.join(make_cache_dir_path(), cuda_file_name + '_' + md5.hexdigest() + bin_file_ext)
if not os.path.isfile(bin_file):
if verbose:
print('Compiling... ', end='', flush=True)
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + bin_file_ext)
_run_cmd(nvcc_cmd + ' "%s" --shared -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir))
os.makedirs(cuda_cache_path, exist_ok=True)
intermediate_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + uuid.uuid4().hex + '_tmp' + bin_file_ext)
shutil.copyfile(tmp_file, intermediate_file)
os.rename(intermediate_file, bin_file) # atomic
# Load.
if verbose:
print('Loading... ', end='', flush=True)
plugin = tf.load_op_library(bin_file)
# Add to cache.
_plugin_cache[cuda_file] = plugin
if verbose:
print('Done.', flush=True)
return plugin
except:
if verbose:
print('Failed!', flush=True)
raise
#----------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
// TF-specific helpers.
#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal("Cuda error: ", cudaGetErrorName(err), "[", #CUDA_CALL, ";]")); } while (0)
#define OP_CHECK_GL_ERROR(CTX, GL_CALL) do { GL_CALL; GLenum err = glGetError(); OP_REQUIRES(CTX, err == GL_NO_ERROR, errors::Internal("OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]")); } while (0)
// Cuda kernels and CPP all together. What an absolute compilation unit.
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#include "../common/framework.h"
#include "../common/common.h"
#include "../common/common.cpp"
#include "../common/rasterize.h"
#include "../common/rasterize.cpp"
#include "../common/rasterize.cu"
#include "tf_rasterize.cu"
#include "../common/interpolate.cu"
#include "tf_interpolate.cu"
#include "../common/texture.cpp"
#include "../common/texture.cu"
#include "tf_texture.cu"
#include "../common/antialias.cu"
#include "tf_antialias.cu"
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// Forward TensorFlow op.
struct AntialiasFwdOp : public OpKernel
{
AntialiasKernelParams m_attribs;
AntialiasFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_attribs.tri_const));
}
void Compute(OpKernelContext* ctx)
{
AntialiasKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Get input.
const Tensor& color = ctx->input(0);
const Tensor& rasterOut = ctx->input(1);
const Tensor& pos = ctx->input(2);
const Tensor& tri = ctx->input(3);
// Instance rendering mode?
p.instance_mode = pos.dims() > 2;
// Extract input dimensions.
if (p.instance_mode)
p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0;
else
p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0;
p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
p.n = (color.dims() > 0) ? color.dim_size(0) : 0;
p.height = (color.dims() > 1) ? color.dim_size(1) : 0;
p.width = (color.dims() > 2) ? color.dim_size(2) : 0;
p.channels = (color.dims() > 3) ? color.dim_size(3) : 0;
// Sanity checks.
OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]"));
OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions"));
if (p.instance_mode)
{
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos"));
}
else
{
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out"));
}
// Get input pointers.
p.color = color.flat<float>().data();
p.rasterOut = rasterOut.flat<float>().data();
p.tri = tri.flat<int>().data();
p.pos = pos.flat<float>().data();
// Misc parameters.
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
// Allocate output tensor.
Tensor* outputTensor = NULL;
TensorShape outputShape;
outputShape.AddDim(p.n);
outputShape.AddDim(p.height);
outputShape.AddDim(p.width);
outputShape.AddDim(p.channels);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, outputShape, &outputTensor));
p.output = outputTensor->flat<float>().data();
// Allocate work buffer. One extra int4 for storing counters.
Tensor* workTensor = NULL;
TensorShape workShape;
workShape.AddDim(p.n * p.width * p.height * 8 + 4); // 8 ints for a maximum of two work items per pixel.
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, workShape, &workTensor));
p.workBuffer = (int4*)(workTensor->flat<int>().data());
// Clear the work counters.
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream));
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.rasterOut & 7), errors::Internal("raster_out input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4"));
// Kernel parameters.
void* args[] = {&p};
// (Re-)calculate opposite vertex hash.
if (!p.evHash || !p.tri_const)
{
if (p.allocTriangles < p.numTriangles)
{
p.allocTriangles = max(p.allocTriangles, 64);
while (p.allocTriangles < p.numTriangles)
p.allocTriangles <<= 1; // Must be power of two.
// (Re-)allocate memory for the hash.
OP_CHECK_CUDA_ERROR(ctx, cudaFree(p.evHash));
OP_CHECK_CUDA_ERROR(ctx, cudaMalloc(&p.evHash, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * sizeof(uint4)));
LOG(INFO) << "Increasing topology hash size to accommodate " << p.allocTriangles << " triangles";
}
// Clear the hash and launch the mesh kernel to populate it.
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.evHash, 0, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * sizeof(uint4), stream));
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
}
// Copy input to output as a baseline.
OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.output, p.color, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream));
// Choose launch parameters for the discontinuity finder kernel and launch.
dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n);
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream));
// Determine optimum block size for the persistent analysis kernel.
int device = 0;
int numCTA = 0;
int numSM = 0;
OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device));
OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0));
OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
// Launch analysis kernel.
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
}
};
REGISTER_OP("AntialiasFwd")
.Input ("color: float")
.Input ("raster_out: float")
.Input ("pos: float")
.Input ("tri: int32")
.Output ("output: float")
.Output ("work_buffer: int32")
.Attr ("tri_const: int");
REGISTER_KERNEL_BUILDER(Name("AntialiasFwd").Device(DEVICE_GPU), AntialiasFwdOp);
//------------------------------------------------------------------------
// Gradient TensorFlow op.
struct AntialiasGradOp : public OpKernel
{
AntialiasKernelParams m_attribs;
AntialiasGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
}
void Compute(OpKernelContext* ctx)
{
AntialiasKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Get input.
const Tensor& color = ctx->input(0);
const Tensor& rasterOut = ctx->input(1);
const Tensor& pos = ctx->input(2);
const Tensor& tri = ctx->input(3);
const Tensor& dy = ctx->input(4);
const Tensor& workBuffer = ctx->input(5);
// Instance rendering mode?
p.instance_mode = pos.dims() > 2;
// Extract input dimensions.
if (p.instance_mode)
p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0;
else
p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0;
p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
p.n = (color.dims() > 0) ? color.dim_size(0) : 0;
p.height = (color.dims() > 1) ? color.dim_size(1) : 0;
p.width = (color.dims() > 2) ? color.dim_size(2) : 0;
p.channels = (color.dims() > 3) ? color.dim_size(3) : 0;
// Sanity checks.
OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) > 0 && dy.dim_size(2) > 0 && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape[>0, >0, >0, >0]"));
OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]"));
OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions"));
OP_REQUIRES(ctx, color.dim_size(1) == dy.dim_size(1) && color.dim_size(2) == dy.dim_size(2) && color.dim_size(3) == dy.dim_size(3), errors::InvalidArgument("color and dy inputs must have same dimensions"));
if (p.instance_mode)
{
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos"));
OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out, pos"));
}
else
{
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out"));
OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out"));
}
// Get input pointers.
p.dy = dy.flat<float>().data();
p.color = color.flat<float>().data();
p.rasterOut = rasterOut.flat<float>().data();
p.tri = tri.flat<int>().data();
p.pos = pos.flat<float>().data();
p.workBuffer = (int4*)(workBuffer.flat<int>().data());
// Misc parameters.
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
// Allocate color gradient output tensor.
Tensor* gradColor = NULL;
TensorShape gradColorShape;
gradColorShape.AddDim(p.n);
gradColorShape.AddDim(p.height);
gradColorShape.AddDim(p.width);
gradColorShape.AddDim(p.channels);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, gradColorShape, &gradColor));
p.gradColor = gradColor->flat<float>().data();
// Allocate position gradient output tensor.
Tensor* gradPos = NULL;
TensorShape gradPosShape;
if (p.instance_mode)
gradPosShape.AddDim(p.n);
gradPosShape.AddDim(p.numVertices);
gradPosShape.AddDim(4);
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, gradPosShape, &gradPos));
p.gradPos = gradPos->flat<float>().data();
// Initialize the work counter, seed color gradients with dy, and clear position gradients.
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream)); // Gradient kernel work counter.
OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.gradColor, p.dy, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream));
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradPos, 0, (p.instance_mode ? p.n : 1) * p.numVertices * 4 * sizeof(float), stream));
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4"));
// Launch the gradient kernel.
void* args[] = {&p};
int device = 0;
int numCTA = 0;
int numSM = 0;
OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device));
OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0));
OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
}
};
REGISTER_OP("AntialiasGrad")
.Input ("color: float")
.Input ("raster_out: float")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("dy: float")
.Input ("work_buffer: int32")
.Output ("grad_color: float")
.Output ("grad_pos: float");
REGISTER_KERNEL_BUILDER(Name("AntialiasGrad").Device(DEVICE_GPU), AntialiasGradOp);
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// Common op attribute parser.
static __host__ void interpolateParseOpAttributes(OpKernelConstruction* ctx, InterpolateKernelParams& p, bool enableDA)
{
if (enableDA)
{
OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs_all", &p.diff_attrs_all));
if (!p.diff_attrs_all)
{
std::vector<int> diff_attrs_vec;
OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs", &diff_attrs_vec));
OP_REQUIRES(ctx, diff_attrs_vec.size() > 0, errors::InvalidArgument("differentiation enabled with empty diff_attrs list"));
OP_REQUIRES(ctx, diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, errors::InvalidArgument("too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)"));
p.numDiffAttr = diff_attrs_vec.size();
memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int));
}
}
}
//------------------------------------------------------------------------
// Forward TensorFlow op.
template <bool ENABLE_DA>
struct InterpolateFwdOp : public OpKernel
{
InterpolateKernelParams m_attribs;
InterpolateFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA);
}
void Compute(OpKernelContext* ctx)
{
InterpolateKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Get input.
const Tensor& attr = ctx->input(0);
const Tensor& rast = ctx->input(1);
const Tensor& tri = ctx->input(2);
const Tensor& rast_db = ctx->input(ENABLE_DA ? 3 : 2);
// Instance rendering mode?
p.instance_mode = attr.dims() > 2;
// Extract input dimensions.
if (p.instance_mode)
{
p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0;
p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0;
}
else
{
p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0;
p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0;
}
p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0;
p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0;
p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0;
// Sanity checks.
OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]"));
if (p.instance_mode)
OP_REQUIRES(ctx, attr.dim_size(0) == p.depth || attr.dim_size(0) == 1, errors::InvalidArgument("minibatch size mismatch between inputs rast, attr"));
if (ENABLE_DA)
{
OP_REQUIRES(ctx, rast_db.dims() == 4 && rast_db.dim_size(0) > 0 && rast_db.dim_size(1) > 0 && rast_db.dim_size(2) > 0 && rast_db.dim_size(3) == 4, errors::InvalidArgument("rast_db must have shape[>0, >0, >0, 4]"));
OP_REQUIRES(ctx, rast_db.dim_size(1) == rast.dim_size(1) && rast_db.dim_size(2) == rast.dim_size(2), errors::InvalidArgument("spatial size mismatch between inputs rast and rast_db"));
OP_REQUIRES(ctx, rast_db.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, rast_db"));
}
// All diff attrs mode.
if (p.diff_attrs_all)
p.numDiffAttr = p.numAttr;
// Get input pointers.
p.attr = attr.flat<float>().data();
p.rast = rast.flat<float>().data();
p.tri = tri.flat<int>().data();
p.attrBC = (p.instance_mode && attr.dim_size(0) == 1) ? 1 : 0;
p.rastDB = ENABLE_DA ? rast_db.flat<float>().data() : 0;
// Allocate main output tensor.
Tensor* out_tensor = NULL;
TensorShape out_shape;
out_shape.AddDim(p.depth);
out_shape.AddDim(p.height);
out_shape.AddDim(p.width);
out_shape.AddDim(p.numAttr);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor));
p.out = out_tensor->flat<float>().data();
// Allocate pixel differential output tensor.
Tensor* out_da_tensor = NULL;
out_shape.set_dim(3, p.numDiffAttr * 2);
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, out_shape, &out_da_tensor));
p.outDA = ENABLE_DA ? out_da_tensor->flat<float>().data() : 0;
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4"));
if (ENABLE_DA)
OP_REQUIRES(ctx, !((uintptr_t)p.outDA & 7), errors::Internal("out_da output tensor not aligned to float2"));
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
// Launch CUDA kernel.
void* args[] = {&p};
void* func = ENABLE_DA ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel;
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
}
};
REGISTER_OP("InterpolateFwd")
.Input ("attr: float")
.Input ("rast: float")
.Input ("tri: int32")
.Output ("out: float")
.Output ("out_da: float");
REGISTER_OP("InterpolateFwdDa")
.Input ("attr: float")
.Input ("rast: float")
.Input ("tri: int32")
.Input ("rast_db: float")
.Output ("out: float")
.Output ("out_da: float")
.Attr ("diff_attrs_all: int")
.Attr ("diff_attrs: list(int)");
REGISTER_KERNEL_BUILDER(Name("InterpolateFwd") .Device(DEVICE_GPU), InterpolateFwdOp<false>);
REGISTER_KERNEL_BUILDER(Name("InterpolateFwdDa").Device(DEVICE_GPU), InterpolateFwdOp<true>);
//------------------------------------------------------------------------
// Gradient TensorFlow op.
template <bool ENABLE_DA>
struct InterpolateGradOp : public OpKernel
{
InterpolateKernelParams m_attribs;
InterpolateGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA);
}
void Compute(OpKernelContext* ctx)
{
InterpolateKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Get input.
const Tensor& attr = ctx->input(0);
const Tensor& rast = ctx->input(1);
const Tensor& tri = ctx->input(2);
const Tensor& dy = ctx->input(3);
const Tensor& rast_db = ctx->input(ENABLE_DA ? 4 : 3);
const Tensor& dda = ctx->input(ENABLE_DA ? 5 : 3);
// Instance rendering mode?
p.instance_mode = attr.dims() > 2;
// Extract input dimensions.
if (p.instance_mode)
{
p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0;
p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0;
}
else
{
p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0;
p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0;
}
p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0;
p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0;
p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0;
int attr_depth = p.instance_mode ? (attr.dims() > 1 ? attr.dim_size(0) : 0) : 1;
// Sanity checks.
OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]"));
OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape [>0, height, width, >0]"));
OP_REQUIRES(ctx, dy.dim_size(3) == p.numAttr, errors::InvalidArgument("argument count mismatch between inputs dy, attr"));
OP_REQUIRES(ctx, (attr_depth == p.depth || attr_depth == 1) && dy.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, dy, attr"));
if (ENABLE_DA)
{
OP_REQUIRES(ctx, dda.dims() == 4 && dda.dim_size(0) > 0 && dda.dim_size(1) == p.height && dda.dim_size(2) == p.width, errors::InvalidArgument("dda must have shape [>0, height, width, ?]"));
OP_REQUIRES(ctx, dda.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between rast, dda"));
}
// All diff attrs mode.
if (p.diff_attrs_all)
p.numDiffAttr = p.numAttr;
// Get input pointers.
p.attr = attr.flat<float>().data();
p.rast = rast.flat<float>().data();
p.tri = tri.flat<int>().data();
p.dy = dy.flat<float>().data();
p.rastDB = ENABLE_DA ? rast_db.flat<float>().data() : 0;
p.dda = ENABLE_DA ? dda.flat<float>().data() : 0;
p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0;
// Allocate attribute gradient output tensor.
Tensor* grad_attr_tensor = NULL;
TensorShape grad_attr_shape;
if (p.instance_mode)
grad_attr_shape.AddDim(attr_depth);
grad_attr_shape.AddDim(p.numVertices);
grad_attr_shape.AddDim(p.numAttr);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_attr_shape, &grad_attr_tensor));
p.gradAttr = grad_attr_tensor->flat<float>().data();
// Allocate bary gradient output tensor.
Tensor* grad_rast_tensor = NULL;
TensorShape grad_rast_shape;
grad_rast_shape.AddDim(p.depth);
grad_rast_shape.AddDim(p.height);
grad_rast_shape.AddDim(p.width);
grad_rast_shape.AddDim(4);
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_rast_shape, &grad_rast_tensor));
p.gradRaster = grad_rast_tensor->flat<float>().data();
// Allocate bary pixel diff gradient output tensor.
if (ENABLE_DA)
{
Tensor* grad_rast_db_tensor = NULL;
OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_rast_shape, &grad_rast_db_tensor));
p.gradRasterDB = grad_rast_db_tensor->flat<float>().data();
}
// Clear attribute gradients.
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradAttr, 0, attr_depth * p.numVertices * p.numAttr * sizeof(float), stream));
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradRaster & 15), errors::Internal("grad_rast output tensor not aligned to float4"));
if (ENABLE_DA)
{
OP_REQUIRES(ctx, !((uintptr_t)p.dda & 7), errors::Internal("dda input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradRasterDB & 15), errors::Internal("grad_rast_db output tensor not aligned to float4"));
}
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
// Launch CUDA kernel.
void* args[] = {&p};
void* func = ENABLE_DA ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel;
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
}
};
REGISTER_OP("InterpolateGrad")
.Input ("attr: float")
.Input ("rast: float")
.Input ("tri: int32")
.Input ("dy: float")
.Output ("grad_attr: float")
.Output ("grad_rast: float")
;
REGISTER_OP("InterpolateGradDa")
.Input ("attr: float")
.Input ("rast: float")
.Input ("tri: int32")
.Input ("dy: float")
.Input ("rast_db: float")
.Input ("dda: float")
.Output ("grad_attr: float")
.Output ("grad_rast: float")
.Output ("grad_rast_db: float")
.Attr ("diff_attrs_all: int")
.Attr ("diff_attrs: list(int)");
;
REGISTER_KERNEL_BUILDER(Name("InterpolateGrad") .Device(DEVICE_GPU), InterpolateGradOp<false>);
REGISTER_KERNEL_BUILDER(Name("InterpolateGradDa").Device(DEVICE_GPU), InterpolateGradOp<true>);
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// Forward TensorFlow op.
struct RasterizeFwdOp : public OpKernel
{
RasterizeGLState m_glState; // OpenGL-related persistent state.
int m_tri_const; // 1 if triangle array is known to be constant.
RasterizeFwdOp(OpKernelConstruction* ctx):
OpKernel(ctx)
{
memset(&m_glState, 0, sizeof(RasterizeGLState));
OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB));
OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const));
}
void Compute(OpKernelContext* ctx)
{
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Check that input shapes are correct.
const Tensor& pos = ctx->input(0);
const Tensor& tri = ctx->input(1);
const Tensor& resolution = ctx->input(2);
const Tensor& ranges = ctx->input(3);
// Determine number of outputs
int num_outputs = m_glState.enableDB ? 2 : 1;
// Determine instance mode and check input dimensions.
bool instance_mode = pos.dims() > 2;
if (instance_mode)
{
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("instance mode - pos must have shape [>0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
}
else
{
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
}
// Get output shape.
const int32_t* res_in = resolution.flat<int32_t>().data(); // This is in CPU memory.
int height = res_in[0];
int width = res_in[1];
int depth = instance_mode ? pos.dim_size(0) : ranges.dim_size(0);
OP_REQUIRES(ctx, height > 0 && width > 0, errors::InvalidArgument("resolution must be [>0, >0]"));
// Get position and triangle buffer sizes in int32/float32.
int posCount = 4 * pos.dim_size(0) * (instance_mode ? pos.dim_size(1) : 1);
int triCount = 3 * tri.dim_size(0);
// Init context and GL?
bool initCtx = !m_glState.glFBO;
if (initCtx)
rasterizeInitGLContext(ctx, m_glState); // In common/rasterize.inl
else
setGLContext(m_glState.glctx); // (Re-)Activate GL context.
// Resize all buffers.
rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.inl
// Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround.
if (initCtx)
{
// On first execution, do a bonus context swap.
releaseGLContext();
setGLContext(m_glState.glctx);
}
// Copy input data to GL and render.
const float* posPtr = pos.flat<float>().data();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat<int32_t>().data(); // This is in CPU memory.
const int32_t* triPtr = (initCtx || !m_tri_const) ? tri.flat<int32_t>().data() : NULL; // Copy triangles only if needed.
int vtxPerInstance = instance_mode ? pos.dim_size(1) : 0;
rasterizeRender(ctx, m_glState, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth);
// Allocate output tensors.
TensorShape output_shape;
output_shape.AddDim(depth);
output_shape.AddDim(height);
output_shape.AddDim(width);
output_shape.AddDim(4);
float* outputPtr[2] = { NULL, NULL }; // Second entry stays NULL when the out_db output is disabled.
for (int i=0; i < 2; i++)
{
if (i >= num_outputs)
output_shape.set_dim(3, 0); // Zero channels for unwanted out_db tensor.
Tensor* output_tensor = NULL;
OP_REQUIRES_OK(ctx, ctx->allocate_output(i, output_shape, &output_tensor));
if (i < num_outputs)
outputPtr[i] = output_tensor->flat<float>().data();
}
// Copy rasterized results into CUDA buffers.
rasterizeCopyResults(ctx, m_glState, stream, outputPtr, width, height, depth);
// Done. Release GL context.
releaseGLContext();
}
};
REGISTER_OP("RasterizeFwd")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("resolution: int32")
.Input ("ranges: int32")
.Output ("out: float")
.Output ("out_db: float")
.Attr ("enable_db: int")
.Attr ("tri_const: int");
REGISTER_KERNEL_BUILDER(Name("RasterizeFwd").Device(DEVICE_GPU).HostMemory("resolution").HostMemory("ranges"), RasterizeFwdOp);
//------------------------------------------------------------------------
// Gradient TensorFlow op.
template <bool ENABLE_DB>
struct RasterizeGradOp : public OpKernel
{
RasterizeGradParams m_attribs;
RasterizeGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
}
void Compute(OpKernelContext* ctx)
{
RasterizeGradParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Input tensors.
const Tensor& pos = ctx->input(0);
const Tensor& tri = ctx->input(1);
const Tensor& out = ctx->input(2);
const Tensor& dy = ctx->input(3);
const Tensor& ddb = ctx->input(ENABLE_DB ? 4 : 3);
// Determine instance mode.
p.instance_mode = (pos.dims() > 2) ? 1 : 0;
// Shape is taken from the rasterizer output tensor.
OP_REQUIRES(ctx, out.dims() == 4, errors::InvalidArgument("out must be rank-4"));
p.depth = out.dim_size(0);
p.height = out.dim_size(1);
p.width = out.dim_size(2);
OP_REQUIRES(ctx, p.depth > 0 && p.height > 0 && p.width > 0, errors::InvalidArgument("resolution must be [>0, >0, >0]"));
// Check other shapes.
if (p.instance_mode)
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) == p.depth && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [depth, >0, 4]"));
else
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, out.dims() == 4 && out.dim_size(0) == p.depth && out.dim_size(1) == p.height && out.dim_size(2) == p.width && out.dim_size(3) == 4, errors::InvalidArgument("out must have shape [depth, height, width, 4]"));
OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.depth && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) == 4, errors::InvalidArgument("dy must have shape [depth, height, width, 4]"));
if (ENABLE_DB)
OP_REQUIRES(ctx, ddb.dims() == 4 && ddb.dim_size(0) == p.depth && ddb.dim_size(1) == p.height && ddb.dim_size(2) == p.width && ddb.dim_size(3) == 4, errors::InvalidArgument("ddb must have shape [depth, height, width, 4]"));
// Populate parameters.
p.numTriangles = tri.dim_size(0);
p.numVertices = p.instance_mode ? pos.dim_size(1) : pos.dim_size(0);
p.pos = pos.flat<float>().data();
p.tri = tri.flat<int>().data();
p.out = out.flat<float>().data();
p.dy = dy.flat<float>().data();
p.ddb = ENABLE_DB ? ddb.flat<float>().data() : 0;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
p.ys = 2.f / (float)p.height;
p.yo = 1.f / (float)p.height - 1.f;
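// Note: assuming the gradient kernel applies these as x = px * xs + xo (and likewise for y),
// a pixel index px maps to the clip-space coordinate of the pixel center:
// px * (2/width) + (1/width - 1) = (2*px + 1)/width - 1.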
// Allocate output tensor for position gradients.
Tensor* grad_tensor = NULL;
TensorShape grad_shape;
if (p.instance_mode)
grad_shape.AddDim(p.depth);
grad_shape.AddDim(p.numVertices);
grad_shape.AddDim(4);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_shape, &grad_tensor));
p.grad = grad_tensor->flat<float>().data();
// Clear the output buffers.
size_t gradBytes = (p.instance_mode ? p.depth : 1) * p.numVertices * 4 * sizeof(float);
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.grad, 0, gradBytes, stream));
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2"));
if (ENABLE_DB)
OP_REQUIRES(ctx, !((uintptr_t)p.ddb & 15), errors::Internal("ddb input tensor not aligned to float4"));
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
// Launch CUDA kernel.
void* args[] = {&p};
void* func = ENABLE_DB ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel;
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
}
};
REGISTER_OP("RasterizeGrad")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("out: float")
.Input ("dy: float")
.Output ("grad: float");
REGISTER_OP("RasterizeGradDb")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("out: float")
.Input ("dy: float")
.Input ("ddb: float")
.Output ("grad: float");
REGISTER_KERNEL_BUILDER(Name("RasterizeGrad") .Device(DEVICE_GPU), RasterizeGradOp<false>);
REGISTER_KERNEL_BUILDER(Name("RasterizeGradDb").Device(DEVICE_GPU), RasterizeGradOp<true>);
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// Common op attribute parser.
static __host__ void parseOpAttributes(OpKernelConstruction* ctx, TextureKernelParams& p)
{
// Mip and filter modes.
OP_REQUIRES_OK(ctx, ctx->GetAttr("filter_mode", &p.filterMode));
OP_REQUIRES(ctx, p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, errors::InvalidArgument("filter_mode unsupported"));
p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR);
// Mip level clamp.
if (p.enableMip)
{
OP_REQUIRES_OK(ctx, ctx->GetAttr("max_mip_level", &p.mipLevelLimit));
OP_REQUIRES(ctx, p.mipLevelLimit >= -1, errors::InvalidArgument("invalid max_mip_level"));
ctx->GetAttr("tex_const", &p.texConst); // Only available in forward op.
}
// Boundary mode.
OP_REQUIRES_OK(ctx, ctx->GetAttr("boundary_mode", &p.boundaryMode));
OP_REQUIRES(ctx, p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, errors::InvalidArgument("boundary_mode unsupported"));
}
//------------------------------------------------------------------------
// Forward TensorFlow op.
struct TextureFwdOp : public OpKernel
{
TextureKernelParams m_attribs;
PersistentTensor m_persistentMipTensor; // Used if texture is constant and mips are enabled.
bool m_persistentMipTensorInitialized;
TextureFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
m_persistentMipTensorInitialized = false;
parseOpAttributes(ctx, m_attribs);
}
void Compute(OpKernelContext* ctx)
{
TextureKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE);
// Get input.
const Tensor& tex = ctx->input(0);
const Tensor& uv = ctx->input(1);
const Tensor& uv_da = ctx->input(p.enableMip ? 2 : 1);
// Extract input dimensions.
p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0;
p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0;
p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0;
p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0;
if (!cube_mode)
{
p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0;
p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0;
p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0;
}
else
{
p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0;
p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0;
p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0;
}
// Sanity checks.
if (!cube_mode)
{
OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]"));
OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]"));
}
else
{
OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode"));
OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode"));
OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode"));
}
OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv"));
OP_REQUIRES(ctx, p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), errors::InvalidArgument("texture size too large"));
if (p.enableMip)
{
if (!cube_mode)
OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]"));
else
OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"));
}
// Get input pointers.
p.tex = tex.flat<float>().data();
p.uv = uv.flat<float>().data();
p.uvDA = p.enableMip ? uv_da.flat<float>().data() : 0;
// Allocate output tensor.
Tensor* out_tensor = NULL;
TensorShape out_shape;
out_shape.AddDim(p.n);
out_shape.AddDim(p.imgHeight);
out_shape.AddDim(p.imgWidth);
out_shape.AddDim(p.channels);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor));
p.out = out_tensor->flat<float>().data();
// Choose kernel variants based on channel count.
void* args[] = {&p};
int channel_div_idx = 0;
if (!(p.channels & 3))
channel_div_idx = 2; // Channel count divisible by 4.
else if (!(p.channels & 1))
channel_div_idx = 1; // Channel count divisible by 2.
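// E.g. channels = 4 -> channel_div_idx = 2 (float4 path), channels = 6 -> 1 (float2 path), channels = 3 -> 0 (scalar path).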
// Mip-related setup.
if (p.enableMip)
{
// Generate mip offsets.
int mipTotal = calculateMipInfo(ctx, p);
// Mip output tensor.
Tensor* mip_tensor = NULL;
TensorShape mip_shape;
mip_shape.AddDim(mipTotal);
// If texture is constant, calculate mip stack only once.
bool computeMip = true;
if (p.texConst)
{
// First execution?
if (!m_persistentMipTensorInitialized)
{
// Allocate a persistent mip tensor.
OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_FLOAT, mip_shape, &m_persistentMipTensor, &mip_tensor));
m_persistentMipTensorInitialized = true;
}
else
{
// Reuse the persistent tensor, do not recompute mip levels.
mip_tensor = m_persistentMipTensor.AccessTensor(ctx);
computeMip = false;
}
// Set as output tensor as well.
ctx->set_output(1, *mip_tensor);
}
else
{
// Allocate an output tensor as usual.
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, mip_shape, &mip_tensor));
}
p.mip = mip_tensor->flat<float>().data(); // Pointer to data.
// Build mip levels if needed.
if (computeMip)
{
for (int i=1; i <= p.mipLevelMax; i++)
{
int2 ms = mipLevelSize(p, i);
int3 sz = make_int3(ms.x, ms.y, p.texDepth);
dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y);
dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1));
p.mipLevelOut = i;
void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 };
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream));
}
}
}
// Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
if (!cube_mode)
OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2"));
if ((p.channels & 3) == 0)
{
OP_REQUIRES(ctx, !((uintptr_t)p.tex & 15), errors::Internal("tex input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.out & 15), errors::Internal("out output tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.mip & 15), errors::Internal("mip output tensor not aligned to float4"));
}
if ((p.channels & 1) == 0)
{
OP_REQUIRES(ctx, !((uintptr_t)p.tex & 7), errors::Internal("tex input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.out & 7), errors::Internal("out output tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.mip & 7), errors::Internal("mip output tensor not aligned to float2"));
}
if (!cube_mode)
OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4"));
else
OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2"));
// Choose launch parameters for texture lookup kernel.
dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
// Choose kernel based on filter mode, cube mode, and datatype.
void* func_tbl[TEX_MODE_COUNT * 3 * 2] = {
(void*)TextureFwdKernelNearest1,
(void*)TextureFwdKernelNearest2,
(void*)TextureFwdKernelNearest4,
(void*)TextureFwdKernelLinear1,
(void*)TextureFwdKernelLinear2,
(void*)TextureFwdKernelLinear4,
(void*)TextureFwdKernelLinearMipmapNearest1,
(void*)TextureFwdKernelLinearMipmapNearest2,
(void*)TextureFwdKernelLinearMipmapNearest4,
(void*)TextureFwdKernelLinearMipmapLinear1,
(void*)TextureFwdKernelLinearMipmapLinear2,
(void*)TextureFwdKernelLinearMipmapLinear4,
(void*)TextureFwdKernelCubeNearest1,
(void*)TextureFwdKernelCubeNearest2,
(void*)TextureFwdKernelCubeNearest4,
(void*)TextureFwdKernelCubeLinear1,
(void*)TextureFwdKernelCubeLinear2,
(void*)TextureFwdKernelCubeLinear4,
(void*)TextureFwdKernelCubeLinearMipmapNearest1,
(void*)TextureFwdKernelCubeLinearMipmapNearest2,
(void*)TextureFwdKernelCubeLinearMipmapNearest4,
(void*)TextureFwdKernelCubeLinearMipmapLinear1,
(void*)TextureFwdKernelCubeLinearMipmapLinear2,
(void*)TextureFwdKernelCubeLinearMipmapLinear4,
};
// Function index.
int func_idx = p.filterMode;
if (cube_mode)
func_idx += TEX_MODE_COUNT;
func_idx = func_idx * 3 + channel_div_idx;
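// Worked example (assuming TEX_MODE_COUNT == 4, consistent with the 24-entry table above):
// cube map + linear-mipmap-linear (filter mode 3) with a channel count divisible by 4 gives
// func_idx = (3 + 4) * 3 + 2 = 23, i.e. TextureFwdKernelCubeLinearMipmapLinear4.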
// Launch kernel.
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
}
};
REGISTER_OP("TextureFwd")
.Input ("tex: float")
.Input ("uv: float")
.Output ("out: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int");
REGISTER_OP("TextureFwdMip")
.Input ("tex: float")
.Input ("uv: float")
.Input ("uv_da: float")
.Output ("out: float")
.Output ("mip: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int")
.Attr ("tex_const: int")
.Attr ("max_mip_level: int");
REGISTER_KERNEL_BUILDER(Name("TextureFwd") .Device(DEVICE_GPU), TextureFwdOp);
REGISTER_KERNEL_BUILDER(Name("TextureFwdMip").Device(DEVICE_GPU), TextureFwdOp);
//------------------------------------------------------------------------
// Gradient TensorFlow op.
struct TextureGradOp : public OpKernel
{
TextureKernelParams m_attribs;
TextureGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
parseOpAttributes(ctx, m_attribs);
}
void Compute(OpKernelContext* ctx)
{
TextureKernelParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE);
// Get input.
const Tensor& tex = ctx->input(0);
const Tensor& uv = ctx->input(1);
const Tensor& dy = ctx->input(2);
const Tensor& uv_da = ctx->input(p.enableMip ? 3 : 2);
const Tensor& mip = ctx->input(p.enableMip ? 4 : 2);
// Extract input dimensions.
p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0;
p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0;
p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0;
p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0;
if (!cube_mode)
{
p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0;
p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0;
p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0;
}
else
{
p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0;
p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0;
p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0;
}
// Sanity checks.
if (!cube_mode)
{
OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]"));
OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]"));
}
else
{
OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode"));
OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode"));
OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode"));
}
OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv"));
OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.n && dy.dim_size(1) == p.imgHeight && dy.dim_size(2) == p.imgWidth && dy.dim_size(3) == p.channels, errors::InvalidArgument("dy must have shape [minibatch_size, height, width, channels]"));
if (p.enableMip)
{
if (!cube_mode)
OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]"));
else
OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"));
}
// Get input pointers.
p.tex = tex.flat<float>().data();
p.uv = uv.flat<float>().data();
p.dy = dy.flat<float>().data();
p.uvDA = p.enableMip ? uv_da.flat<float>().data() : 0;
p.mip = p.enableMip ? (float*)mip.flat<float>().data() : 0;
// Allocate output tensor for tex gradient.
Tensor* grad_tex_tensor = NULL;
TensorShape grad_tex_shape;
grad_tex_shape.AddDim(p.texDepth);
if (cube_mode)
grad_tex_shape.AddDim(6);
grad_tex_shape.AddDim(p.texHeight);
grad_tex_shape.AddDim(p.texWidth);
grad_tex_shape.AddDim(p.channels);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_tex_shape, &grad_tex_tensor));
p.gradTex = grad_tex_tensor->flat<float>().data();
// Allocate output tensor for uv gradient.
if (p.filterMode != TEX_MODE_NEAREST)
{
TensorShape grad_uv_shape;
Tensor* grad_uv_tensor = NULL;
grad_uv_shape.AddDim(p.n);
grad_uv_shape.AddDim(p.imgHeight);
grad_uv_shape.AddDim(p.imgWidth);
grad_uv_shape.AddDim(uv.dim_size(3));
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_uv_shape, &grad_uv_tensor));
p.gradUV = grad_uv_tensor->flat<float>().data();
// Allocate output tensor for uv_da gradient.
if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR)
{
Tensor* grad_uv_da_tensor = NULL;
grad_uv_shape.set_dim(3, uv_da.dim_size(3));
OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_uv_shape, &grad_uv_da_tensor));
p.gradUVDA = grad_uv_da_tensor->flat<float>().data();
}
}
// Choose kernel variants based on channel count.
int channel_div_idx = 0;
if (!(p.channels & 3))
channel_div_idx = 2; // Channel count divisible by 4.
else if (!(p.channels & 1))
channel_div_idx = 1; // Channel count divisible by 2.
// Mip-related setup.
Tensor grad_mip_tensor;
if (p.enableMip)
{
// Generate mip offsets.
int mipTotal = calculateMipInfo(ctx, p);
// Get space for temporary mip gradients.
TensorShape grad_mip_shape;
grad_mip_shape.AddDim(mipTotal);
OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, grad_mip_shape, &grad_mip_tensor));
p.gradTexMip = grad_mip_tensor.flat<float>().data();
// Clear mip gradients.
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTexMip, 0, mipTotal * sizeof(float), stream));
}
// Initialize texture gradients to zero.
int texBytes = p.texHeight * p.texWidth * p.texDepth * p.channels * sizeof(float);
if (cube_mode)
texBytes *= 6;
OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTex, 0, texBytes, stream));
// Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
if (!cube_mode)
{
OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradUV & 7), errors::Internal("grad_uv output tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 15), errors::Internal("grad_uv_da output tensor not aligned to float4"));
}
else
{
OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 7), errors::Internal("grad_uv_da output tensor not aligned to float2"));
}
if ((p.channels & 3) == 0)
{
OP_REQUIRES(ctx, !((uintptr_t)p.tex & 15), errors::Internal("tex input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradTex & 15), errors::Internal("grad_tex output tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.dy & 15), errors::Internal("dy input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.mip & 15), errors::Internal("mip input tensor not aligned to float4"));
}
if ((p.channels & 1) == 0)
{
OP_REQUIRES(ctx, !((uintptr_t)p.tex & 7), errors::Internal("tex input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.gradTex & 7), errors::Internal("grad_tex output tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2"));
OP_REQUIRES(ctx, !((uintptr_t)p.mip & 7), errors::Internal("mip input tensor not aligned to float2"));
}
// Choose launch parameters for main gradient kernel.
void* args[] = {&p};
dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
void* func_tbl[TEX_MODE_COUNT * 2] = {
(void*)TextureGradKernelNearest,
(void*)TextureGradKernelLinear,
(void*)TextureGradKernelLinearMipmapNearest,
(void*)TextureGradKernelLinearMipmapLinear,
(void*)TextureGradKernelCubeNearest,
(void*)TextureGradKernelCubeLinear,
(void*)TextureGradKernelCubeLinearMipmapNearest,
(void*)TextureGradKernelCubeLinearMipmapLinear,
};
// Function index.
int func_idx = p.filterMode;
if (cube_mode)
func_idx += TEX_MODE_COUNT;
// Launch main gradient kernel.
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
// Launch kernel to pull gradients from mip levels.
if (p.enableMip)
{
dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight);
dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1));
int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float);
void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 };
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream));
}
}
};
REGISTER_OP("TextureGradNearest")
.Input ("tex: float")
.Input ("uv: float")
.Input ("dy: float")
.Output ("grad_tex: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int");
REGISTER_OP("TextureGradLinear")
.Input ("tex: float")
.Input ("uv: float")
.Input ("dy: float")
.Output ("grad_tex: float")
.Output ("grad_uv: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int");
REGISTER_OP("TextureGradLinearMipmapNearest")
.Input ("tex: float")
.Input ("uv: float")
.Input ("dy: float")
.Input ("uv_da: float")
.Input ("mip: float")
.Output ("grad_tex: float")
.Output ("grad_uv: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int")
.Attr ("max_mip_level: int");
REGISTER_OP("TextureGradLinearMipmapLinear")
.Input ("tex: float")
.Input ("uv: float")
.Input ("dy: float")
.Input ("uv_da: float")
.Input ("mip: float")
.Output ("grad_tex: float")
.Output ("grad_uv: float")
.Output ("grad_uv_da: float")
.Attr ("filter_mode: int")
.Attr ("boundary_mode: int")
.Attr ("max_mip_level: int");
REGISTER_KERNEL_BUILDER(Name("TextureGradNearest") .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinear") .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapNearest").Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapLinear") .Device(DEVICE_GPU), TextureGradOp);
//------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
from .ops import RasterizeGLContext, get_log_level, set_log_level, rasterize, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash
__all__ = ["RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"]
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import logging
import numpy as np
import os
import sys
import torch
import torch.utils.cpp_extension
#----------------------------------------------------------------------------
# C++/Cuda plugin compiler/loader.
_cached_plugin = None
def _get_plugin():
# Return cached plugin if already loaded.
global _cached_plugin
if _cached_plugin is not None:
return _cached_plugin
# Make sure we can find the necessary compiler and library binaries.
if os.name == 'nt':
lib_dir = os.path.dirname(__file__) + r"\..\lib"
def find_cl_path():
import glob
for edition in ['Professional', 'BuildTools', 'Community']:
paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True)
if paths:
return paths[0]
# If cl.exe is not on path, try to find it.
if os.system("where cl.exe >nul 2>nul") != 0:
cl_path = find_cl_path()
if cl_path is None:
raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
os.environ['PATH'] += ';' + cl_path
# Compiler options.
opts = ['-DNVDR_TORCH']
# Linker options.
if os.name == 'posix':
ldflags = ['-lGL', '-lGLEW']
elif os.name == 'nt':
libs = ['gdi32', 'glew32s', 'opengl32', 'user32']
ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]
# List of source files.
source_files = [
'../common/common.cpp',
'../common/rasterize.cu',
'../common/rasterize.cpp',
'../common/interpolate.cu',
'../common/texture.cu',
'../common/texture.cpp',
'../common/antialias.cu',
'torch_bindings.cpp',
'torch_rasterize.cpp',
'torch_interpolate.cpp',
'torch_texture.cpp',
'torch_antialias.cpp',
]
# Some containers set TORCH_CUDA_ARCH_LIST to old architectures that won't compile. We only need the architecture of the GPU installed in this machine.
os.environ['TORCH_CUDA_ARCH_LIST'] = ''
# Try to detect if a stray lock file is left in the cache directory and show a warning. This sometimes happens on Windows if the build is interrupted at just the right moment.
plugin_name = 'nvdiffrast_plugin'
try:
lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory(plugin_name, False), 'lock')
if os.path.exists(lock_fn):
logging.getLogger('nvdiffrast').warning("Lock file exists in build directory: '%s'" % lock_fn)
except:
pass
# Compile and load.
source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files]
torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=opts, extra_cuda_cflags=opts, extra_ldflags=ldflags, with_cuda=True, verbose=False)
# Import, cache, and return the compiled module.
import nvdiffrast_plugin
_cached_plugin = nvdiffrast_plugin
return _cached_plugin
#----------------------------------------------------------------------------
# Log level.
#----------------------------------------------------------------------------
def get_log_level():
'''Get current log level.
Returns:
Current log level in nvdiffrast. See `set_log_level()` for possible values.
'''
return _get_plugin().get_log_level()
def set_log_level(level):
'''Set log level.
Log levels follow the convention on the C++ side of Torch:
0 = Info,
1 = Warning,
2 = Error,
3 = Fatal.
The default log level is 1.
Args:
level: New log level as integer. Internal nvdiffrast messages of this
severity or higher will be printed, while messages of lower
severity will be silent.
'''
_get_plugin().set_log_level(level)
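# Example (illustrative): set_log_level(0) also prints informational messages,
# whereas the default set_log_level(1) shows warnings and above only.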
#----------------------------------------------------------------------------
# GL State wrapper.
#----------------------------------------------------------------------------
class RasterizeGLContext:
def __init__(self, output_db=True, mode='automatic'):
'''Create a new OpenGL rasterizer context.
Creating an OpenGL context is a slow operation so you should reuse the same
context in all calls to `rasterize()` on the same CPU thread. The OpenGL context
is deleted when the object is destroyed.
Args:
output_db (bool): Compute and output image-space derivatives of barycentrics.
mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'.
Returns:
The newly created OpenGL rasterizer context.
'''
assert output_db is True or output_db is False
assert mode in ['automatic', 'manual']
self.output_db = output_db
self.mode = mode
self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic')
def set_context(self):
'''Set (activate) OpenGL context in the current CPU thread.
Only available if context was created in manual mode.
'''
assert self.mode == 'manual'
self.cpp_wrapper.set_context()
def release_context(self):
'''Release (deactivate) currently active OpenGL context.
Only available if context was created in manual mode.
'''
assert self.mode == 'manual'
self.cpp_wrapper.release_context()
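# Usage sketch (hypothetical, illustrative only): a context is typically created once and
# reused for all rasterize() calls on the same CPU thread.
#
#   glctx = RasterizeGLContext(output_db=True)                       # automatic mode (default)
#   manual_ctx = RasterizeGLContext(output_db=False, mode='manual')
#   manual_ctx.set_context()                                         # activate on this thread
#   ...                                                              # issue rasterize() calls while the context is active
#   manual_ctx.release_context()                                     # deactivate when done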
#----------------------------------------------------------------------------
# Rasterize.
#----------------------------------------------------------------------------
class _rasterize_func(torch.autograd.Function):
@staticmethod
def forward(ctx, glctx, pos, tri, resolution, ranges, grad_db):
out, out_db = _get_plugin().rasterize_fwd(glctx.cpp_wrapper, pos, tri, resolution, ranges)
ctx.save_for_backward(pos, tri, out)
ctx.saved_grad_db = grad_db
return out, out_db
@staticmethod
def backward(ctx, dy, ddb):
pos, tri, out = ctx.saved_variables
if ctx.saved_grad_db:
g_pos = _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb)
else:
g_pos = _get_plugin().rasterize_grad(pos, tri, out, dy)
return None, g_pos, None, None, None, None
# Op wrapper.
def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
'''Rasterize triangles.
All input tensors must be contiguous and reside in GPU memory except for
the `ranges` tensor that, if specified, has to reside in CPU memory. The
output tensors will be contiguous and reside in GPU memory.
Args:
glctx: OpenGL context of type `RasterizeGLContext`.
pos: Vertex position tensor with dtype `torch.float32`. To enable range
mode, this tensor should have a 2D shape [num_vertices, 4]. To enable
instanced mode, use a 3D shape [minibatch_size, num_vertices, 4].
tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`.
resolution: Output resolution as integer tuple (height, width).
ranges: In range mode, tensor with shape [minibatch_size, 2] and dtype
`torch.int32`, specifying start indices and counts into `tri`.
Ignored in instanced mode.
grad_db: Propagate gradients of image-space derivatives of barycentrics
into `pos` in backward pass. Ignored if OpenGL context was
not configured to output image-space derivatives.
Returns:
A tuple of two tensors. The first output tensor has shape [minibatch_size,
height, width, 4] and contains the main rasterizer output in order (u, v, z/w,
triangle_id). If the OpenGL context was configured to output image-space
derivatives of barycentrics, the second output tensor will also have shape
[minibatch_size, height, width, 4] and contain said derivatives in order
(du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape
[minibatch_size, height, width, 0].
'''
assert isinstance(glctx, RasterizeGLContext)
assert grad_db is True or grad_db is False
grad_db = grad_db and glctx.output_db
# Sanitize inputs.
assert isinstance(pos, torch.Tensor) and isinstance(tri, torch.Tensor)
resolution = tuple(resolution)
if ranges is None:
ranges = torch.empty(size=(0, 2), dtype=torch.int32, device='cpu')
else:
assert isinstance(ranges, torch.Tensor)
# Instantiate the function.
return _rasterize_func.apply(glctx, pos, tri, resolution, ranges, grad_db)
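# Usage sketch (hypothetical tensors; shapes and values are illustrative only):
#
#   glctx = RasterizeGLContext(output_db=True)
#   pos = torch.tensor([[[-0.8, -0.8, 0.0, 1.0],
#                        [ 0.8, -0.8, 0.0, 1.0],
#                        [ 0.0,  0.8, 0.0, 1.0]]], dtype=torch.float32, device='cuda')  # [1, 3, 4], instanced mode
#   tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device='cuda')                   # [1, 3]
#   out, out_db = rasterize(glctx, pos, tri, resolution=(256, 256))
#   # out: [1, 256, 256, 4] holding (u, v, z/w, triangle_id);
#   # out_db: (du/dX, du/dY, dv/dX, dv/dY) since output_db=True.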
#----------------------------------------------------------------------------
# Interpolate.
#----------------------------------------------------------------------------
# Output pixel differentials for at least some attributes.
class _interpolate_func_da(torch.autograd.Function):
@staticmethod
def forward(ctx, attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list):
out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list)
ctx.save_for_backward(attr, rast, tri, rast_db)
ctx.saved_misc = diff_attrs_all, diff_attrs_list
return out, out_da
@staticmethod
def backward(ctx, dy, dda):
attr, rast, tri, rast_db = ctx.saved_variables
diff_attrs_all, diff_attrs_list = ctx.saved_misc
g_attr, g_rast, g_rast_db = _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list)
return g_attr, g_rast, None, g_rast_db, None, None
# No pixel differential for any attribute.
class _interpolate_func(torch.autograd.Function):
@staticmethod
def forward(ctx, attr, rast, tri):
out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri)
ctx.save_for_backward(attr, rast, tri)
return out, out_da
@staticmethod
def backward(ctx, dy, _):
attr, rast, tri = ctx.saved_variables
g_attr, g_rast = _get_plugin().interpolate_grad(attr, rast, tri, dy)
return g_attr, g_rast, None
# Op wrapper.
def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None):
"""Interpolate vertex attributes.
All input tensors must be contiguous and reside in GPU memory. The output tensors
will be contiguous and reside in GPU memory.
Args:
attr: Attribute tensor with dtype `torch.float32`.
Shape is [num_vertices, num_attributes] in range mode, or
[minibatch_size, num_vertices, num_attributes] in instanced mode.
Broadcasting is supported along the minibatch axis.
rast: Main output tensor from `rasterize()`.
tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`.
rast_db: (Optional) Tensor containing image-space derivatives of barycentrics,
i.e., the second output tensor from `rasterize()`. Enables computing
image-space derivatives of attributes.
diff_attrs: (Optional) List of attribute indices for which image-space
derivatives are to be computed. Special value 'all' is equivalent
to list [0, 1, ..., num_attributes - 1].
Returns:
A tuple of two tensors. The first output tensor contains interpolated
attributes and has shape [minibatch_size, height, width, num_attributes].
If `rast_db` and `diff_attrs` were specified, the second output tensor contains
the image-space derivatives of the selected attributes and has shape
[minibatch_size, height, width, 2 * len(diff_attrs)]. The derivatives of the
first selected attribute A will be on channels 0 and 1 as (dA/dX, dA/dY), etc.
Otherwise, the second output tensor will be an empty tensor with shape
[minibatch_size, height, width, 0].
"""
# Sanitize the list of pixel differential attributes.
if diff_attrs is None:
diff_attrs = []
elif diff_attrs != 'all':
diff_attrs = np.asarray(diff_attrs, np.int32)
assert len(diff_attrs.shape) == 1
diff_attrs = diff_attrs.tolist()
diff_attrs_all = int(diff_attrs == 'all')
diff_attrs_list = [] if diff_attrs_all else diff_attrs
# Check inputs.
assert all(isinstance(x, torch.Tensor) for x in (attr, rast, tri))
if diff_attrs:
assert isinstance(rast_db, torch.Tensor)
# Choose stub.
if diff_attrs:
return _interpolate_func_da.apply(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list)
else:
return _interpolate_func.apply(attr, rast, tri)
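# Usage sketch (hypothetical tensors, continuing the rasterize() sketch above):
#
#   col = torch.rand([1, 3, 3], dtype=torch.float32, device='cuda')   # per-vertex RGB, instanced mode
#   rgb, _ = interpolate(col, out, tri)                               # rgb: [1, 256, 256, 3]
#   # With image-space derivatives of all attributes:
#   rgb, rgb_da = interpolate(col, out, tri, rast_db=out_db, diff_attrs='all')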
#----------------------------------------------------------------------------
# Texture
#----------------------------------------------------------------------------
# Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled.
class _texture_func_mip(torch.autograd.Function):
@staticmethod
def forward(ctx, filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum):
out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
ctx.save_for_backward(tex, uv, uv_da)
ctx.saved_misc = filter_mode, mip, filter_mode_enum, boundary_mode_enum
return out
@staticmethod
def backward(ctx, dy):
tex, uv, uv_da = ctx.saved_variables
filter_mode, mip, filter_mode_enum, boundary_mode_enum = ctx.saved_misc
if filter_mode == 'linear-mipmap-linear':
g_tex, g_uv, g_uv_da = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, g_uv_da, None, None, None
else: # linear-mipmap-nearest
g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, None, None, None, None
# Linear and nearest: Mipmaps disabled.
class _texture_func(torch.autograd.Function):
@staticmethod
def forward(ctx, filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum):
out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum)
ctx.save_for_backward(tex, uv)
ctx.saved_misc = filter_mode, filter_mode_enum, boundary_mode_enum
return out
@staticmethod
def backward(ctx, dy):
tex, uv = ctx.saved_variables
filter_mode, filter_mode_enum, boundary_mode_enum = ctx.saved_misc
if filter_mode == 'linear':
g_tex, g_uv = _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, None, None
else: # nearest
g_tex = _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum)
return None, g_tex, None, None, None
# Op wrapper.
def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
"""Perform texture sampling.
All input tensors must be contiguous and reside in GPU memory. The output tensor
will be contiguous and reside in GPU memory.
Args:
tex: Texture tensor with dtype `torch.float32`. For 2D textures, must have shape
[minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures,
must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where
tex_width and tex_height are equal. Note that `boundary_mode` must also be set
to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis.
uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture,
must have shape [minibatch_size, height, width, 2]. When sampling a cube map
texture, must have shape [minibatch_size, height, width, 3].
uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates.
Must have the same shape as `uv` except for the last dimension, which must be
twice as long.
mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call. If not
specified, the mipmap stack is constructed internally and discarded afterwards.
filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
selects 'linear' if `uv_da` is not specified and 'linear-mipmap-linear' if it is,
i.e., the highest-quality mode available given the image-space derivatives of
the texture coordinates.
boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a
cube map, this must be set to 'cube'. The default mode 'wrap' takes the fractional
part of the texture coordinates. Mode 'clamp' clamps texture coordinates to the
centers of the boundary texels. Mode 'zero' virtually extends the texture with
all-zero values in all directions.
max_mip_level: If specified, limits the number of mipmaps constructed and used in mipmap-based
filter modes.
Returns:
A tensor containing the results of the texture sampling with shape
[minibatch_size, height, width, tex_channels].
"""
# Default filter mode.
if filter_mode == 'auto':
filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear'
# Sanitize inputs.
if max_mip_level is None:
max_mip_level = -1
else:
max_mip_level = int(max_mip_level)
assert max_mip_level >= 0
# Check inputs.
assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor)
if 'mipmap' in filter_mode:
assert isinstance(uv_da, torch.Tensor)
# If mipping disabled via max level=0, we may as well use simpler filtering internally.
if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']:
filter_mode = 'linear'
# Convert filter mode to internal enumeration.
filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3}
filter_mode_enum = filter_mode_dict[filter_mode]
# Convert boundary mode to internal enumeration.
boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3}
boundary_mode_enum = boundary_mode_dict[boundary_mode]
# Construct a mipmap if necessary.
if 'mipmap' in filter_mode:
if mip is not None:
assert isinstance(mip, _get_plugin().TextureMipWrapper)
else:
mip = _get_plugin().texture_construct_mip(tex, max_mip_level, boundary_mode == 'cube')
# Choose stub.
if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest':
return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
else:
return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum)
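# Usage sketch (hypothetical tensors; shapes are illustrative only):
#
#   tex = torch.rand([1, 128, 128, 3], dtype=torch.float32, device='cuda')   # 2D texture
#   uv  = torch.rand([1, 256, 256, 2], dtype=torch.float32, device='cuda')   # per-pixel coordinates
#   color = texture(tex, uv)                                                 # 'auto' -> 'linear' without uv_da
#   # With mipmapping, where uv_da would come from e.g. interpolate(..., diff_attrs=...):
#   color = texture(tex, uv, uv_da=uv_da, filter_mode='linear-mipmap-linear')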
# Mipmap precalculation for cases where the texture stays constant.
def texture_construct_mip(tex, max_mip_level=None, cube_mode=False):
"""Construct a mipmap stack for a texture.
This function can be used for constructing a mipmap stack for a texture that is known to remain
constant. This avoids reconstructing it every time `texture()` is called.
Args:
tex: Texture tensor with the same constraints as in `texture()`.
max_mip_level: If specified, limits the number of mipmaps constructed.
cube_mode: Must be set to True if `tex` specifies a cube map texture.
Returns:
An opaque object containing the mipmap stack. This can be supplied in a call to `texture()`
in the `mip` argument.
"""
assert isinstance(tex, torch.Tensor)
assert cube_mode is True or cube_mode is False
if max_mip_level is None:
max_mip_level = -1
else:
max_mip_level = int(max_mip_level)
assert max_mip_level >= 0
return _get_plugin().texture_construct_mip(tex, max_mip_level, cube_mode)
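# Usage sketch (hypothetical; assumes the same `tex` is sampled many times):
#
#   mip = texture_construct_mip(tex, max_mip_level=4)
#   for uv, uv_da in batches:    # reuse the precomputed stack on every iteration
#       color = texture(tex, uv, uv_da=uv_da, mip=mip, filter_mode='linear-mipmap-linear')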
#----------------------------------------------------------------------------
# Antialias.
#----------------------------------------------------------------------------
class _antialias_func(torch.autograd.Function):
@staticmethod
def forward(ctx, color, rast, pos, tri, topology_hash, pos_gradient_boost):
out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, topology_hash)
ctx.save_for_backward(color, rast, pos, tri)
ctx.saved_misc = pos_gradient_boost, work_buffer
return out
@staticmethod
def backward(ctx, dy):
color, rast, pos, tri = ctx.saved_variables
pos_gradient_boost, work_buffer = ctx.saved_misc
g_color, g_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer)
if pos_gradient_boost != 1.0:
g_pos = g_pos * pos_gradient_boost
return g_color, None, g_pos, None, None, None
# Op wrapper.
def antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0):
"""Perform antialiasing.
All input tensors must be contiguous and reside in GPU memory. The output tensor
will be contiguous and reside in GPU memory.
Args:
color: Input image to antialias with shape [minibatch_size, height, width, num_channels].
rast: Main output tensor from `rasterize()`.
pos: Vertex position tensor used in the rasterization operation.
tri: Triangle tensor used in the rasterization operation.
topology_hash: (Optional) Preconstructed topology hash for the triangle tensor. If not
specified, the topology hash is constructed internally and discarded afterwards.
pos_gradient_boost: (Optional) Multiplier for gradients propagated to `pos`.
Returns:
A tensor containing the antialiased image with the same shape as `color` input tensor.
"""
# Check inputs.
assert all(isinstance(x, torch.Tensor) for x in (color, rast, pos, tri))
# Construct topology hash unless provided by user.
if topology_hash is not None:
assert isinstance(topology_hash, _get_plugin().TopologyHashWrapper)
else:
topology_hash = _get_plugin().antialias_construct_topology_hash(tri)
# Instantiate the function.
return _antialias_func.apply(color, rast, pos, tri, topology_hash, pos_gradient_boost)
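# Usage sketch (hypothetical tensors, continuing the sketches above):
#
#   aa = antialias(rgb, out, pos, tri)    # rgb from interpolate(), out/pos/tri from rasterize()
#   # Optionally boost position gradients flowing out of the antialiasing op:
#   aa = antialias(rgb, out, pos, tri, pos_gradient_boost=2.0)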
# Topology hash precalculation for cases where the triangle array stays constant.
def antialias_construct_topology_hash(tri):
"""Construct a topology hash for a triangle tensor.
This function can be used for constructing a topology hash for a triangle tensor that is
known to remain constant. This avoids reconstructing it every time `antialias()` is called.
Args:
tri: Triangle tensor with shape [num_triangles, 3]. Must be contiguous and reside in
GPU memory.
Returns:
An opaque object containing the topology hash. This can be supplied in a call to
`antialias()` in the `topology_hash` argument.
"""
assert isinstance(tri, torch.Tensor)
return _get_plugin().antialias_construct_topology_hash(tri)
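# Usage sketch (hypothetical; assumes the triangle tensor stays constant across iterations):
#
#   topo = antialias_construct_topology_hash(tri)
#   for color, rast, pos in frames:
#       aa = antialias(color, rast, pos, tri, topology_hash=topo)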
#----------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include "../common/common.h"
#include "../common/antialias.h"
//------------------------------------------------------------------------
// Kernel prototypes.
void AntialiasFwdMeshKernel (const AntialiasKernelParams p);
void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p);
void AntialiasFwdAnalysisKernel (const AntialiasKernelParams p);
void AntialiasGradKernel (const AntialiasKernelParams p);
//------------------------------------------------------------------------
// Topology hash construction.
TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
// Check inputs.
NVDR_CHECK_DEVICE(tri);
NVDR_CHECK_CONTIGUOUS(tri);
NVDR_CHECK_I32(tri);
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
// Fill in kernel parameters.
p.numTriangles = tri.size(0);
p.numVertices = 0x7fffffff; // Let's not require vertex positions just to enable an error check.
p.tri = tri.data_ptr<int>();
// Choose hash allocation size: a power of two, at least 64 and at least numTriangles.
p.allocTriangles = p.allocTriangles < 64 ? 64 : p.allocTriangles;
while (p.allocTriangles < p.numTriangles)
p.allocTriangles <<= 1; // Must be power of two.
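// Example: numTriangles = 100 -> allocTriangles = 128 (smallest power of two that is >= 64 and >= 100).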
// Construct the hash tensor and get pointer.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
torch::Tensor ev_hash = torch::zeros({p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE * 4}, opts);
p.evHash = (uint4*)(ev_hash.data_ptr<int>());
// Check alignment.
NVDR_CHECK(!((uintptr_t)p.evHash & 15), "ev_hash internal tensor not aligned to int4");
// Populate the hash.
void* args[] = {&p};
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
// Return.
TopologyHashWrapper hash_wrap;
hash_wrap.ev_hash = ev_hash;
return hash_wrap;
}
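For reference, the hash tensor sizing above can be mirrored in a few lines of Python. This is an illustrative sketch only, not part of the source; AA_HASH_ELEMENTS_PER_TRIANGLE is taken as a parameter because its value is defined in common/antialias.h rather than here.
def ev_hash_num_ints(num_triangles, hash_elements_per_triangle):
    # Allocation starts at 64 triangles and doubles until it covers the mesh,
    # so the hash size is always a power of two times the per-triangle slot count.
    alloc = 64
    while alloc < num_triangles:
        alloc <<= 1
    return alloc * hash_elements_per_triangle * 4  # int32 words; 4 per uint4 slot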
//------------------------------------------------------------------------
// Forward op.
std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
torch::Tensor& topology_hash = topology_hash_wrap.ev_hash; // Unwrap.
// Check inputs.
NVDR_CHECK_DEVICE(color, rast, pos, tri, topology_hash);
NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, topology_hash);
NVDR_CHECK_F32(color, rast, pos);
NVDR_CHECK_I32(tri, topology_hash);
// Sanity checks.
NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape [>0, >0, >0, >0]");
NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape [>0, >0, >0, 4]");
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and rast inputs must have same spatial dimensions");
if (p.instance_mode)
{
NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]");
NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast, pos");
}
else
{
NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]");
NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast");
}
// Extract input dimensions.
p.numVertices = pos.size(p.instance_mode ? 1 : 0);
p.numTriangles = tri.size(0);
p.n = color.size(0);
p.height = color.size(1);
p.width = color.size(2);
p.channels = color.size(3);
// Get input pointers.
p.color = color.data_ptr<float>();
p.rasterOut = rast.data_ptr<float>();
p.tri = tri.data_ptr<int>();
p.pos = pos.data_ptr<float>();
p.evHash = (uint4*)(topology_hash.data_ptr<int>());
// Misc parameters.
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
p.allocTriangles = topology_hash.size(0) / (4 * AA_HASH_ELEMENTS_PER_TRIANGLE);
// Allocate output tensors.
torch::Tensor out = color.detach().clone(); // Use color as base.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor work_buffer = torch::empty({p.n * p.width * p.height * 8 + 4}, opts); // 8 ints (two int4 work items) per pixel, plus one int4 of work counters.
p.output = out.data_ptr<float>();
p.workBuffer = (int4*)(work_buffer.data_ptr<float>());
// Clear the work counters.
NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream));
// Verify that buffers are aligned to allow float2/float4 operations.
NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.rasterOut & 7), "raster_out input tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4");
NVDR_CHECK(!((uintptr_t)p.evHash & 15), "topology_hash internal tensor not aligned to int4");
// Choose launch parameters for the discontinuity finder kernel and launch.
void* args[] = {&p};
dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n);
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream));
// Determine optimum block size for the persistent analysis kernel and launch.
int device = 0;
int numCTA = 0;
int numSM = 0;
NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device));
NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0));
NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
// Return results.
return std::tuple<torch::Tensor, torch::Tensor>(out, work_buffer);
}
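The temporary work_buffer allocated above is the main memory overhead of the forward op. The sketch below (illustrative Python, not part of the source) simply restates that arithmetic: eight float32 words per pixel for up to two int4 work items, plus one int4 of counters.
def antialias_work_buffer_bytes(minibatch, height, width):
    # Matches torch::empty({n * w * h * 8 + 4}) of float32 in antialias_fwd().
    num_floats = minibatch * height * width * 8 + 4
    return num_floats * 4  # float32 is 4 bytes

# Example: a 4 x 512 x 512 minibatch needs antialias_work_buffer_bytes(4, 512, 512)
# == 33554448 bytes, i.e. about 32 MiB of temporary storage.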
//------------------------------------------------------------------------
// Gradient op.
std::tuple<torch::Tensor, torch::Tensor> antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
// Check inputs.
NVDR_CHECK_DEVICE(color, rast, pos, tri, dy, work_buffer);
NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, work_buffer);
NVDR_CHECK_F32(color, rast, pos, dy, work_buffer);
NVDR_CHECK_I32(tri);
// Sanity checks.
NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) > 0 && dy.size(2) > 0 && dy.size(3) > 0, "dy must have shape [>0, >0, >0, >0]");
NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape [>0, >0, >0, >0]");
NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "raster_out must have shape [>0, >0, >0, 4]");
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and raster_out inputs must have same spatial dimensions");
NVDR_CHECK(color.size(1) == dy.size(1) && color.size(2) == dy.size(2) && color.size(3) == dy.size(3), "color and dy inputs must have same dimensions");
if (p.instance_mode)
{
NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]");
NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out, pos");
NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs dy, color, raster_out, pos");
}
else
{
NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]");
NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out");
NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0), "minibatch size mismatch between inputs dy, color, raster_out");
}
// Extract input dimensions.
p.numVertices = pos.size(p.instance_mode ? 1 : 0);
p.numTriangles = tri.size(0);
p.n = color.size(0);
p.height = color.size(1);
p.width = color.size(2);
p.channels = color.size(3);
// Ensure dy is contiguous.
torch::Tensor dy_ = dy.contiguous();
// Get input pointers.
p.color = color.data_ptr<float>();
p.rasterOut = rast.data_ptr<float>();
p.tri = tri.data_ptr<int>();
p.pos = pos.data_ptr<float>();
p.dy = dy_.data_ptr<float>();
p.workBuffer = (int4*)(work_buffer.data_ptr<float>());
// Misc parameters.
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
// Allocate output tensors.
torch::Tensor grad_color = dy_.detach().clone(); // Use dy as base.
torch::Tensor grad_pos = torch::zeros_like(pos);
p.gradColor = grad_color.data_ptr<float>();
p.gradPos = grad_pos.data_ptr<float>();
// Clear gradient kernel work counter.
NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream));
// Verify that buffers are aligned to allow float2/float4 operations.
NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4");
// Determine optimum block size for the gradient kernel and launch.
void* args[] = {&p};
int device = 0;
int numCTA = 0;
int numSM = 0;
NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device));
NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0));
NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
// Return results.
return std::tuple<torch::Tensor, torch::Tensor>(grad_color, grad_pos);
}
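The two ops above are tied together on the Python side by the `_antialias_func` autograd class in ops.py. The sketch below is a simplified, hypothetical illustration of that wiring only, omitting pos_gradient_boost and other details of the real implementation; `_plugin` stands for the compiled extension module returned by `_get_plugin()`.
import torch

class _AntialiasSketch(torch.autograd.Function):
    @staticmethod
    def forward(ctx, color, rast, pos, tri, topology_hash):
        out, work_buffer = _plugin.antialias_fwd(color, rast, pos, tri, topology_hash)
        # The work buffer produced by the forward pass is what antialias_grad consumes.
        ctx.save_for_backward(color, rast, pos, tri, work_buffer)
        return out

    @staticmethod
    def backward(ctx, dy):
        color, rast, pos, tri, work_buffer = ctx.saved_tensors
        g_color, g_pos = _plugin.antialias_grad(color, rast, pos, tri, dy, work_buffer)
        # Only color and pos receive gradients; rast, tri and the hash do not.
        return g_color, None, g_pos, None, None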
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include <tuple>
//------------------------------------------------------------------------
// Op prototypes. Return type macros for readability.
#define OP_RETURN_T torch::Tensor
#define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
#define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges);
OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
OP_RETURN_T rasterize_grad_db (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb);
OP_RETURN_TT interpolate_fwd (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri);
OP_RETURN_TT interpolate_fwd_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
OP_RETURN_TT interpolate_grad (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy);
OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode);
OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode);
OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri);
OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash);
OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer);
//------------------------------------------------------------------------
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// State classes.
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool>())
.def("set_context", &RasterizeGLStateWrapper::setContext)
.def("release_context", &RasterizeGLStateWrapper::releaseContext);
pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper");
pybind11::class_<TopologyHashWrapper>(m, "TopologyHashWrapper");
// Plumbing to torch/c10 logging system.
m.def("get_log_level", [](void) { return FLAGS_caffe2_log_level; }, "get log level");
m.def("set_log_level", [](int level){ FLAGS_caffe2_log_level = level; }, "set log level");
// Ops.
m.def("rasterize_fwd", &rasterize_fwd, "rasterize forward op");
m.def("rasterize_grad", &rasterize_grad, "rasterize gradient op ignoring db gradients");
m.def("rasterize_grad_db", &rasterize_grad_db, "rasterize gradient op with db gradients");
m.def("interpolate_fwd", &interpolate_fwd, "interpolate forward op with attribute derivatives");
m.def("interpolate_fwd_da", &interpolate_fwd_da, "interpolate forward op without attribute derivatives");
m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op with attribute derivatives");
m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op without attribute derivatives");
m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction");
m.def("texture_fwd", &texture_fwd, "texture forward op with mipmapping and texcoord derivatives");
m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op without mipmapping and texcoord derivatives");
m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode");
m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode");
m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode");
m.def("texture_grad_linear_mipmap_linear", &texture_grad_linear_mipmap_linear, "texture gradient op in linear-mipmap-linear mode");
m.def("antialias_construct_topology_hash", &antialias_construct_topology_hash, "antialias topology hash construction");
m.def("antialias_fwd", &antialias_fwd, "antialias forward op");
m.def("antialias_grad", &antialias_grad, "antialias gradient op");
}
//------------------------------------------------------------------------
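As a final usage sketch (illustrative, not from the source): the logging hooks registered in the module above let callers adjust torch/c10 log verbosity around nvdiffrast calls. `plugin` here stands for the loaded extension module, e.g. the object returned by `_get_plugin()` in ops.py, and the integer levels follow the usual glog-style severities.
def silence_info_logs(plugin):
    # Raise the minimum logged severity to ERROR (2); return the old level so
    # the caller can restore it later with plugin.set_log_level(previous).
    previous = plugin.get_log_level()
    plugin.set_log_level(2)
    return previous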