Commit 1f95925c authored by Samuli Laine

Initial commit
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "common.h"
#include "rasterize.h"
//------------------------------------------------------------------------
// Gradient CUDA kernel.
template <bool ENABLE_DB>
static __forceinline__ __device__ void RasterizeGradKernelTemplate(const RasterizeGradParams p)
{
    // Temporary space for coalesced atomics.
    CA_DECLARE_TEMP(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH * RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT);

    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.width || py >= p.height || pz >= p.depth)
        return;

    // Pixel index.
    int pidx = px + p.width * (py + p.height * pz);

    // Read triangle idx and dy.
    float2 dy = ((float2*)p.dy)[pidx * 2];
    float4 ddb = ENABLE_DB ? ((float4*)p.ddb)[pidx] : make_float4(0.f, 0.f, 0.f, 0.f);
    int triIdx = (int)(((float*)p.out)[pidx * 4 + 3]) - 1;

    // Exit if nothing to do.
    if (triIdx < 0 || triIdx >= p.numTriangles)
        return; // No or corrupt triangle.
    int grad_all_dy = __float_as_int(dy.x) | __float_as_int(dy.y); // Bitwise OR of all incoming gradients.
    int grad_all_ddb = 0;
    if (ENABLE_DB)
        grad_all_ddb = __float_as_int(ddb.x) | __float_as_int(ddb.y) | __float_as_int(ddb.z) | __float_as_int(ddb.w);
    if (((grad_all_dy | grad_all_ddb) << 1) == 0)
        return; // All incoming gradients are +0/-0; shifting out the sign bit detects both zeros at once.

    // Fetch vertex indices.
    int vi0 = p.tri[triIdx * 3 + 0];
    int vi1 = p.tri[triIdx * 3 + 1];
    int vi2 = p.tri[triIdx * 3 + 2];

    // Bail out if vertex indices are corrupt.
    if (vi0 < 0 || vi0 >= p.numVertices ||
        vi1 < 0 || vi1 >= p.numVertices ||
        vi2 < 0 || vi2 >= p.numVertices)
        return;

    // In instance mode, adjust vertex indices by minibatch index.
    if (p.instance_mode)
    {
        vi0 += pz * p.numVertices;
        vi1 += pz * p.numVertices;
        vi2 += pz * p.numVertices;
    }

    // Initialize coalesced atomics.
    CA_SET_GROUP(triIdx);

    // Fetch vertex positions.
    float4 p0 = ((float4*)p.pos)[vi0];
    float4 p1 = ((float4*)p.pos)[vi1];
    float4 p2 = ((float4*)p.pos)[vi2];

    // Evaluate edge functions.
    float fx = p.xs * (float)px + p.xo;
    float fy = p.ys * (float)py + p.yo;
    float p0x = p0.x - fx * p0.w;
    float p0y = p0.y - fy * p0.w;
    float p1x = p1.x - fx * p1.w;
    float p1y = p1.y - fy * p1.w;
    float p2x = p2.x - fx * p2.w;
    float p2y = p2.y - fy * p2.w;
    float a0 = p1x*p2y - p1y*p2x;
    float a1 = p2x*p0y - p2y*p0x;
    float a2 = p0x*p1y - p0y*p1x;

    // Compute inverse area with epsilon.
    float at = a0 + a1 + a2;
    float ep = copysignf(1e-6f, at); // ~1 pixel in 1k x 1k image; sign-matched so the sum cannot cancel to zero.
    float iw = 1.f / (at + ep);

    // Perspective correct, normalized barycentrics.
    float b0 = a0 * iw;
    float b1 = a1 * iw;

    // Position gradients.
    float gb0 = dy.x * iw;
    float gb1 = dy.y * iw;
    float gbb = gb0 * b0 + gb1 * b1;
    float gp0x = gbb * (p2y - p1y) - gb1 * p2y;
    float gp1x = gbb * (p0y - p2y) + gb0 * p2y;
    float gp2x = gbb * (p1y - p0y) - gb0 * p1y + gb1 * p0y;
    float gp0y = gbb * (p1x - p2x) + gb1 * p2x;
    float gp1y = gbb * (p2x - p0x) - gb0 * p2x;
    float gp2y = gbb * (p0x - p1x) + gb0 * p1x - gb1 * p0x;
    float gp0w = -fx * gp0x - fy * gp0y;
    float gp1w = -fx * gp1x - fy * gp1y;
    float gp2w = -fx * gp2x - fy * gp2y;

    // Bary differential gradients.
    if (ENABLE_DB && ((grad_all_ddb) << 1) != 0)
    {
        float dfxdX = p.xs * iw;
        float dfydY = p.ys * iw;
        ddb.x *= dfxdX;
        ddb.y *= dfydY;
        ddb.z *= dfxdX;
        ddb.w *= dfydY;

        float da0dX = p1.y * p2.w - p2.y * p1.w;
        float da1dX = p2.y * p0.w - p0.y * p2.w;
        float da2dX = p0.y * p1.w - p1.y * p0.w;
        float da0dY = p2.x * p1.w - p1.x * p2.w;
        float da1dY = p0.x * p2.w - p2.x * p0.w;
        float da2dY = p1.x * p0.w - p0.x * p1.w;
        float datdX = da0dX + da1dX + da2dX;
        float datdY = da0dY + da1dY + da2dY;

        float x01 = p0.x - p1.x;
        float x12 = p1.x - p2.x;
        float x20 = p2.x - p0.x;
        float y01 = p0.y - p1.y;
        float y12 = p1.y - p2.y;
        float y20 = p2.y - p0.y;
        float w01 = p0.w - p1.w;
        float w12 = p1.w - p2.w;
        float w20 = p2.w - p0.w;

        float a0p1 = fy * p2.x - fx * p2.y;
        float a0p2 = fx * p1.y - fy * p1.x;
        float a1p0 = fx * p2.y - fy * p2.x;
        float a1p2 = fy * p0.x - fx * p0.y;

        float wdudX = 2.f * b0 * datdX - da0dX;
        float wdudY = 2.f * b0 * datdY - da0dY;
        float wdvdX = 2.f * b1 * datdX - da1dX;
        float wdvdY = 2.f * b1 * datdY - da1dY;

        float c0 = iw * (ddb.x * wdudX + ddb.y * wdudY + ddb.z * wdvdX + ddb.w * wdvdY);
        float cx = c0 * fx - ddb.x * b0 - ddb.z * b1;
        float cy = c0 * fy - ddb.y * b0 - ddb.w * b1;
        float cxy = iw * (ddb.x * datdX + ddb.y * datdY);
        float czw = iw * (ddb.z * datdX + ddb.w * datdY);

        gp0x += c0 * y12 - cy * w12 + czw * p2y + ddb.w * p2.w;
        gp1x += c0 * y20 - cy * w20 - cxy * p2y - ddb.y * p2.w;
        gp2x += c0 * y01 - cy * w01 + cxy * p1y - czw * p0y + ddb.y * p1.w - ddb.w * p0.w;
        gp0y += cx * w12 - c0 * x12 - czw * p2x - ddb.z * p2.w;
        gp1y += cx * w20 - c0 * x20 + cxy * p2x + ddb.x * p2.w;
        gp2y += cx * w01 - c0 * x01 - cxy * p1x + czw * p0x - ddb.x * p1.w + ddb.z * p0.w;
        gp0w += cy * x12 - cx * y12 - czw * a1p0 + ddb.z * p2.y - ddb.w * p2.x;
        gp1w += cy * x20 - cx * y20 - cxy * a0p1 - ddb.x * p2.y + ddb.y * p2.x;
        gp2w += cy * x01 - cx * y01 - cxy * a0p2 - czw * a1p2 + ddb.x * p1.y - ddb.y * p1.x - ddb.z * p0.y + ddb.w * p0.x;
    }

    // Accumulate using coalesced atomics.
    caAtomicAdd3_xyw(p.grad + 4 * vi0, gp0x, gp0y, gp0w);
    caAtomicAdd3_xyw(p.grad + 4 * vi1, gp1x, gp1y, gp1w);
    caAtomicAdd3_xyw(p.grad + 4 * vi2, gp2x, gp2y, gp2w);
}
// Template specializations.
__global__ void RasterizeGradKernel (const RasterizeGradParams p) { RasterizeGradKernelTemplate<false>(p); }
__global__ void RasterizeGradKernelDb(const RasterizeGradParams p) { RasterizeGradKernelTemplate<true>(p); }
//------------------------------------------------------------------------
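// Illustrative sketch only, not part of this commit: one way the gradient
// kernel above could be launched. The helper name is hypothetical; the real
// dispatch code lives in the op wrappers. It shows how the block size
// constants bound the per-tile thread count and how the minibatch maps onto
// blockIdx.z.
static void launchRasterizeGradSketch(const RasterizeGradParams& p, bool enableDb, cudaStream_t stream)
{
    // One thread per pixel, tiled by the declared maximum block extents.
    dim3 blockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, 1);
    dim3 gridSize((p.width  + blockSize.x - 1) / blockSize.x,
                  (p.height + blockSize.y - 1) / blockSize.y,
                  p.depth);

    // Pick the specialization matching the presence of ddb gradients.
    void* func = enableDb ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel;
    void* args[] = { (void*)&p };
    cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream);
}
//------------------------------------------------------------------------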
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
// Constants and helpers.
#define RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
//------------------------------------------------------------------------
// Gradient CUDA kernel params.
struct RasterizeGradParams
{
    const float*    pos;            // Incoming position buffer.
    const int*      tri;            // Incoming triangle buffer.
    const float*    out;            // Rasterizer output buffer.
    const float*    dy;             // Incoming gradients of rasterizer output buffer.
    const float*    ddb;            // Incoming gradients of bary diff output buffer.
    float*          grad;           // Outgoing position gradients.
    int             numTriangles;   // Number of triangles.
    int             numVertices;    // Number of vertices.
    int             width;          // Image width.
    int             height;         // Image height.
    int             depth;          // Size of minibatch.
    int             instance_mode;  // 1 if in instance rendering mode.
    float           xs, xo, ys, yo; // Pixel position to clip-space x, y transform.
};
//------------------------------------------------------------------------
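// A minimal sketch under an assumed convention, not taken from this commit:
// if fx = px * xs + xo is meant to map pixel centers to clip-space x in
// [-1, 1], then xs = 2/width and xo = 1/width - 1, and likewise for y. The
// helper below is hypothetical and only illustrates that mapping.
static inline void setPixelTransformSketch(RasterizeGradParams& p)
{
    p.xs = 2.f / (float)p.width;
    p.xo = 1.f / (float)p.width - 1.f;  // px = 0 lands at -1 + 1/width (pixel center).
    p.ys = 2.f / (float)p.height;
    p.yo = 1.f / (float)p.height - 1.f;
}
//------------------------------------------------------------------------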
// The rest is OpenGL-related; it is skipped when compiling CUDA kernels for
// torch (i.e., when both NVDR_TORCH and __CUDACC__ are defined).
#if !defined(NVDR_TORCH) || !defined(__CUDACC__)
#include "glutil.inl"
//------------------------------------------------------------------------
// Draw command struct used by rasterizer.
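// Field layout matches OpenGL's DrawElementsIndirectCommand, so an array of
// these can be fed directly to glMultiDrawElementsIndirect.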
struct GLDrawCmd
{
    uint32_t count;
    uint32_t instanceCount;
    uint32_t firstIndex;
    uint32_t baseVertex;
    uint32_t baseInstance;
};
//------------------------------------------------------------------------
// OpenGL-related persistent state for forward op.
struct RasterizeGLState
{
    int                     width;      // Allocated frame buffer width.
    int                     height;     // Allocated frame buffer height.
    int                     depth;      // Allocated frame buffer depth.
    int                     posCount;   // Allocated position buffer in floats.
    int                     triCount;   // Allocated triangle buffer in ints.
    GLContext               glctx;
    GLuint                  glFBO;
    GLuint                  glColorBuffer[2];
    GLuint                  glDepthStencilBuffer;
    GLuint                  glVAO;
    GLuint                  glTriBuffer;
    GLuint                  glPosBuffer;
    GLuint                  glProgram;
    GLuint                  glVertexShader;
    GLuint                  glGeometryShader;
    GLuint                  glFragmentShader;
    cudaGraphicsResource_t  cudaColorBuffer[2];
    cudaGraphicsResource_t  cudaPosBuffer;
    cudaGraphicsResource_t  cudaTriBuffer;
    std::vector<GLDrawCmd>  drawCmdBuffer;
    int                     enableDB;
};
//------------------------------------------------------------------------
// Shared C++ code prototypes.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s);
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth);
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth);
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
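// Expected call order (inferred from the signatures, not spelled out here):
// rasterizeInitGLContext() once per context, rasterizeResizeBuffers() whenever
// the required buffer sizes grow, then rasterizeRender() followed by
// rasterizeCopyResults() to map the frame into CUDA-visible memory.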
//------------------------------------------------------------------------
#endif // !defined(NVDR_TORCH) || !defined(__CUDACC__)
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "framework.h"
#include "texture.h"
//------------------------------------------------------------------------
// Mip stack construction and access helpers.
void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p)
{
    char buf[1024];
    int bufsz = 1024;

    std::string msg = "Mip-map size error - cannot downsample an odd extent greater than 1. Resize the texture so that both spatial extents are powers of two, or limit the number of mip maps using the max_mip_level argument.\n";

    int w = p.texWidth;
    int h = p.texHeight;
    bool ew = false;
    bool eh = false;

    msg += "Attempted mip stack construction:\n";
    msg += "level width height\n";
    msg += "----- ----- ------\n";
    snprintf(buf, bufsz, "base %5d %5d\n", w, h);
    msg += buf;

    int level = 0;
    while ((w|h) > 1 && !(ew || eh)) // Stop at first impossible size.
    {
        // Current level.
        level += 1;

        // Determine if downsampling fails.
        ew = ew || (w > 1 && (w & 1));
        eh = eh || (h > 1 && (h & 1));

        // Downsample.
        if (w > 1) w >>= 1;
        if (h > 1) h >>= 1;

        // Append level size to error message.
        snprintf(buf, bufsz, "mip %-2d ", level);
        msg += buf;
        if (ew) snprintf(buf, bufsz, " err ");
        else    snprintf(buf, bufsz, "%5d ", w);
        msg += buf;
        if (eh) snprintf(buf, bufsz, " err\n");
        else    snprintf(buf, bufsz, "%5d\n", h);
        msg += buf;
    }

    NVDR_CHECK(0, msg);
}
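// Example of the table the function above produces (illustrative): for a
// 12x8 texture the message would read
//
//   level width height
//   ----- ----- ------
//   base     12      8
//   mip 1     6      4
//   mip 2     3      2
//   mip 3   err      1
//
// because the 3-wide extent at mip 2 cannot be halved again.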
int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p)
{
    // No levels at all?
    if (p.mipLevelLimit == 0)
    {
        p.mipOffset[0] = 0;
        p.mipLevelMax = 0;
        return 0;
    }

    // Current level size.
    int w = p.texWidth;
    int h = p.texHeight;

    p.mipOffset[0] = 0;
    int mipTotal = 0;
    int level = 0;
    int c = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE) ? (p.channels * 6) : p.channels;
    while ((w|h) > 1)
    {
        // Current level.
        level += 1;

        // Quit if cannot downsample.
        if ((w > 1 && (w & 1)) || (h > 1 && (h & 1)))
            raiseMipSizeError(NVDR_CTX_PARAMS, p);

        // Downsample.
        if (w > 1) w >>= 1;
        if (h > 1) h >>= 1;

        p.mipOffset[level] = mipTotal;
        mipTotal += w * h * p.texDepth * c;

        // Hit the level limit?
        if (p.mipLevelLimit >= 0 && level == p.mipLevelLimit)
            break;
    }

    p.mipLevelMax = level;
    return mipTotal;
}
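// Worked example for calculateMipInfo() above (illustrative): a 16x16 texture
// with 3 channels, texDepth 1, non-cube, and no level limit yields mips 8x8,
// 4x4, 2x2, 1x1. The loop sets mipOffset[1] = 0, mipOffset[2] = 192,
// mipOffset[3] = 240, mipOffset[4] = 252, returns (64 + 16 + 4 + 1) * 3 = 255
// floats of mip storage, and leaves mipLevelMax = 4.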
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "framework.h"
//------------------------------------------------------------------------
// Constants.
#define TEX_DEBUG_MIP_RETAIN_VARIANCE 0 // For debugging
#define TEX_FWD_MAX_KERNEL_BLOCK_WIDTH 8
#define TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT 8
#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH 8
#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8
#define TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH 8
#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8
#define TEX_MAX_MIP_LEVEL 14 // Currently a texture cannot be larger than 2 GB because we use 32-bit indices everywhere.
#define TEX_MODE_NEAREST 0 // Nearest on base level.
#define TEX_MODE_LINEAR 1 // Bilinear on base level.
#define TEX_MODE_LINEAR_MIPMAP_NEAREST 2 // Bilinear on nearest mip level.
#define TEX_MODE_LINEAR_MIPMAP_LINEAR 3 // Trilinear.
#define TEX_MODE_COUNT 4
#define TEX_BOUNDARY_MODE_CUBE 0 // Cube map mode.
#define TEX_BOUNDARY_MODE_WRAP 1 // Wrap (u, v).
#define TEX_BOUNDARY_MODE_CLAMP 2 // Clamp (u, v).
#define TEX_BOUNDARY_MODE_ZERO 3 // Pad with zeros.
#define TEX_BOUNDARY_MODE_COUNT 4
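// For reference, these correspond to the filter_mode strings 'nearest',
// 'linear', 'linear-mipmap-nearest', 'linear-mipmap-linear' and the
// boundary_mode strings 'cube', 'wrap', 'clamp', 'zero' exposed by the
// Python-level ops.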
//------------------------------------------------------------------------
// CUDA kernel params.
struct TextureKernelParams
{
    const float*    tex;            // Incoming texture buffer.
    const float*    uv;             // Incoming texcoord buffer.
    const float*    uvDA;           // Incoming uv pixel diffs. NULL if mips disabled.
    const float*    dy;             // Incoming output gradient.
    float*          mip;            // Mip data buffer.
    float*          out;            // Outgoing texture data.
    float*          gradTex;        // Outgoing texture gradient.
    float*          gradTexMip;     // Temporary texture gradients for mip levels > 0.
    float*          gradUV;         // Outgoing texcoord gradient.
    float*          gradUVDA;       // Outgoing texcoord pixel differential gradient.
    int             enableMip;      // If true, we have uv_da input and mip output tensor.
    int             filterMode;     // One of the TEX_MODE_ constants.
    int             boundaryMode;   // One of the TEX_BOUNDARY_MODE_ constants.
    int             texConst;       // If true, texture is known to be constant.
    int             mipLevelLimit;  // Mip level limit coming from the op.
    int             channels;       // Number of texture channels.
    int             imgWidth;       // Image width.
    int             imgHeight;      // Image height.
    int             texWidth;       // Texture width.
    int             texHeight;      // Texture height.
    int             texDepth;       // Texture depth.
    int             n;              // Minibatch size.
    int             mipLevelMax;    // Maximum mip level index. Zero if mips disabled.
    int             mipOffset[TEX_MAX_MIP_LEVEL]; // Offsets in mip data. 0: unused, 1+: offset to mip.
    int             mipLevelOut;    // Mip level being calculated in builder kernel.
};
//------------------------------------------------------------------------
// C++ helper function prototypes.
void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p);
int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p);
//------------------------------------------------------------------------
// Macros.
#define mipLevelSize(p, i) make_int2(((p).texWidth >> (i)) > 1 ? ((p).texWidth >> (i)) : 1, ((p).texHeight >> (i)) > 1 ? ((p).texHeight >> (i)) : 1)
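// Example: for a 64x16 texture, mipLevelSize(p, 3) yields int2(8, 2) and
// mipLevelSize(p, 5) yields int2(2, 1) -- each extent halves per level and
// clamps at 1 once exhausted.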
//------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
from .ops import rasterize, interpolate, texture, antialias
from .plugin_loader import set_cache_dir
__all__ = ["rasterize", "interpolate", "texture", "antialias", "set_cache_dir"]
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
// TF-specific helpers.
#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal("Cuda error: ", cudaGetErrorName(err), "[", #CUDA_CALL, ";]")); } while (0)
#define OP_CHECK_GL_ERROR(CTX, GL_CALL) do { GL_CALL; GLenum err = glGetError(); OP_REQUIRES(CTX, err == GL_NO_ERROR, errors::Internal("OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]")); } while (0)
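// Hypothetical call sites, only to show the intended shape of the checks:
//   OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(ptr, 0, bytes, stream));
//   OP_CHECK_GL_ERROR(ctx, glBindFramebuffer(GL_FRAMEBUFFER, fbo));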
// CUDA kernels and C++ all together. What an absolute compilation unit.
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#include "../common/framework.h"
#include "../common/common.h"
#include "../common/common.cpp"
#include "../common/rasterize.h"
#include "../common/rasterize.cpp"
#include "../common/rasterize.cu"
#include "tf_rasterize.cu"
#include "../common/interpolate.cu"
#include "tf_interpolate.cu"
#include "../common/texture.cpp"
#include "../common/texture.cu"
#include "tf_texture.cu"
#include "../common/antialias.cu"
#include "tf_antialias.cu"
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
from .ops import RasterizeGLContext, get_log_level, set_log_level, rasterize, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash
__all__ = ["RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"]
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include <tuple>
//------------------------------------------------------------------------
// Op prototypes. Return type macros for readability.
#define OP_RETURN_T torch::Tensor
#define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
#define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges);
OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
OP_RETURN_T rasterize_grad_db (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb);
OP_RETURN_TT interpolate_fwd (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri);
OP_RETURN_TT interpolate_fwd_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
OP_RETURN_TT interpolate_grad (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy);
OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode);
OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode);
OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri);
OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash);
OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer);
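// Note the pairing: each forward op above has matching gradient op(s) that
// take the forward op's saved tensors plus the incoming gradients (dy, and
// ddb/dda where differentials are enabled).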
//------------------------------------------------------------------------
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    // State classes.
    pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool>())
        .def("set_context",     &RasterizeGLStateWrapper::setContext)
        .def("release_context", &RasterizeGLStateWrapper::releaseContext);
    pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper");
    pybind11::class_<TopologyHashWrapper>(m, "TopologyHashWrapper");

    // Plumbing to torch/c10 logging system.
    m.def("get_log_level", [](void)      { return FLAGS_caffe2_log_level; },  "get log level");
    m.def("set_log_level", [](int level) { FLAGS_caffe2_log_level = level; }, "set log level");

    // Ops.
    m.def("rasterize_fwd",                      &rasterize_fwd,                      "rasterize forward op");
    m.def("rasterize_grad",                     &rasterize_grad,                     "rasterize gradient op ignoring db gradients");
    m.def("rasterize_grad_db",                  &rasterize_grad_db,                  "rasterize gradient op with db gradients");
    m.def("interpolate_fwd",                    &interpolate_fwd,                    "interpolate forward op without attribute derivatives");
    m.def("interpolate_fwd_da",                 &interpolate_fwd_da,                 "interpolate forward op with attribute derivatives");
    m.def("interpolate_grad",                   &interpolate_grad,                   "interpolate gradient op without attribute derivatives");
    m.def("interpolate_grad_da",                &interpolate_grad_da,                "interpolate gradient op with attribute derivatives");
    m.def("texture_construct_mip",              &texture_construct_mip,              "texture mipmap construction");
    m.def("texture_fwd",                        &texture_fwd,                        "texture forward op without mipmapping");
    m.def("texture_fwd_mip",                    &texture_fwd_mip,                    "texture forward op with mipmapping and texcoord derivatives");
    m.def("texture_grad_nearest",               &texture_grad_nearest,               "texture gradient op in nearest mode");
    m.def("texture_grad_linear",                &texture_grad_linear,                "texture gradient op in linear mode");
    m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode");
    m.def("texture_grad_linear_mipmap_linear",  &texture_grad_linear_mipmap_linear,  "texture gradient op in linear-mipmap-linear mode");
    m.def("antialias_construct_topology_hash",  &antialias_construct_topology_hash,  "antialias topology hash construction");
    m.def("antialias_fwd",                      &antialias_fwd,                      "antialias forward op");
    m.def("antialias_grad",                     &antialias_grad,                     "antialias gradient op");
}
//------------------------------------------------------------------------