Commit 1f95925c authored by Samuli Laine

Initial commit
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
#include "../common/framework.h"
//------------------------------------------------------------------------
// Input check helpers.
//------------------------------------------------------------------------
#ifdef _MSC_VER
#define __func__ __FUNCTION__
#endif
#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on the current GPU device"); } while(0)
#define NVDR_CHECK_CPU(...) do { nvdr_check_cpu({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must reside on CPU"); } while(0)
#define NVDR_CHECK_CONTIGUOUS(...) do { nvdr_check_contiguous({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be contiguous tensors"); } while(0)
#define NVDR_CHECK_F32(...) do { nvdr_check_f32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be float32 tensors"); } while(0)
#define NVDR_CHECK_I32(...) do { nvdr_check_i32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be int32 tensors"); } while(0)
inline void nvdr_check_cpu(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.device().type() == c10::DeviceType::CPU, func, err_msg); }
inline void nvdr_check_contiguous(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.is_contiguous(), func, err_msg); }
inline void nvdr_check_f32(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.dtype() == torch::kFloat32, func, err_msg); }
inline void nvdr_check_i32(at::ArrayRef<at::Tensor> ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.dtype() == torch::kInt32, func, err_msg); }
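//------------------------------------------------------------------------
// Illustrative sketch only (not part of the library): a typical op entry
// point runs these helpers on every tensor argument before reading any data.
// The op below is hypothetical and simply returns its first input.
inline torch::Tensor nvdr_example_validate(torch::Tensor attr, torch::Tensor tri)
{
NVDR_CHECK_DEVICE(attr, tri); // Both tensors must reside on the current GPU device.
NVDR_CHECK_CONTIGUOUS(attr, tri); // Kernels assume densely packed memory.
NVDR_CHECK_F32(attr); // Attribute data must be float32.
NVDR_CHECK_I32(tri); // Triangle indices must be int32.
return attr;
}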
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "../common/common.h"
#include "../common/interpolate.h"
//------------------------------------------------------------------------
// Kernel prototypes.
void InterpolateFwdKernel (const InterpolateKernelParams p);
void InterpolateFwdKernelDa (const InterpolateKernelParams p);
void InterpolateGradKernel (const InterpolateKernelParams p);
void InterpolateGradKernelDa(const InterpolateKernelParams p);
//------------------------------------------------------------------------
// Helper
static void set_diff_attrs(InterpolateKernelParams& p, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
{
if (diff_attrs_all)
{
p.numDiffAttr = p.numAttr;
p.diff_attrs_all = 1;
}
else
{
NVDR_CHECK(diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, "too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)");
p.numDiffAttr = diff_attrs_vec.size();
memcpy(p.diffAttrs, diff_attrs_vec.data(), diff_attrs_vec.size()*sizeof(int));
}
}
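// Hypothetical example: with p.numAttr == 8, diff_attrs_all == true requests
// differentials for all eight attributes, whereas diff_attrs_vec == {0, 2}
// (and diff_attrs_all == false) selects attributes 0 and 2 only, giving
// p.numDiffAttr == 2 and hence 2*2 = 4 out_da channels per pixel below.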
//------------------------------------------------------------------------
// Forward op.
std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
InterpolateKernelParams p = {}; // Initialize all fields to zero.
bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
p.instance_mode = (attr.sizes().size() > 2) ? 1 : 0;
// Check inputs.
if (enable_da)
{
NVDR_CHECK_DEVICE(attr, rast, tri, rast_db);
NVDR_CHECK_CONTIGUOUS(attr, rast, tri, rast_db);
NVDR_CHECK_F32(attr, rast, rast_db);
NVDR_CHECK_I32(tri);
}
else
{
NVDR_CHECK_DEVICE(attr, rast, tri);
NVDR_CHECK_CONTIGUOUS(attr, rast, tri);
NVDR_CHECK_F32(attr, rast);
NVDR_CHECK_I32(tri);
}
// Sanity checks.
NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape [>0, >0, >0, 4]");
NVDR_CHECK( tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
NVDR_CHECK((attr.sizes().size() == 2 || attr.sizes().size() == 3) && attr.size(0) > 0 && attr.size(1) > 0 && (attr.sizes().size() == 2 || attr.size(2) > 0), "attr must have shape [>0, >0, >0] or [>0, >0]");
if (p.instance_mode)
NVDR_CHECK(attr.size(0) == rast.size(0) || attr.size(0) == 1, "minibatch size mismatch between inputs rast, attr");
if (enable_da)
{
NVDR_CHECK(rast_db.sizes().size() == 4 && rast_db.size(0) > 0 && rast_db.size(1) > 0 && rast_db.size(2) > 0 && rast_db.size(3) == 4, "rast_db must have shape [>0, >0, >0, 4]");
NVDR_CHECK(rast_db.size(1) == rast.size(1) && rast_db.size(2) == rast.size(2), "spatial size mismatch between inputs rast and rast_db");
NVDR_CHECK(rast_db.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, rast_db");
}
// Extract input dimensions.
p.numVertices = attr.size(p.instance_mode ? 1 : 0);
p.numAttr = attr.size(p.instance_mode ? 2 : 1);
p.numTriangles = tri.size(0);
p.height = rast.size(1);
p.width = rast.size(2);
p.depth = rast.size(0);
// Set attribute pixel differential info if enabled, otherwise leave as zero.
if (enable_da)
set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
// Get input pointers.
p.attr = attr.data_ptr<float>();
p.rast = rast.data_ptr<float>();
p.tri = tri.data_ptr<int>();
p.rastDB = enable_da ? rast_db.data_ptr<float>() : NULL;
p.attrBC = (p.instance_mode && attr.size(0) == 1) ? 1 : 0;
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({p.depth, p.height, p.width, p.numAttr}, opts);
torch::Tensor out_da = torch::empty({p.depth, p.height, p.width, p.numDiffAttr * 2}, opts);
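// When differentials are disabled, p.numDiffAttr is zero and out_da becomes a
// valid empty (zero-channel) tensor, keeping the return signature fixed
// without allocating pixel data.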
p.out = out.data_ptr<float>();
p.outDA = enable_da ? out_da.data_ptr<float>() : NULL;
// Verify that buffers are aligned to allow float2/float4 operations.
NVDR_CHECK(!((uintptr_t)p.rast & 15), "rast input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.rastDB & 15), "rast_db input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.outDA & 7), "out_da output tensor not aligned to float2");
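// These checks normally pass for freshly allocated tensors because PyTorch's
// CUDA caching allocator returns generously aligned base pointers; they guard
// against misaligned views or slices being passed in.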
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
// Launch CUDA kernel.
void* args[] = {&p};
void* func = enable_da ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
// Return results.
return std::tuple<torch::Tensor, torch::Tensor>(out, out_da);
}
// Version without derivatives.
std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri)
{
std::vector<int> empty_vec;
torch::Tensor empty_tensor;
return interpolate_fwd_da(attr, rast, tri, empty_tensor, false, empty_vec);
}
//------------------------------------------------------------------------
// Gradient op.
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
InterpolateKernelParams p = {}; // Initialize all fields to zero.
bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
p.instance_mode = (attr.sizes().size() > 2) ? 1 : 0;
// Check inputs.
if (enable_da)
{
NVDR_CHECK_DEVICE(attr, rast, tri, dy, rast_db, dda);
NVDR_CHECK_CONTIGUOUS(attr, rast, tri, rast_db);
NVDR_CHECK_F32(attr, rast, dy, rast_db, dda);
NVDR_CHECK_I32(tri);
}
else
{
NVDR_CHECK_DEVICE(attr, rast, tri, dy);
NVDR_CHECK_CONTIGUOUS(attr, rast, tri);
NVDR_CHECK_F32(attr, rast, dy);
NVDR_CHECK_I32(tri);
}
// Depth of attributes.
int attr_depth = p.instance_mode ? (attr.sizes().size() > 1 ? attr.size(0) : 0) : 1;
// Sanity checks.
NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape [>0, >0, >0, 4]");
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
NVDR_CHECK((attr.sizes().size() == 2 || attr.sizes().size() == 3) && attr.size(0) > 0 && attr.size(1) > 0 && (attr.sizes().size() == 2 || attr.size(2) > 0), "attr must have shape [>0, >0, >0] or [>0, >0]");
NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) == rast.size(1) && dy.size(2) == rast.size(2) && dy.size(3) > 0, "dy must have shape [>0, height, width, >0]");
NVDR_CHECK(dy.size(3) == attr.size(attr.sizes().size() - 1), "argument count mismatch between inputs dy, attr");
NVDR_CHECK((attr_depth == rast.size(0) || attr_depth == 1) && dy.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, dy, attr");
if (enable_da)
{
NVDR_CHECK(dda.sizes().size() == 4 && dda.size(0) > 0 && dda.size(1) == rast.size(1) && dda.size(2) == rast.size(2), "dda must have shape [>0, height, width, ?]");
NVDR_CHECK(dda.size(0) == rast.size(0), "minibatch size mismatch between rast, dda");
NVDR_CHECK(rast_db.sizes().size() == 4 && rast_db.size(0) > 0 && rast_db.size(1) > 0 && rast_db.size(2) > 0 && rast_db.size(3) == 4, "rast_db must have shape [>0, >0, >0, 4]");
NVDR_CHECK(rast_db.size(1) == rast.size(1) && rast_db.size(2) == rast.size(2), "spatial size mismatch between inputs rast and rast_db");
NVDR_CHECK(rast_db.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, rast_db");
}
// Extract input dimensions.
p.numVertices = attr.size(p.instance_mode ? 1 : 0);
p.numAttr = attr.size(p.instance_mode ? 2 : 1);
p.numTriangles = tri.size(0);
p.height = rast.size(1);
p.width = rast.size(2);
p.depth = rast.size(0);
// Ensure gradients are contiguous.
torch::Tensor dy_ = dy.contiguous();
torch::Tensor dda_;
if (enable_da)
dda_ = dda.contiguous();
// Set attribute pixel differential info if enabled, otherwise leave as zero.
if (enable_da)
set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
// Get input pointers.
p.attr = attr.data_ptr<float>();
p.rast = rast.data_ptr<float>();
p.tri = tri.data_ptr<int>();
p.dy = dy_.data_ptr<float>();
p.rastDB = enable_da ? rast_db.data_ptr<float>() : NULL;
p.dda = enable_da ? dda_.data_ptr<float>() : NULL;
p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0;
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor gradAttr = torch::zeros_like(attr);
torch::Tensor gradRaster = torch::empty_like(rast);
torch::Tensor gradRasterDB;
if (enable_da)
gradRasterDB = torch::empty_like(rast_db);
p.gradAttr = gradAttr.data_ptr<float>();
p.gradRaster = gradRaster.data_ptr<float>();
p.gradRasterDB = enable_da ? gradRasterDB.data_ptr<float>() : NULL;
// Verify that buffers are aligned to allow float2/float4 operations.
NVDR_CHECK(!((uintptr_t)p.rast & 15), "rast input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.rastDB & 15), "rast_db input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.dda & 7), "dda input tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.gradRaster & 15), "grad_rast output tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.gradRasterDB & 15), "grad_rast_db output tensor not aligned to float4");
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
// Launch CUDA kernel.
void* args[] = {&p};
void* func = enable_da ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
// Return results.
return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>(gradAttr, gradRaster, gradRasterDB);
}
// Version without derivatives.
std::tuple<torch::Tensor, torch::Tensor> interpolate_grad(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy)
{
std::vector<int> empty_vec;
torch::Tensor empty_tensor;
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> result = interpolate_grad_da(attr, rast, tri, dy, empty_tensor, empty_tensor, false, empty_vec);
return std::tuple<torch::Tensor, torch::Tensor>(std::get<0>(result), std::get<1>(result));
}
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include "../common/common.h"
#include "../common/rasterize.h"
#include <tuple>
//------------------------------------------------------------------------
// Kernel prototypes.
void RasterizeGradKernel(const RasterizeGradParams p);
void RasterizeGradKernelDb(const RasterizeGradParams p);
//------------------------------------------------------------------------
// Python GL state wrapper methods.
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_)
{
pState = new RasterizeGLState();
automatic = automatic_;
memset(pState, 0, sizeof(RasterizeGLState));
pState->enableDB = enableDB ? 1 : 0;
rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState);
releaseGLContext();
}
RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void)
{
destroyGLContext(pState->glctx);
delete pState;
}
void RasterizeGLStateWrapper::setContext(void)
{
setGLContext(pState->glctx);
}
void RasterizeGLStateWrapper::releaseContext(void)
{
releaseGLContext();
}
//------------------------------------------------------------------------
// Forward op.
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGLState& s = *stateWrapper.pState;
// Check inputs.
NVDR_CHECK_DEVICE(pos, tri);
NVDR_CHECK_CPU(ranges);
NVDR_CHECK_CONTIGUOUS(pos, tri, ranges);
NVDR_CHECK_F32(pos);
NVDR_CHECK_I32(tri, ranges);
// Determine number of outputs
int num_outputs = s.enableDB ? 2 : 1;
// Determine instance mode and check input dimensions.
bool instance_mode = pos.sizes().size() > 2;
if (instance_mode)
NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "instance mode - pos must have shape [>0, >0, 4]");
else
{
NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "range mode - pos must have shape [>0, 4]");
NVDR_CHECK(ranges.sizes().size() == 2 && ranges.size(0) > 0 && ranges.size(1) == 2, "range mode - ranges must have shape [>0, 2]");
}
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
// Get output shape.
int height = std::get<0>(resolution);
int width = std::get<1>(resolution);
int depth = instance_mode ? pos.size(0) : ranges.size(0);
NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]");
// Get position and triangle buffer sizes in int32/float32.
int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1);
int triCount = 3 * tri.size(0);
// Set the GL context unless manual context.
if (stateWrapper.automatic)
setGLContext(s.glctx);
// Resize all buffers.
rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, posCount, triCount, width, height, depth);
// Copy input data to GL and render.
const float* posPtr = pos.data_ptr<float>();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr<int32_t>(); // This is in CPU memory.
const int32_t* triPtr = tri.data_ptr<int32_t>();
int vtxPerInstance = instance_mode ? pos.size(1) : 0;
rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth);
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({depth, height, width, 4}, opts);
torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts);
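// When s.enableDB is off, out_db is allocated with zero channels: a valid
// empty tensor that preserves the two-tensor return signature at no memory
// cost.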
float* outputPtr[2];
outputPtr[0] = out.data_ptr<float>();
outputPtr[1] = s.enableDB ? out_db.data_ptr<float>() : NULL;
// Copy rasterized results into CUDA buffers.
rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth);
// Done. Release GL context and return.
if (stateWrapper.automatic)
releaseGLContext();
return std::tuple<torch::Tensor, torch::Tensor>(out, out_db);
}
//------------------------------------------------------------------------
// Gradient op.
torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGradParams p = {}; // Initialize all fields to zero.
bool enable_db = ddb.defined();
// Check inputs.
if (enable_db)
{
NVDR_CHECK_DEVICE(pos, tri, out, dy, ddb);
NVDR_CHECK_CONTIGUOUS(pos, tri, out);
NVDR_CHECK_F32(pos, out, dy, ddb);
NVDR_CHECK_I32(tri);
}
else
{
NVDR_CHECK_DEVICE(pos, tri, out, dy);
NVDR_CHECK_CONTIGUOUS(pos, tri, out);
NVDR_CHECK_F32(pos, out, dy);
NVDR_CHECK_I32(tri);
}
// Determine instance mode.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
// Shape is taken from the rasterizer output tensor.
NVDR_CHECK(out.sizes().size() == 4, "tensor out must be rank-4");
p.depth = out.size(0);
p.height = out.size(1);
p.width = out.size(2);
NVDR_CHECK(p.depth > 0 && p.height > 0 && p.width > 0, "resolution must be [>0, >0, >0]");
// Check other shapes.
if (p.instance_mode)
NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) == p.depth && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [depth, >0, 4]");
else
NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, 4]");
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
NVDR_CHECK(out.sizes().size() == 4 && out.size(0) == p.depth && out.size(1) == p.height && out.size(2) == p.width && out.size(3) == 4, "out must have shape [depth, height, width, 4]");
NVDR_CHECK( dy.sizes().size() == 4 && dy.size(0) == p.depth && dy.size(1) == p.height && dy.size(2) == p.width && dy.size(3) == 4, "dy must have shape [depth, height, width, 4]");
if (enable_db)
NVDR_CHECK(ddb.sizes().size() == 4 && ddb.size(0) == p.depth && ddb.size(1) == p.height && ddb.size(2) == p.width && ddb.size(3) == 4, "ddb must have shape [depth, height, width, 4]");
// Ensure gradients are contiguous.
torch::Tensor dy_ = dy.contiguous();
torch::Tensor ddb_;
if (enable_db)
ddb_ = ddb.contiguous();
// Populate parameters.
p.numTriangles = tri.size(0);
p.numVertices = p.instance_mode ? pos.size(1) : pos.size(0);
p.pos = pos.data_ptr<float>();
p.tri = tri.data_ptr<int>();
p.out = out.data_ptr<float>();
p.dy = dy_.data_ptr<float>();
p.ddb = enable_db ? ddb_.data_ptr<float>() : NULL;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
p.ys = 2.f / (float)p.height;
p.yo = 1.f / (float)p.height - 1.f;
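// Derivation: pixel center i in [0, width) maps to clip-space
// x = 2*(i + 0.5)/width - 1 = (2/width)*i + (1/width - 1), hence xs = 2/width
// and xo = 1/width - 1; ys and yo follow analogously from height.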
// Allocate output tensor for position gradients.
torch::Tensor grad = torch::zeros_like(pos);
p.grad = grad.data_ptr<float>();
// Verify that buffers are aligned to allow float2/float4 operations.
NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.dy & 7), "dy input tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.ddb & 15), "ddb input tensor not aligned to float4");
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
// Launch CUDA kernel.
void* args[] = {&p};
void* func = enable_db ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel;
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
// Return the gradients.
return grad;
}
// Version without derivatives.
torch::Tensor rasterize_grad(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy)
{
torch::Tensor empty_tensor;
return rasterize_grad_db(pos, tri, out, dy, empty_tensor);
}
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include "../common/common.h"
#include "../common/texture.h"
#include <cuda_runtime.h>
//------------------------------------------------------------------------
// Kernel prototypes.
void MipBuildKernel1 (const TextureKernelParams p);
void MipBuildKernel2 (const TextureKernelParams p);
void MipBuildKernel4 (const TextureKernelParams p);
void TextureFwdKernelNearest1 (const TextureKernelParams p);
void TextureFwdKernelNearest2 (const TextureKernelParams p);
void TextureFwdKernelNearest4 (const TextureKernelParams p);
void TextureFwdKernelLinear1 (const TextureKernelParams p);
void TextureFwdKernelLinear2 (const TextureKernelParams p);
void TextureFwdKernelLinear4 (const TextureKernelParams p);
void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p);
void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p);
void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p);
void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p);
void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p);
void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p);
void TextureFwdKernelCubeNearest1 (const TextureKernelParams p);
void TextureFwdKernelCubeNearest2 (const TextureKernelParams p);
void TextureFwdKernelCubeNearest4 (const TextureKernelParams p);
void TextureFwdKernelCubeLinear1 (const TextureKernelParams p);
void TextureFwdKernelCubeLinear2 (const TextureKernelParams p);
void TextureFwdKernelCubeLinear4 (const TextureKernelParams p);
void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p);
void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p);
void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p);
void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p);
void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p);
void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p);
void MipGradKernel1 (const TextureKernelParams p);
void MipGradKernel2 (const TextureKernelParams p);
void MipGradKernel4 (const TextureKernelParams p);
void TextureGradKernelNearest (const TextureKernelParams p);
void TextureGradKernelLinear (const TextureKernelParams p);
void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p);
void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p);
void TextureGradKernelCubeNearest (const TextureKernelParams p);
void TextureGradKernelCubeLinear (const TextureKernelParams p);
void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p);
void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p);
//------------------------------------------------------------------------
// Mode selection helper.
static void set_modes(TextureKernelParams& p, int filter_mode, int boundary_mode, int max_mip_level)
{
// Mip and filter modes.
p.filterMode = filter_mode;
NVDR_CHECK(p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, "filter_mode unsupported");
p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR);
// Mip level clamp.
if (p.enableMip)
{
p.mipLevelLimit = max_mip_level;
NVDR_CHECK(p.mipLevelLimit >= -1, "invalid max_mip_level");
}
// Boundary mode.
p.boundaryMode = boundary_mode;
NVDR_CHECK(p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, "boundary_mode unsupported");
}
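// The mip level limit only matters in the mipmapped filter modes:
// max_mip_level == -1 places no explicit cap on the level count, while 0
// restricts sampling to the base level.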
//------------------------------------------------------------------------
// Mipmap construction.
TextureMipWrapper texture_construct_mip(torch::Tensor tex, int max_mip_level, bool cube_mode)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
TextureKernelParams p = {}; // Initialize all fields to zero.
p.mipLevelLimit = max_mip_level;
p.boundaryMode = cube_mode ? TEX_BOUNDARY_MODE_CUBE : TEX_BOUNDARY_MODE_WRAP;
NVDR_CHECK(p.mipLevelLimit >= -1, "invalid max_mip_level");
// Check inputs.
NVDR_CHECK_DEVICE(tex);
NVDR_CHECK_CONTIGUOUS(tex);
NVDR_CHECK_F32(tex);
// Populate parameters and sanity check tex shape.
if (!cube_mode)
{
NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape [>0, >0, >0, >0]");
}
else
{
NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape [>0, 6, >0, >0, >0] in cube map mode");
NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode");
}
p.texDepth = tex.size(0);
p.texHeight = tex.size(cube_mode ? 2 : 1);
p.texWidth = tex.size(cube_mode ? 3 : 2);
p.channels = tex.size(cube_mode ? 4 : 3);
// Set texture pointer.
p.tex = tex.data_ptr<float>();
// Set mip offsets and calculate total size.
int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p);
// Allocate and set mip tensor.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor mip = torch::empty({mipTotal}, opts);
p.mip = mip.data_ptr<float>();
// Choose kernel variants based on channel count.
void* args[] = {&p};
int channel_div_idx = 0;
if (!(p.channels & 3))
channel_div_idx = 2; // Channel count divisible by 4.
else if (!(p.channels & 1))
channel_div_idx = 1; // Channel count divisible by 2.
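// The bit tests above are cheap divisibility checks: (p.channels & 3) == 0
// iff the channel count is a multiple of 4, and (p.channels & 1) == 0 iff it
// is even. For example, channels == 6 selects the float2 kernel variant
// (channel_div_idx == 1).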
// Build mip levels.
for (int i=1; i <= p.mipLevelMax; i++)
{
int2 ms = mipLevelSize(p, i);
int3 sz = make_int3(ms.x, ms.y, p.texDepth);
dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y);
dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1));
p.mipLevelOut = i;
void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 };
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream));
}
// Return the mip tensor in a wrapper.
TextureMipWrapper mip_wrap;
mip_wrap.mip = mip;
mip_wrap.max_mip_level = max_mip_level;
mip_wrap.texture_size = tex.sizes().vec();
mip_wrap.cube_mode = cube_mode;
return mip_wrap;
}
//------------------------------------------------------------------------
// Forward op.
torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip_wrap, int filter_mode, int boundary_mode)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
TextureKernelParams p = {}; // Initialize all fields to zero.
torch::Tensor& mip = mip_wrap.mip; // Unwrap.
int max_mip_level = mip_wrap.max_mip_level;
set_modes(p, filter_mode, boundary_mode, max_mip_level);
if (p.enableMip)
{
NVDR_CHECK(uv_da.defined(), "mipmapping filter mode requires uv_da input");
NVDR_CHECK(mip.defined(), "mipmapping filter mode requires mip tensor input");
}
// Check inputs.
if (p.enableMip)
{
NVDR_CHECK_DEVICE(tex, uv, uv_da, mip);
NVDR_CHECK_CONTIGUOUS(tex, uv, uv_da, mip);
NVDR_CHECK_F32(tex, uv, uv_da, mip);
}
else
{
NVDR_CHECK_DEVICE(tex, uv);
NVDR_CHECK_CONTIGUOUS(tex, uv);
NVDR_CHECK_F32(tex, uv);
}
// Sanity checks and state setters.
bool cube_mode = (boundary_mode == TEX_BOUNDARY_MODE_CUBE);
if (!cube_mode)
{
NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape [>0, >0, >0, >0]");
NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 2, "uv must have shape [>0, >0, >0, 2]");
p.texHeight = tex.size(1);
p.texWidth = tex.size(2);
p.channels = tex.size(3);
}
else
{
NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape [>0, 6, >0, >0, >0] in cube map mode");
NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 3, "uv must have shape [>0, >0, >0, 3] in cube map mode");
NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode");
p.texHeight = tex.size(2);
p.texWidth = tex.size(3);
p.channels = tex.size(4);
}
NVDR_CHECK(tex.size(0) == 1 || tex.size(0) == uv.size(0), "minibatch size mismatch between inputs tex, uv");
NVDR_CHECK(p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), "texture size too large");
p.n = uv.size(0);
p.imgHeight = uv.size(1);
p.imgWidth = uv.size(2);
p.texDepth = tex.size(0);
if (p.enableMip)
{
if (!cube_mode)
NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]");
else
NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode");
}
// Get input pointers.
p.tex = tex.data_ptr<float>();
p.uv = uv.data_ptr<float>();
p.uvDA = p.enableMip ? uv_da.data_ptr<float>() : NULL;
// Allocate output tensor.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({p.n, p.imgHeight, p.imgWidth, p.channels}, opts);
p.out = out.data_ptr<float>();
// Choose kernel variants based on channel count.
void* args[] = {&p};
int channel_div_idx = 0;
if (!(p.channels & 3))
channel_div_idx = 2; // Channel count divisible by 4.
else if (!(p.channels & 1))
channel_div_idx = 1; // Channel count divisible by 2.
// Mip-related setup.
if (p.enableMip)
{
// Generate mip offsets, check mipmap size, and set mip data pointer.
int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p);
NVDR_CHECK(tex.sizes() == mip_wrap.texture_size && cube_mode == mip_wrap.cube_mode, "mip does not match texture size");
NVDR_CHECK(mip.sizes().size() == 1 && mip.size(0) == mipTotal, "mip tensor size mismatch");
p.mip = mip.data_ptr<float>();
}
// Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
if (!cube_mode)
NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2");
if ((p.channels & 3) == 0)
{
NVDR_CHECK(!((uintptr_t)p.tex & 15), "tex input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.mip & 15), "mip input tensor not aligned to float4");
}
if ((p.channels & 1) == 0)
{
NVDR_CHECK(!((uintptr_t)p.tex & 7), "tex input tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.out & 7), "out output tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.mip & 7), "mip input tensor not aligned to float2");
}
if (!cube_mode)
NVDR_CHECK(!((uintptr_t)p.uvDA & 15), "uv_da input tensor not aligned to float4");
else
NVDR_CHECK(!((uintptr_t)p.uvDA & 7), "uv_da input tensor not aligned to float2");
// Choose launch parameters for texture lookup kernel.
dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
// Choose kernel based on filter mode, cube mode, and datatype.
void* func_tbl[TEX_MODE_COUNT * 3 * 2] = {
(void*)TextureFwdKernelNearest1,
(void*)TextureFwdKernelNearest2,
(void*)TextureFwdKernelNearest4,
(void*)TextureFwdKernelLinear1,
(void*)TextureFwdKernelLinear2,
(void*)TextureFwdKernelLinear4,
(void*)TextureFwdKernelLinearMipmapNearest1,
(void*)TextureFwdKernelLinearMipmapNearest2,
(void*)TextureFwdKernelLinearMipmapNearest4,
(void*)TextureFwdKernelLinearMipmapLinear1,
(void*)TextureFwdKernelLinearMipmapLinear2,
(void*)TextureFwdKernelLinearMipmapLinear4,
(void*)TextureFwdKernelCubeNearest1,
(void*)TextureFwdKernelCubeNearest2,
(void*)TextureFwdKernelCubeNearest4,
(void*)TextureFwdKernelCubeLinear1,
(void*)TextureFwdKernelCubeLinear2,
(void*)TextureFwdKernelCubeLinear4,
(void*)TextureFwdKernelCubeLinearMipmapNearest1,
(void*)TextureFwdKernelCubeLinearMipmapNearest2,
(void*)TextureFwdKernelCubeLinearMipmapNearest4,
(void*)TextureFwdKernelCubeLinearMipmapLinear1,
(void*)TextureFwdKernelCubeLinearMipmapLinear2,
(void*)TextureFwdKernelCubeLinearMipmapLinear4,
};
// Function index.
int func_idx = p.filterMode;
if (cube_mode)
func_idx += TEX_MODE_COUNT;
func_idx = func_idx * 3 + channel_div_idx;
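// The table is laid out as [cube_mode][filter_mode][channel_variant], so,
// assuming the filter mode enums follow the table order, the index is
// ((cube_mode ? TEX_MODE_COUNT : 0) + filter_mode) * 3 + channel_div_idx.
// For example, cube map mode with linear filtering and channels divisible
// by 4 gives (TEX_MODE_COUNT + 1) * 3 + 2 = 17, i.e. TextureFwdKernelCubeLinear4.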
// Launch kernel.
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
// Return output tensor.
return out;
}
// Version without mipmaps.
torch::Tensor texture_fwd(torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode)
{
torch::Tensor empty_tensor;
return texture_fwd_mip(tex, uv, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
}
//------------------------------------------------------------------------
// Gradient op.
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipmap_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip_wrap, int filter_mode, int boundary_mode)
{
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
TextureKernelParams p = {}; // Initialize all fields to zero.
torch::Tensor& mip = mip_wrap.mip; // Unwrap.
int max_mip_level = mip_wrap.max_mip_level;
set_modes(p, filter_mode, boundary_mode, max_mip_level);
if (p.enableMip)
{
NVDR_CHECK(uv_da.defined(), "mipmapping filter mode requires uv_da input in gradient");
NVDR_CHECK(mip.defined(), "mipmapping filter mode requires mip input in gradient");
}
// Check inputs.
if (p.enableMip)
{
NVDR_CHECK_DEVICE(tex, uv, dy, uv_da, mip);
NVDR_CHECK_CONTIGUOUS(tex, uv, uv_da, mip);
NVDR_CHECK_F32(tex, uv, dy, uv_da, mip);
}
else
{
NVDR_CHECK_DEVICE(tex, uv, dy);
NVDR_CHECK_CONTIGUOUS(tex, uv);
NVDR_CHECK_F32(tex, uv, dy);
}
// Sanity checks and state setters.
bool cube_mode = (boundary_mode == TEX_BOUNDARY_MODE_CUBE);
if (!cube_mode)
{
NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape [>0, >0, >0, >0]");
NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 2, "uv must have shape [>0, >0, >0, 2]");
p.texHeight = tex.size(1);
p.texWidth = tex.size(2);
p.channels = tex.size(3);
}
else
{
NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape [>0, 6, >0, >0, >0] in cube map mode");
NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 3, "uv must have shape [>0, >0, >0, 3] in cube map mode");
NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode");
p.texHeight = tex.size(2);
p.texWidth = tex.size(3);
p.channels = tex.size(4);
}
NVDR_CHECK(tex.size(0) == 1 || tex.size(0) == uv.size(0), "minibatch size mismatch between inputs tex, uv");
NVDR_CHECK(p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), "texture size too large");
p.n = uv.size(0);
p.imgHeight = uv.size(1);
p.imgWidth = uv.size(2);
p.texDepth = tex.size(0);
if (p.enableMip)
{
if (!cube_mode)
NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]");
else
NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode");
}
NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) == p.n && dy.size(1) == p.imgHeight && dy.size(2) == p.imgWidth && dy.size(3) == p.channels, "dy must have shape [minibatch_size, height, width, channels]");
// Get contiguous version of dy.
torch::Tensor dy_ = dy.contiguous();
// Get input pointers.
p.tex = tex.data_ptr<float>();
p.uv = uv.data_ptr<float>();
p.dy = dy_.data_ptr<float>();
p.uvDA = p.enableMip ? uv_da.data_ptr<float>() : NULL;
p.mip = p.enableMip ? (float*)mip.data_ptr<float>() : NULL;
// Allocate output tensor for tex gradient.
torch::Tensor grad_tex = torch::zeros_like(tex);
p.gradTex = grad_tex.data_ptr<float>();
// Allocate output tensor for uv gradient.
torch::Tensor grad_uv;
torch::Tensor grad_uv_da;
if (p.filterMode != TEX_MODE_NEAREST)
{
grad_uv = torch::empty_like(uv);
p.gradUV = grad_uv.data_ptr<float>();
// Allocate output tensor for uv_da gradient.
if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR)
{
grad_uv_da = torch::empty_like(uv_da);
p.gradUVDA = grad_uv_da.data_ptr<float>();
}
}
// Choose kernel variants based on channel count.
int channel_div_idx = 0;
if (!(p.channels & 3))
channel_div_idx = 2; // Channel count divisible by 4.
else if (!(p.channels & 1))
channel_div_idx = 1; // Channel count divisible by 2.
// Mip-related setup.
torch::Tensor grad_mip;
if (p.enableMip)
{
// Generate mip offsets and get space for temporary mip gradients.
int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p);
NVDR_CHECK(tex.sizes() == mip_wrap.texture_size && cube_mode == mip_wrap.cube_mode, "mip does not match texture size");
NVDR_CHECK(mip.sizes().size() == 1 && mip.size(0) == mipTotal, "mip tensor size mismatch");
grad_mip = torch::zeros_like(mip);
p.gradTexMip = grad_mip.data_ptr<float>();
}
// Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
if (!cube_mode)
{
NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.gradUV & 7), "grad_uv output tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.uvDA & 15), "uv_da input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.gradUVDA & 15), "grad_uv_da output tensor not aligned to float4");
}
else
{
NVDR_CHECK(!((uintptr_t)p.uvDA & 7), "uv_da input tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.gradUVDA & 7), "grad_uv_da output tensor not aligned to float2");
}
if ((p.channels & 3) == 0)
{
NVDR_CHECK(!((uintptr_t)p.tex & 15), "tex input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.gradTex & 15), "grad_tex output tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.dy & 15), "dy input tensor not aligned to float4");
NVDR_CHECK(!((uintptr_t)p.mip & 15), "mip input tensor not aligned to float4");
}
if ((p.channels & 1) == 0)
{
NVDR_CHECK(!((uintptr_t)p.tex & 7), "tex input tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.gradTex & 7), "grad_tex output tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.dy & 7), "dy input tensor not aligned to float2");
NVDR_CHECK(!((uintptr_t)p.mip & 7), "mip input tensor not aligned to float2");
}
// Choose launch parameters for main gradient kernel.
void* args[] = {&p};
dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
void* func_tbl[TEX_MODE_COUNT * 2] = {
(void*)TextureGradKernelNearest,
(void*)TextureGradKernelLinear,
(void*)TextureGradKernelLinearMipmapNearest,
(void*)TextureGradKernelLinearMipmapLinear,
(void*)TextureGradKernelCubeNearest,
(void*)TextureGradKernelCubeLinear,
(void*)TextureGradKernelCubeLinearMipmapNearest,
(void*)TextureGradKernelCubeLinearMipmapLinear,
};
// Function index.
int func_idx = p.filterMode;
if (cube_mode)
func_idx += TEX_MODE_COUNT;
// Launch main gradient kernel.
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
// Launch kernel to pull gradients from mip levels.
if (p.enableMip)
{
dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight);
dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1));
int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float);
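// The dynamic shared memory holds one float per channel per thread
// (blockSize.x * blockSize.y * p.channels floats), used as a staging buffer
// while pulling gradients down the mip chain.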
void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 };
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream));
}
// Return output tensors.
return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>(grad_tex, grad_uv, grad_uv_da);
}
// Version for nearest filter mode.
torch::Tensor texture_grad_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode)
{
torch::Tensor empty_tensor;
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
return std::get<0>(result);
}
// Version for linear filter mode.
std::tuple<torch::Tensor, torch::Tensor> texture_grad_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode)
{
torch::Tensor empty_tensor;
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
return std::tuple<torch::Tensor, torch::Tensor>(std::get<0>(result), std::get<1>(result));
}
// Version for linear-mipmap-nearest mode.
std::tuple<torch::Tensor, torch::Tensor> texture_grad_linear_mipmap_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode)
{
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode, boundary_mode);
return std::tuple<torch::Tensor, torch::Tensor>(std::get<0>(result), std::get<1>(result));
}
//------------------------------------------------------------------------
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
//------------------------------------------------------------------------
// Python GL state wrapper.
class RasterizeGLState;
class RasterizeGLStateWrapper
{
public:
RasterizeGLStateWrapper (bool enableDB, bool automatic);
~RasterizeGLStateWrapper (void);
void setContext (void);
void releaseContext (void);
RasterizeGLState* pState;
bool automatic;
};
//------------------------------------------------------------------------
// Mipmap wrapper that keeps the mip data opaque to the Python side.
class TextureMipWrapper
{
public:
torch::Tensor mip;
int max_mip_level;
std::vector<int64_t> texture_size; // For error checking.
bool cube_mode; // For error checking.
};
//------------------------------------------------------------------------
// Antialias topology hash wrapper that keeps the hash opaque to the Python side.
class TopologyHashWrapper
{
public:
torch::Tensor ev_hash;
};
//------------------------------------------------------------------------
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
function print_help {
echo "Usage: `basename $0` [--build-container] <python_file>"
echo ""
echo "Option --build-container will build the Docker container based on"
echo "docker/Dockerfile and tag the image with gltorch:latest."
echo ""
echo "Example: `basename $0` samples/torch/envphong.py"
}
build_container=0
sample=""
while [[ "$#" -gt 0 ]]; do
case $1 in
--build-container) build_container=1;;
-h|--help) print_help; exit 0 ;;
--*) echo "Unknown parameter passed: $1"; exit 1 ;;
*) sample="$1"; shift; break;
esac
shift
done
rest=$@
# Build the docker container
if [ "$build_container" = "1" ]; then
docker build --tag gltorch:latest -f docker/Dockerfile .
docker build --tag gltensorflow:latest --build-arg BASE_IMAGE=tensorflow/tensorflow:1.15.0-gpu-py3 -f docker/Dockerfile .
fi
if [ ! -f "$sample" ]; then
echo
echo "No python sample given or file '$sample' not found. Exiting."
exit 1
fi
image="gltorch:latest"
TENSORFLOW_CUDA_CACHE=""
# Magically choose the tensorflow container if running a sample from the samples/tensorflow/ directory
if [[ $sample == *"/tensorflow/"* ]]; then
image="gltensorflow:latest"
TENSORFLOW_CUDA_CACHE="-e NVDIFFRAST_CACHE_DIR=/app/tmp"
fi
echo "Using container image: $image"
echo "Running command: $sample $rest"
# Run a sample with docker
docker run --rm -it --gpus all --user $(id -u):$(id -g) \
-v `pwd`:/app --workdir /app -e TORCH_EXTENSIONS_DIR=/app/tmp $TENSORFLOW_CUDA_CACHE $image python3 $sample $rest
The environment map stored in samples/data/envphong.npz is derived from a Wave Engine sample material originally shared under the MIT License, which is reproduced below.
Original material: https://github.com/WaveEngine/Samples/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap
Original license: https://github.com/WaveEngine/Samples/blob/master/LICENSE.md
The mesh and texture stored in samples/data/earth.npz are derived from the "3D Earth Photorealistic 2K" model originally made available under the TurboSquid 3D Model License, which is reproduced below.
Original material: https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125
Original license: https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license
MIT License
Copyright (c) 2016 Wave Coorporation
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
TurboSquid 3D Model License
This is a legally binding agreement between licensee ("you"), and TurboSquid regarding your rights to use 3D Models from the Site under this license. "You" refers to the purchasing entity, whether that is a natural person who must be at least 18 years of age, or a corporate entity. The rights granted in this agreement are granted to the purchasing entity, its parent company, and its majority owned affiliates on a "royalty free" basis, which means that after a Purchase, there are no future royalties or payments that are required. This agreement incorporates by reference the Terms of Use as well as the Site's policies and procedures as such.
I. Introduction & Definitions
Definitions
This agreement is intended to be easy to understand, and to provide clarity for using 3D Models in the work you create ("Creations"). Over the years, TurboSquid has been asked many questions about how 3D Models may be used in Creations, and we have attempted to answer those questions in this agreement.
Some words in this agreement are given specific meanings. Words that appear initially in quotations, such as "you" and "Creations", are defined in the text preceding the word. Other capitalized words are defined below:
"3D Model" is the collection of one or more digital files, packaged in the form of a product on the Site that can be identified by a 3D Model ID, and that is made available to you for Purchase on the Site. A 3D Model may include 3D Model files, geometry, texture maps, materials, motion captures, renderings and other constituent files related to the 3D Model data and its representation.
"Site" refers to the TurboSquid websites, API's, software applications or any approved means or utility either currently in existence or in the future; the software and source code used by TurboSquid to provide such services; user interface layouts, designs, images, text, knowledgebase articles, program offers; site information provided in reports (such as popular keyword searches); and all other intellectual property protected under copyright, trademark, patent, publicity, or any other proprietary right.
"Purchase" is the acquisition of a 3D Model by you from the Site under this agreement, whether as a purchase of 3D Model made available at a price of greater than $0, or a download of 3D Model made available at no charge.
"TurboSquid" includes TurboSquid, Inc. and all licensed affiliates and partners that distribute 3D Models on behalf of TurboSquid, Inc.
"Product Page" is the product page or interface that displays 3D Models available for Purchase on the Site.
"Computer Game" is a type of Creation that includes digital games, computer-based games, handheld electronic games, mobile games, online games, web-games, social games, game mods, and console-based games.
"Imagery" is a Creation made of any single image or sequence of images.
"Depicted Intellectual Property" means any intellectual property depicted in the 3D Model, including any copyright, trademark, trade dress, right of publicity, or any other proprietary right throughout the world that may apply. For purposes of clarity, this does not refer to the copyrights owned by the creator of the 3D Model that are licensed in this agreement.
To make reading this agreement easier and less repetitive, the following constructions are used:
"Include," including," and "such as" are considered to be followed with "but not limited to." Examples are used in this agreement to illustrate, rather than limit, the scope of the terms.
"The following restrictions", "the foregoing restrictions", and "subject to the restrictions" are considered to be followed with "in addition to all other restrictions applicable within this agreement."
II. License Rights
1. Ownership. TurboSquid does not grant title or ownership in 3D Models. All rights in 3D Models not expressly granted in this agreement are reserved by TurboSquid for itself and its licensors.
2. Rights Granted. For 3D Models, TurboSquid grants to you a non-exclusive, perpetual, worldwide right and license to copy, distribute, reproduce, adapt, publicly display, publicly perform, digitally perform, transmit, broadcast, telecast, advertise, create derivative works, and market 3D Models within Creations in the uses authorized in this agreement. You may request authorization for a use not covered by this agreement ("New Use") by writing use@turbosquid.com. TurboSquid is authorized to approve a New Use if TurboSquid finds in its sole judgment that the New Use is substantially similar to another established use in this agreement and authorizes the New Use in writing.
3. Rights Granted When Sharing 3D Models. If you Purchase as an employee of a corporate entity, sharing Purchased 3D Models with other employees of your corporate entity is allowed. Examples of allowed sharing include storing files on a networked hard drive, and aggregating 3D Models for later use in future Creations. You are responsible for any downstream distribution, use, or misuse by a recipient of a shared 3D Models. In all cases, sharing 3D Models with external people or entities is only allowed in the following situations, and with the following restrictions:
a. In the production of a Creation owned by you, if you are working in collaboration with external parties, and there is a need to share 3D Models for the development and production of your Creation, sharing 3D Models with those external parties is allowed. Any external party that receives 3D Models may only use 3D Models on your Creations and must take reasonable care to secure and limit access to 3D Models to that purpose.
b. In the production of a Creation owned by another entity ("your Client"), if you are working as a contractor and need to share 3D Models with your Client, or any external parties working with your Client, sharing 3D Models is allowed, subject to the restriction that all parties may use 3D Models only for your Client's particular Creation, and for successive versions of your Client's Creation, such as sequel Computer Games or movies that utilize the same 3D Models. All parties must take reasonable care to secure and limit access to 3D Models to the parties working on your Client's Creation. For all other use by any party, 3D Models must be Purchased again to create a new license agreement governing that use.
4. Editorial Use Restriction for Some 3D Models. The following restrictions apply to any 3D Model with an "Editorial Uses Only" label on its Product Page. Permitted use of Depicted Intellectual Property in such 3D Models is limited to news reporting in Creations of some cultural, editorial, journalistic, or otherwise newsworthy value, including news reporting on television and the internet. A second permitted use is use within an academic setting, limited to teaching, scholarship, and research. This restriction does not apply if you have the needed authorization to use the Depicted Intellectual Property for your Creation, such as if you are owner of the Depicted Intellectual Property, or the advertising team, hired party, or licensee of the Depicted Intellectual Property owner.
5. Depicted Intellectual Property. TurboSquid does not own or license any Depicted Intellectual Property. TurboSquid does not in any way make any representations or warranties about Depicted Intellectual Property associated with 3D Models. You are solely responsible for determining the need for and, if appropriate, obtaining any needed clearance, consent, or release to use any Depicted Intellectual Property in your Creations.
6. Creations of Imagery.
Permitted Uses of Creations of Imagery. Subject to the following restrictions, you may use Creations of Imagery within news, film, movies, television programs, video projects, multi-media projects, theatrical display, software user interfaces; architectural renderings, Computer Games, virtual worlds, simulation and training environments; corporate communications, marketing collateral, tradeshow promotional items, booth decorations and presentations; pre-visualizations, product prototyping and research; mobile, web, print, television, and billboard advertising; online and electronic publications of blogs, literature, social media, and email campaigns; website designs and layouts, desktop and mobile wallpapers, screensavers, toolbar skins; books, magazines, posters, greeting cards; apparel items, brochures, framed or printed artwork, household items, office items, lenticular prints, product packaging and manufactured products.
Restrictions on Permitted Uses of Creations of Imagery.
a. Stock Media Clearinghouse. You may NOT publish or distribute Creations of Imagery through another stock media clearinghouse, for example as part of an online marketplace for photography, clip art, or design templates.
b. Promotional Images. Images displayed for the promotion of a 3D Model on its Product Page ("Promotional Images") may be used in Creations of Imagery, provided that the 3D Model itself has been Purchased and subject to the following restrictions:
i. You may NOT use a Promotional Image that has any added element which is not included as part of the 3D Model. An example of this type of restricted use is a 3D Model of an airplane with a Promotional Image of that airplane rendered over a blue sky, where the blue sky image is not included as part of the 3D Model. Other prohibited examples include use of Promotional Images from movies or advertisements that may have used the 3D Model.
ii. You may NOT use any Promotional Image that has a logo, mark, watermark, attribution, copyright or other notice superimposed on the image without prior approval from TurboSquid Support.
c. Business Logos. You may NOT use Imagery in any Creation that is a trademark, servicemark, or business logo. This restriction is included because the owners of these types of Creations typically seek exclusivity on the use of the imagery in their Creation, which is incompatible with the non-exclusive license granted to you under this agreement.
7. Creations of Computer Games and Software
Permitted Uses in Creations of Computer Games and Software. Subject to the following restrictions, you may incorporate 3D Models in Creations of Computer Games, virtual worlds, simulation and training environments; mobile, desktop and web applications; and interactive electronic publications of literature such as e-books and electronic textbooks.
Restrictions on Permitted Uses of 3D Models in Creations of Games and Software.
a. Interactivity. Your inclusion of 3D Models within any such Creation is limited to uses where the 3D Model is contained in an interactive experience for the user and not made available outside of the interactive experience. A permitted example of this use would be to include a 3D Model of human anatomy in a medical training application in a way that the 3D Model or its environment may be manipulated or interacted with.
b. Access to 3D Models. You must take all reasonable and industry standard measures to incorporate 3D Models within Creations to prevent other parties from gaining access to 3D Models. 3D Models must be contained in proprietary formats so that they cannot be opened or imported in a publicly available software application or framework, or extracted without reverse engineering. WebGL exports from Unity, Unreal, and Lumberyard are permitted. Any other open format or format encrypted with decryptable open standards (such as an encrypted compression archive or other WebGL programs not listed here) is prohibited from using 3D Models. If your Creation uses WebGL and you are not sure if it qualifies, please contact use@turbosquid.com and describe your Creation in detail.
c. Open Systems. You typically may NOT include 3D Models in Creations that have the general functionality for importing and/or exporting 3D Models. Please contact use@turbosquid.com and describe your Creation in detail if this is your desired use. An example of such a prohibited use is to include 3D Models as a starter library within a standard retail Software Creation that allows users to generally work with 3D Models, even if the 3D Model itself is somehow protected and is not capable of being exported. An allowed use is for custom or enterprise software in certain circumstances.
d. Virtual Good Sales. You may NOT import, upload, reproduce, make available, publish, transmit, distribute, or sublicense 3D Models in Creations of virtual goods or worlds for any 3D community ("Virtual World"), unless you or your Client owns the Virtual World platform and it complies with the previous restrictions.
8. Creations of Physical Form.
Permitted Uses in Creations of Physical Form. Subject to the following restrictions, you may use 3D Models to make Physical Creations such as 3D printed works, articles of manufacture, custom vehicles, furniture, jewelry, sculptural artwork, toys, and physical entertainment goods ("Creations of Physical Form").
Restrictions on Permitted Uses in Creations of Physical Form.
a. Substantially Similar Creations. Permitted use of any Creation of Physical Form in which a 3D Model is untransformed or substantially similar to the 3D Model is limited to personal use, gifts, or charitable donations, with a maximum of 5 instances of such Creation per Purchase, unless the 3D Model is a small part of a much larger array of other physical objects in the Creation. For example, if you are creating a real-world, physical human skeleton for manufacture for sale, it is permitted to add a 3D printed human head that exactly resembles the Purchased 3D Model, but it is not permitted to sell the 3D printed head by itself. Another permitted example of a 3D Model being a small part of a larger array is using a 3D Model that ends up within an automobile as a part of the automobile.
b. No Depicted Intellectual Property. You may NOT reproduce Depicted Intellectual Property in any Creation of Physical Form for any purpose. For example, you may NOT make Physical Form Creations of a copyrighted character (Spiderman, Elsa, Slimer), or branded technology (Apple, Toshiba, Samsung).
9. 3D Industry Promotional Use. If TurboSquid has granted you, as a hardware or software partner, access to priced 3D Models on a free-of-charge basis, your use of 3D Models is restricted to internal testing for your 3D software or hardware products, and to the promotion of your software or hardware products with Creations of Imagery provided that an attribution of the artist's name and the Site are included. You agree that should any 3D Models be used outside of these purposes in ways that are normally allowed after a Purchase, you will notify TurboSquid and promptly Purchase the 3D Models and otherwise comply with the terms herein.
10. Unauthorized Use. If you use 3D Models in an unauthorized way, TurboSquid may terminate your account and pursue other penalties, damages, losses, and profits TurboSquid is entitled to under this agreement or at law or equity. The following are unauthorized uses that are explicitly prohibited:
a. Competition. You may NOT use 3D Models in a way that competes with the Site, including distributing through 3D Model Clearinghouses. You may NOT publish, distribute, or make 3D Models available through any online clearinghouse infrastructure. You may NOT redistribute 3D Models as part of any design template, After Effects template, stock photography, video or clip art for distribution or licensing through any online stock media clearinghouse whatsoever.
b. Re-Distribution. You may NOT re-distribute, publish, or make 3D Models available to any third party except in the form of a permitted Creation, or shared as authorized in this agreement.
c. Group Buying. You may NOT aggregate funds to Purchase 3D Models with one or more other parties. An example of this prohibited use is a website membership where members pool their money to make a single Purchase that is shared by the members of the group. Each such member must Purchase individually.
d. No Obscene or Unlawful Use. You may NOT use 3D Models for any defamatory, harassing, pornographic, obscene, or racist purpose, or to infringe any party's Depicted Intellectual Property rights.
e. False Attribution. You may NOT misrepresent yourself as the creator of 3D Models.
11. Resellers. The license granted herein is wholly transferable by an authorized reseller ("Reseller") to another party ("Transferee"). Each transferred license must be transferred entirely and all transferred 3D Models must be permanently deleted from the Reseller's systems after the transfer. When transferring the license, Reseller represents and warrants that the Reseller has the authority to bind the Transferee to these terms. The Reseller is jointly and severally responsible with any Transferee, and each is liable for the Transferee's use and compliance with TurboSquid's Terms of Use and the Site's policies and procedures, as well as any financial obligations hereunder.
III. License Term & Termination
1. Term. Your right and license to 3D Models is perpetual, unless terminated as described herein.
2. Termination. Your license grant is terminated immediately and without notice in the cases below. Upon such termination, you and any recipients of 3D Models must cease use and distribution of 3D Models and destroy all copies.
a. Reversal of Purchase. Your right and license to 3D Models are contingent on your Purchase of 3D Models. Any payment reversal of a Purchase for any reason immediately terminates all rights granted under this agreement. Potential reasons for a payment reversal include:
i. TurboSquid reverses your Purchase at your request.
ii. TurboSquid receives a charge back or other notice from your bank or credit card cancelling your Purchase and/or withdrawing the funds used for your Purchase.
iii. TurboSquid determines in its sole discretion that your Purchase was fraudulent.
iv. When you are granted delayed payment terms, and fail to make payments such that TurboSquid sends you notice and terminates your account.
b. Failure to Abide by the License Grant. Material failure to abide by the terms of this agreement immediately terminates your right and license to 3D Models. If you detect a violation of the license grant by you or any recipient of shared 3D Models, and promptly report the violation to agent@turbosquid.com, TurboSquid will make a good faith effort to find an appropriate remedy to preserve your license grant.
IV. Warranties
You covenant, represent, and warrant to TurboSquid that:
You have full right, power, legal capacity, and authority to enter into and perform this agreement, have obtained any third-party consent needed to do so, and, prior to any Purchase, had an opportunity to seek independent legal counsel.
You will not use 3D Models except pursuant to the terms of this agreement. Should you use 3D Models in an unauthorized way, you agree to any reasonable fee or penalty exercised by TurboSquid under this agreement or applicable law.
You will, prior to Purchase, determine the need for and, if appropriate, obtain any needed third-party clearance, consent, or release to use Depicted Intellectual Property shown in the digital rendering of 3D Models, and shall not use 3D Models to infringe any party's Depicted Intellectual Property rights.
You will immediately notify TurboSquid of any legal claim or challenge against your use of 3D Models or any other rights issue, before disclosing such issue to any third party.
V. Limitation of Liability
1. 3D Models are provided on an "as is", "as available", and "with all faults" basis. TurboSquid makes no representations, warranties, conditions, or guarantees as to the usefulness, quality, suitability, truth, fitness for a particular purpose, non-infringement, merchantability, or cosmetic attributes of 3D Models, and does not guarantee the accuracy or completeness of specifications associated with 3D Models, including measurements, weight, durability, strength, materials, general physical properties, regulatory compliance, and other engineering or construction attributes.
2. TurboSquid disclaims all express or implied conditions, representations, and warranties of any kind regarding 3D Models, including any implied warranty or condition of merchantability. TurboSquid allows your Purchase to be refunded under certain reasonable time frames and conditions, subject to the Site's policies.
3. You assume all risk for any damage to your computer systems and network arising from obtaining 3D Models, including any damages resulting from computer viruses.
4. To the fullest extent permitted by law, TurboSquid shall not be liable for (A) any direct, indirect, punitive, special, incidental, consequential, or exemplary damages (including loss of business, revenue, profits, goodwill, use, data, electronically transmitted orders, or other economic advantage) arising out of or in connection with 3D Models, even if TurboSquid has previously been advised of, or reasonably could have foreseen, the possibility of such damages, however they arise, whether in breach of contract or in tort (including negligence) or (B) any damages in excess of $1,000. To the extent that any jurisdiction does not allow the exclusion or limitation of direct, incidental, or consequential damages, portions of the preceding limitation or exclusion may not apply, but should be construed to the greatest extent applicable in such jurisdictions. Notwithstanding anything to the contrary herein, the TurboSquid indemnification obligation set forth below shall be limited to the following depending on the licensing tier:
Tier 0: 3D Models acquired free-of-charge are not indemnified.
Tier 1: Standard License indemnity limitation is ten thousand ($10,000) dollars for all 3D Models acquired with payment. This indemnity is in aggregate for all 3D Models acquired under the Standard License.
Tier 2: Small Business License indemnity limitation is two hundred and fifty thousand ($250,000) dollars for any 3D Model. This indemnity is in aggregate for all 3D Models acquired under the Small Business License.
Tier 3: Enterprise License indemnity limitation is one million ($1,000,000) dollars for any 3D Model. This indemnity is in aggregate for all 3D Models acquired under the Enterprise License.
For any 3D Model labeled Editorial, the above indemnities shall only apply if the model is properly used within the editorial license set forth herein (i.e., for news and editorial purposes in association with newsworthy media). For use outside the Editorial scope, no indemnification from TurboSquid shall apply.
5. You agree to indemnify and hold TurboSquid and its subsidiaries, affiliates, shareholders, officers, directors, agents, licensors, licensees, suppliers, alliance members, other partners, employees and representatives ("TurboSquid Parties") harmless from any claim or demand, including reasonable attorneys' fees, made by any third party due to, or arising out of your use of 3D Models or Creations.
6. Subject to sections 4 and 5 above, TurboSquid shall indemnify, defend, and hold you harmless from and against any claim or demand, including reasonable attorneys' fees, made by any third party for copyright or trademark infringement due to or arising out of your use of the 3D Models in accordance with these Terms, but excluding any modifications made by You, if such infringement was caused by the modification. This indemnity shall not apply to any 3D Model labeled for Editorial Use, or to a brand name, logo, or other Depicted Intellectual Property previously identified in a 3D Model.
7. In the event of an indemnification claim by You, you agree to provide notice to TurboSquid within thirty days of receiving any claim and to allow TurboSquid to fully control such claim, including, but not limited to, selection of counsel, reasonable diligence into the claim, and, if necessary, litigation and/or settlement. Notice must be given via email to: agent@turbosquid.com. Notice is not considered made until it is acknowledged in writing by TurboSquid.
VI. Other Terms
1. Entire Agreement. This agreement constitutes the entire agreement between you and TurboSquid relating to your Purchase, unless you have a corporate license agreement with TurboSquid. Corporate licenses are available with additional protections for additional fees. Please contact enterprise@turbosquid.com if your organization requires a corporate license. TurboSquid does not otherwise offer changes, additions, variations, or additional signed forms related to this agreement. No modification to this agreement will be binding unless it is in writing and signed by an authorized TurboSquid representative.
2. Material Breach and Injunction.
Your rights hereunder vary by licensing tier as follows:
For the Standard License, you agree that any material breach of these Terms will result in irreparable harm to TurboSquid for which damages would be an inadequate remedy and, therefore, in addition to its rights and remedies otherwise available at law, TurboSquid will be entitled to equitable relief, including both a preliminary and permanent injunction, if such a breach occurs. You waive any requirement for the posting of a bond or other security if TurboSquid seeks such an injunction.
For the Enterprise License, TurboSquid may not seek injunctive relief hereunder for any 3D Model. It hereby waives all rights to equitable and injunctive relief, and its damages shall be limited to monetary damages.
Notwithstanding anything to the contrary herein, TurboSquid would be irreparably harmed and shall be entitled to equitable relief including injunctive relief for any hacking, theft, or misuse of the Site.
3. Import/Export Regulations. 3D Models may be subject to U.S. export laws and the export or import laws of other countries. You agree to comply strictly with all such laws and, in particular, shall, with respect to 3D Models: (a) obtain any export, re-export, or import authorizations required by U.S. or your local laws; (b) not design, develop, or produce missile, chemical/biological, or nuclear weaponry; and (c) not provide 3D Models to prohibited countries and entities identified in the U.S. export regulations.
4. Governing Law. This agreement is governed by New York law, excluding conflict of law principles. Any action or proceeding arising out of or related to this agreement must be brought in a state or federal court located in New York, New York, and both parties irrevocably submit to the exclusive jurisdiction of such courts. All notices, requests and other communications under this agreement must be in writing (e-mail messages shall be deemed writings).
5. LIMITED INTERNAL USER ARBITRATION. You acknowledge and agree that TurboSquid may, in its sole discretion, arbitrate disputes between TurboSquid users involving 3D Models (including any purchaser or supplier of 3D Models), and such findings shall be final and non-appealable. Either party may request that TurboSquid arbitrate the dispute, or TurboSquid may elect, at its option, to arbitrate the dispute. After TurboSquid elects to arbitrate any dispute hereunder, TurboSquid will waive any rights to a commission from both the Purchase and arbitration, and the parties must keep the results and process confidential and may not disclose anything related to the dispute to any other party (whether by oral, written, or other type of disclosure). To resolve disputes, TurboSquid may decide to terminate or suspend users, revoke the license, offer replacement 3D Models, reestablish the licensee, or surrender or reallocate fees (whether by refund, charitable donation, or otherwise). TurboSquid may award up to 3X the Purchase price to either party depending on the circumstances. YOU UNDERSTAND, ACKNOWLEDGE, AND AGREE THAT ACCEPTING THIS ARBITRATION PROVISION WAIVES RIGHTS TO JUDICIAL RESOLUTION, TRIAL BY JURY AND RIGHTS YOU WOULD OTHERWISE HAVE IF YOU HAD NOT AGREED TO THIS ARBITRATION PROVISION.
6. Notice. Any notice under this agreement shall be via email to agent@turbosquid.com, provided that you receive an acknowledgement email from a TurboSquid representative within 5 business days. If no such acknowledgement email is received, notice must be in writing and delivered by mail to the following address:
TurboSquid, Inc.
c/o TurboSquid Support
935 Gravier St., Suite 1600
New Orleans, LA 70112
7. Assignment. TurboSquid may not assign its rights under this agreement without providing you notice, except in the case of a bankruptcy, merger, acquisition, sale of all or substantially all of TurboSquid's assets to a subsequent owner or operator, or similar event.
Your assignment rights vary based on the licensing tier of your purchase:
For the Standard License, you may not assign your rights under this agreement without the prior written consent of TurboSquid.
For Small Business or Enterprise Licenses, you may assign your rights under this agreement without the notice and consent of TurboSquid.
8. English. This agreement may be translated into other languages, but English is the official language of this agreement and in any conflict between the English language version and any other version, the English language version shall control.
9. Publicity. The following advertising, marketing, and publicity rights are granted to TurboSquid for each licensing tier:
Standard License purchases may be fully publicized by TurboSquid, and you hereby grant TurboSquid the right to use your and your company's name, logo, and project name on the TurboSquid website and in its related marketing and advertising materials.
Small Business and Enterprise License purchases may not be publicized by TurboSquid in any way without the prior written permission of the purchaser.
10. Time limitations on any claim hereunder. Any claim by you hereunder, including, without limitation, a claim for indemnification under section V, must be made within two years of purchasing the 3D Model.
This 3D Model License is effective for use with 3D Models on or after June 17, 2020.
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import numpy as np
import os
import sys
import pathlib
import util
import tensorflow as tf
sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast
import nvdiffrast.tensorflow as dr
#----------------------------------------------------------------------------
# Cube shape fitter.
#----------------------------------------------------------------------------
def fit_cube(max_iter = 5000,
resolution = 4,
discontinuous = False,
repeats = 1,
log_interval = 10,
display_interval = None,
display_res = 512,
out_dir = '.',
log_fn = None,
imgsave_interval = None,
imgsave_fn = None):
if out_dir:
os.makedirs(out_dir, exist_ok=True)
datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
fn = 'cube_%s.npz' % ('d' if discontinuous else 'c')
with np.load(f'{datadir}/{fn}') as f:
pos_idx, vtxp, col_idx, vtxc = f.values()
print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], vtxp.shape[0]))
# Transformation matrix input to TF graph.
mtx_in = tf.placeholder(tf.float32, [4, 4])
# Setup TF graph for reference.
vtxw = np.concatenate([vtxp, np.ones([vtxp.shape[0], 1])], axis=1).astype(np.float32)
pos_clip = tf.matmul(vtxw, mtx_in, transpose_b=True)[tf.newaxis, ...]
rast_out, _ = dr.rasterize(pos_clip, pos_idx, resolution=[resolution, resolution], output_db=False)
color, _ = dr.interpolate(vtxc[tf.newaxis, ...], rast_out, col_idx)
color = dr.antialias(color, rast_out, pos_clip, pos_idx)
# Optimized variables.
vtxc_opt = tf.get_variable('vtxc', initializer=tf.zeros_initializer(), shape=vtxc.shape)
vtxp_opt = tf.get_variable('vtxp', initializer=tf.zeros_initializer(), shape=vtxp.shape)
# Optimization variable setters for initialization.
vtxc_opt_in = tf.placeholder(tf.float32, vtxc.shape)
vtxp_opt_in = tf.placeholder(tf.float32, vtxp.shape)
opt_set = tf.group(tf.assign(vtxc_opt, vtxc_opt_in), tf.assign(vtxp_opt, vtxp_opt_in))
    # Setup TF graph for the optimized result.
vtxw_opt = tf.concat([vtxp_opt, tf.ones([vtxp.shape[0], 1], tf.float32)], axis=1)
pos_clip_opt = tf.matmul(vtxw_opt, mtx_in, transpose_b=True)[tf.newaxis, ...]
rast_out_opt, _ = dr.rasterize(pos_clip_opt, pos_idx, resolution=[resolution, resolution], output_db=False)
color_opt, _ = dr.interpolate(vtxc_opt[tf.newaxis, ...], rast_out_opt, col_idx)
color_opt = dr.antialias(color_opt, rast_out_opt, pos_clip_opt, pos_idx)
# Image-space loss and optimizer.
loss = tf.reduce_mean((color_opt - color)**2)
lr_in = tf.placeholder(tf.float32, [])
train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.999).minimize(loss, var_list=[vtxp_opt, vtxc_opt])
# Setup TF graph for display.
rast_out_disp, _ = dr.rasterize(pos_clip_opt, pos_idx, resolution=[display_res, display_res], output_db=False)
color_disp, _ = dr.interpolate(vtxc_opt[tf.newaxis, ...], rast_out_disp, col_idx)
color_disp = dr.antialias(color_disp, rast_out_disp, pos_clip_opt, pos_idx)
rast_out_disp_ref, _ = dr.rasterize(pos_clip, pos_idx, resolution=[display_res, display_res], output_db=False)
color_disp_ref, _ = dr.interpolate(vtxc[tf.newaxis, ...], rast_out_disp_ref, col_idx)
color_disp_ref = dr.antialias(color_disp_ref, rast_out_disp_ref, pos_clip, pos_idx)
# Geometric error calculation
geom_loss = tf.reduce_mean(tf.reduce_sum((tf.abs(vtxp_opt) - .5)**2, axis=1)**0.5)
# Open log file.
log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None
# Repeats.
for rep in range(repeats):
# Optimize.
ang = 0.0
gl_avg = []
util.init_uninitialized_vars()
for it in range(max_iter + 1):
# Initialize optimization.
if it == 0:
vtxp_init = np.random.uniform(-0.5, 0.5, size=vtxp.shape) + vtxp
vtxc_init = np.random.uniform(0.0, 1.0, size=vtxc.shape)
util.run(opt_set, {vtxc_opt_in: vtxc_init.astype(np.float32), vtxp_opt_in: vtxp_init.astype(np.float32)})
# Learning rate ramp.
lr = 1e-2
lr = lr * max(0.01, 10**(-it*0.0005))
# Random rotation/translation matrix for optimization.
r_rot = util.random_rotation_translation(0.25)
# Smooth rotation for display.
a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang))
# Modelview and modelview + projection matrices.
proj = util.projection(x=0.4)
r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot)
r_mvp = np.matmul(proj, r_mv).astype(np.float32)
a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot)
a_mvp = np.matmul(proj, a_mv).astype(np.float32)
# Run training and measure geometric error.
gl_val, _ = util.run([geom_loss, train_op], {mtx_in: r_mvp, lr_in: lr})
gl_avg.append(gl_val)
# Print/save log.
if log_interval and (it % log_interval == 0):
gl_val, gl_avg = np.mean(np.asarray(gl_avg)), []
s = ("rep=%d," % rep) if repeats > 1 else ""
s += "iter=%d,err=%f" % (it, gl_val)
print(s)
if log_file:
log_file.write(s + "\n")
# Show/save image.
display_image = display_interval and (it % display_interval == 0)
save_image = imgsave_interval and (it % imgsave_interval == 0)
if display_image or save_image:
ang = ang + 0.1
img_o = util.run(color_opt, {mtx_in: r_mvp})[0]
img_b = util.run(color, {mtx_in: r_mvp})[0]
img_d = util.run(color_disp, {mtx_in: a_mvp})[0]
img_r = util.run(color_disp_ref, {mtx_in: a_mvp})[0]
scl = display_res // img_o.shape[0]
img_b = np.repeat(np.repeat(img_b, scl, axis=0), scl, axis=1)
img_o = np.repeat(np.repeat(img_o, scl, axis=0), scl, axis=1)
result_image = np.concatenate([img_o, img_b, img_d, img_r], axis=1)
if display_image:
util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter))
if save_image:
util.save_image(out_dir + '/' + (imgsave_fn % it), result_image)
# All repeats done.
if log_file:
log_file.close()
#----------------------------------------------------------------------------
# Main function.
#----------------------------------------------------------------------------
def main():
display_interval = 0
discontinuous = False
resolution = 0
def usage():
print("Usage: python cube.py [-v] [-discontinuous] resolution")
exit()
for a in sys.argv[1:]:
if a == '-v':
display_interval = 100
elif a == '-discontinuous':
discontinuous = True
elif a.isdecimal():
resolution = int(a)
else:
usage()
if resolution <= 0:
usage()
# Initialize TensorFlow.
util.init_tf()
# Run.
out_dir = 'out/cube_%s_%d' % (('d' if discontinuous else 'c'), resolution)
fit_cube(max_iter=5000, resolution=resolution, discontinuous=discontinuous, log_interval=10, display_interval=display_interval, out_dir=out_dir, log_fn='log.txt', imgsave_interval=1000, imgsave_fn='img_%06d.png')
# Done.
print("Done.")
#----------------------------------------------------------------------------
if __name__ == "__main__":
main()
#----------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import numpy as np
import tensorflow as tf
import os
import sys
import pathlib
import util
sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast
import nvdiffrast.tensorflow as dr
#----------------------------------------------------------------------------
# Texture learning with/without mipmaps.
#----------------------------------------------------------------------------
def fit_earth(max_iter = 20000,
log_interval = 10,
display_interval = None,
display_res = 1024,
enable_mip = True,
res = 512,
ref_res = 4096,
lr_base = 1e-2,
lr_ramp = 0.1,
out_dir = '.',
log_fn = None,
texsave_interval = None,
texsave_fn = None,
imgsave_interval = None,
imgsave_fn = None):
if out_dir:
os.makedirs(out_dir, exist_ok=True)
# Mesh and texture adapted from "3D Earth Photorealistic 2K" model at
# https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125
datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
with np.load(f'{datadir}/earth.npz') as f:
pos_idx, pos, uv_idx, uv, tex = f.values()
tex = tex.astype(np.float32)/255.0
max_mip_level = 9 # Texture is a 4x3 atlas of 512x512 maps.
print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0]))
# Transformation matrix input to TF graph.
mtx_in = tf.placeholder(tf.float32, [4, 4])
# Learned texture.
tex_var = tf.get_variable('tex', initializer=tf.constant_initializer(0.2), shape=tex.shape)
# Setup TF graph for reference rendering in high resolution.
pos_clip = tf.matmul(pos, mtx_in, transpose_b=True)[tf.newaxis, ...]
rast_out, rast_out_db = dr.rasterize(pos_clip, pos_idx, [ref_res, ref_res])
texc, texd = dr.interpolate(uv[tf.newaxis, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all')
color = dr.texture(tex[np.newaxis], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level)
color = color * tf.clip_by_value(rast_out[..., -1:], 0, 1) # Mask out background.
    # Reduce the reference to the correct size.
while color.shape[1] > res:
color = util.bilinear_downsample(color)
# TF Graph for rendered candidate.
if enable_mip:
# With mipmaps.
rast_out_opt, rast_out_db_opt = dr.rasterize(pos_clip, pos_idx, [res, res])
texc_opt, texd_opt = dr.interpolate(uv[tf.newaxis, ...], rast_out_opt, uv_idx, rast_db=rast_out_db_opt, diff_attrs='all')
color_opt = dr.texture(tex_var[np.newaxis], texc_opt, texd_opt, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level)
else:
# No mipmaps: no image-space derivatives anywhere.
rast_out_opt, _ = dr.rasterize(pos_clip, pos_idx, [res, res], output_db=False)
texc_opt, _ = dr.interpolate(uv[tf.newaxis, ...], rast_out_opt, uv_idx)
color_opt = dr.texture(tex_var[np.newaxis], texc_opt, filter_mode='linear')
color_opt = color_opt * tf.clip_by_value(rast_out_opt[..., -1:], 0, 1) # Mask out background.
# Measure only relevant portions of texture when calculating texture PSNR.
loss = tf.reduce_mean((color - color_opt)**2)
texmask = np.zeros_like(tex)
tr = tex.shape[1]//4
texmask[tr+13:2*tr-13, 25:-25, :] += 1.0
texmask[25:-25, tr+13:2*tr-13, :] += 1.0
texloss = (tf.reduce_sum(texmask * (tex - tex_var)**2)/np.sum(texmask))**0.5 # RMSE within masked area.
# Training driven by image-space loss.
lr_in = tf.placeholder(tf.float32, [])
train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.99).minimize(loss, var_list=[tex_var])
# Open log file.
log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None
# Render.
ang = 0.0
util.init_uninitialized_vars()
texloss_avg = []
for it in range(max_iter + 1):
lr = lr_base * lr_ramp**(float(it)/float(max_iter))
# Random rotation/translation matrix for optimization.
r_rot = util.random_rotation_translation(0.25)
# Smooth rotation for display.
ang = ang + 0.01
a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang))
dist = np.random.uniform(0.0, 48.5)
# Modelview and modelview + projection matrices.
proj = util.projection(x=0.4, n=1.0, f=200.0)
r_mv = np.matmul(util.translate(0, 0, -1.5-dist), r_rot)
r_mvp = np.matmul(proj, r_mv).astype(np.float32)
a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot)
a_mvp = np.matmul(proj, a_mv).astype(np.float32)
# Run training and measure texture-space RMSE loss.
texloss_val, _ = util.run([texloss, train_op], {mtx_in: r_mvp, lr_in: lr})
texloss_avg.append(texloss_val)
# Print/save log.
if log_interval and (it % log_interval == 0):
texloss_val, texloss_avg = np.mean(np.asarray(texloss_avg)), []
            psnr = -10.0 * np.log10(texloss_val**2) # PSNR based on average RMSE; equals -20*log10(RMSE) for a peak value of 1.
s = "iter=%d,loss=%f,psnr=%f" % (it, texloss_val, psnr)
print(s)
if log_file:
log_file.write(s + '\n')
# Show/save result images/textures.
display_image = display_interval and (it % display_interval) == 0
save_image = imgsave_interval and (it % imgsave_interval) == 0
save_texture = texsave_interval and (it % texsave_interval) == 0
if display_image or save_image:
result_image = util.run(color_opt, {mtx_in: a_mvp})[0]
if display_image:
util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter))
if save_image:
util.save_image(out_dir + '/' + (imgsave_fn % it), result_image)
if save_texture:
util.save_image(out_dir + '/' + (texsave_fn % it), util.run(tex_var)[::-1])
# Done.
if log_file:
log_file.close()
#----------------------------------------------------------------------------
# Main function.
#----------------------------------------------------------------------------
def main():
display_interval = 0
enable_mip = None
def usage():
print("Usage: python earth.py [-v] [-mip|-nomip]")
exit()
for a in sys.argv[1:]:
if a == '-v': display_interval = 10
elif a == '-mip': enable_mip = True
elif a == '-nomip': enable_mip = False
else: usage()
if enable_mip is None:
usage()
# Initialize TensorFlow.
util.init_tf()
# Run.
out_dir = 'out/earth_mip' if enable_mip else 'out/earth_nomip'
fit_earth(max_iter=20000, log_interval=10, display_interval=display_interval, enable_mip=enable_mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png')
# Done.
print("Done.")
#----------------------------------------------------------------------------
if __name__ == "__main__":
main()
#----------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import numpy as np
import tensorflow as tf
import os
import sys
import pathlib
import util
sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast
import nvdiffrast.tensorflow as dr
#----------------------------------------------------------------------------
# Environment map and Phong BRDF learning.
#----------------------------------------------------------------------------
def fit_env_phong(max_iter = 1000,
log_interval = 10,
display_interval = None,
display_res = 1024,
res = 1024,
lr_base = 1e-2,
lr_ramp = 1.0,
out_dir = '.',
log_fn = None,
imgsave_interval = None,
imgsave_fn = None):
if out_dir:
os.makedirs(out_dir, exist_ok=True)
# Texture adapted from https://github.com/WaveEngine/Samples/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap
datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
with np.load(f'{datadir}/envphong.npz') as f:
pos_idx, pos, normals, env = f.values()
env = env.astype(np.float32)/255.0
print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0]))
# Target Phong parameters.
phong_rgb = np.asarray([1.0, 0.8, 0.6], np.float32)
phong_exp = 25.0
# Inputs to TF graph.
mtx_in = tf.placeholder(tf.float32, [4, 4])
invmtx_in = tf.placeholder(tf.float32, [4, 4]) # Inverse.
campos_in = tf.placeholder(tf.float32, [3]) # Camera position in world space.
lightdir_in = tf.placeholder(tf.float32, [3]) # Light direction.
# Learned variables: environment maps, phong color, phong exponent.
env_var = tf.get_variable('env_var', initializer=tf.constant_initializer(0.5), shape=env.shape)
phong_var_raw = tf.get_variable('phong_var', initializer=tf.random_uniform_initializer(0.0, 1.0), shape=[4]) # R, G, B, exp.
phong_var = phong_var_raw * [1.0, 1.0, 1.0, 10.0] # Faster learning rate for the exponent.
# Transform and rasterize.
viewvec = pos[..., :3] - campos_in[np.newaxis, np.newaxis, :] # View vectors at vertices.
reflvec = viewvec - 2.0 * normals[tf.newaxis, ...] * tf.reduce_sum(normals[tf.newaxis, ...] * viewvec, axis=-1, keepdims=True) # Reflection vectors at vertices.
reflvec = reflvec / tf.reduce_sum(reflvec**2, axis=-1, keepdims=True)**0.5 # Normalize.
pos_clip = tf.matmul(pos, mtx_in, transpose_b=True)[tf.newaxis, ...]
rast_out, rast_out_db = dr.rasterize(pos_clip, pos_idx, [res, res])
refl, refld = dr.interpolate(reflvec, rast_out, pos_idx, rast_db=rast_out_db, diff_attrs='all') # Interpolated reflection vectors.
# Phong light.
refl = refl / tf.reduce_sum(refl**2, axis=-1, keepdims=True)**0.5 # Normalize.
ldotr = tf.reduce_sum(-lightdir_in * refl, axis=-1, keepdims=True) # L dot R.
# Reference color. No need for AA because we are not learning geometry.
    env = np.stack(env)[:, ::-1] # Flip each face vertically.
color = dr.texture(env[np.newaxis, ...], refl, refld, filter_mode='linear-mipmap-linear', boundary_mode='cube')
color = tf.reduce_sum(tf.stack(color), axis=0)
color = color + phong_rgb * tf.maximum(0.0, ldotr) ** phong_exp # Phong.
color = tf.maximum(color, 1.0 - tf.clip_by_value(rast_out[..., -1:], 0, 1)) # White background.
    # Candidate rendering is the same up to this point, but uses the learned texture and Phong parameters instead.
color_opt = dr.texture(env_var[tf.newaxis, ...], refl, uv_da=refld, filter_mode='linear-mipmap-linear', boundary_mode='cube')
color_opt = tf.reduce_sum(tf.stack(color_opt), axis=0)
color_opt = color_opt + phong_var[:3] * tf.maximum(0.0, ldotr) ** phong_var[3] # Phong.
color_opt = tf.maximum(color_opt, 1.0 - tf.clip_by_value(rast_out[..., -1:], 0, 1)) # White background.
# Training.
loss = tf.reduce_mean((color - color_opt)**2) # L2 pixel loss.
lr_in = tf.placeholder(tf.float32, [])
train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.99).minimize(loss, var_list=[env_var, phong_var_raw])
# Open log file.
log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None
# Render.
ang = 0.0
util.init_uninitialized_vars()
imgloss_avg, phong_avg = [], []
for it in range(max_iter + 1):
lr = lr_base * lr_ramp**(float(it)/float(max_iter))
# Random rotation/translation matrix for optimization.
r_rot = util.random_rotation_translation(0.25)
# Smooth rotation for display.
ang = ang + 0.01
a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang))
# Modelview and modelview + projection matrices.
proj = util.projection(x=0.4, n=1.0, f=200.0)
r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot)
r_mvp = np.matmul(proj, r_mv).astype(np.float32)
a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot)
a_mvp = np.matmul(proj, a_mv).astype(np.float32)
# Solve camera positions.
a_campos = np.linalg.inv(a_mv)[:3, 3]
r_campos = np.linalg.inv(r_mv)[:3, 3]
# Random light direction.
lightdir = np.random.normal(size=[3])
lightdir /= np.linalg.norm(lightdir) + 1e-8
# Run training and measure image-space RMSE loss.
imgloss_val, phong_val, _ = util.run([loss, phong_var, train_op], {mtx_in: r_mvp, invmtx_in: np.linalg.inv(r_mvp), campos_in: r_campos, lightdir_in: lightdir, lr_in: lr})
imgloss_avg.append(imgloss_val**0.5)
phong_avg.append(phong_val)
# Print/save log.
if log_interval and (it % log_interval == 0):
imgloss_val, imgloss_avg = np.mean(np.asarray(imgloss_avg, np.float32)), []
phong_val, phong_avg = np.mean(np.asarray(phong_avg, np.float32), axis=0), []
phong_rgb_rmse = np.mean((phong_val[:3] - phong_rgb)**2)**0.5
phong_exp_rel_err = np.abs(phong_val[3] - phong_exp)/phong_exp
s = "iter=%d,phong_rgb_rmse=%f,phong_exp_rel_err=%f,img_rmse=%f" % (it, phong_rgb_rmse, phong_exp_rel_err, imgloss_val)
print(s)
if log_file:
log_file.write(s + '\n')
# Show/save result image.
display_image = display_interval and (it % display_interval == 0)
save_image = imgsave_interval and (it % imgsave_interval == 0)
if display_image or save_image:
result_image = util.run(color_opt, {mtx_in: a_mvp, invmtx_in: np.linalg.inv(a_mvp), campos_in: a_campos, lightdir_in: lightdir})[0]
if display_image:
util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter))
if save_image:
util.save_image(out_dir + '/' + (imgsave_fn % it), result_image)
# Done.
if log_file:
log_file.close()
#----------------------------------------------------------------------------
# Main function.
#----------------------------------------------------------------------------
def main():
display_interval = 0
for a in sys.argv[1:]:
if a == '-v':
display_interval = 10
else:
print("Usage: python envphong.py [-v]")
exit()
# Initialize TensorFlow.
util.init_tf()
# Run.
fit_env_phong(max_iter=1500, log_interval=10, display_interval=display_interval, out_dir='out/env_phong', log_fn='log.txt', imgsave_interval=100, imgsave_fn='img_%06d.png')
# Done.
print("Done.")
#----------------------------------------------------------------------------
if __name__ == "__main__":
main()
#----------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import numpy as np
import tensorflow as tf
import os
import sys
import util
import pathlib
sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast
import nvdiffrast.tensorflow as dr
#----------------------------------------------------------------------------
# Quaternion math.
#----------------------------------------------------------------------------
# Unit quaternion.
def q_unit():
return np.asarray([1, 0, 0, 0], np.float32)
# Get a random normalized quaternion.
def q_rnd():
u, v, w = np.random.uniform(0.0, 1.0, size=[3])
v *= 2.0 * np.pi
w *= 2.0 * np.pi
return np.asarray([(1.0-u)**0.5 * np.sin(v), (1.0-u)**0.5 * np.cos(v), u**0.5 * np.sin(w), u**0.5 * np.cos(w)], np.float32)
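# (Illustrative note, added here as an observation rather than taken from the
# original comments: the construction above appears to follow Shoemake's
# subgroup algorithm for drawing uniformly distributed unit quaternions.)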
# Get a random quaternion from the octahedral symmetric group S_4.
_r2 = 0.5**0.5
_q_S4 = [[ 1.0, 0.0, 0.0, 0.0], [ 0.0, 1.0, 0.0, 0.0], [ 0.0, 0.0, 1.0, 0.0], [ 0.0, 0.0, 0.0, 1.0],
[-0.5, 0.5, 0.5, 0.5], [-0.5,-0.5,-0.5, 0.5], [ 0.5,-0.5, 0.5, 0.5], [ 0.5, 0.5,-0.5, 0.5],
[ 0.5, 0.5, 0.5, 0.5], [-0.5, 0.5,-0.5, 0.5], [ 0.5,-0.5,-0.5, 0.5], [-0.5,-0.5, 0.5, 0.5],
[ _r2,-_r2, 0.0, 0.0], [ _r2, _r2, 0.0, 0.0], [ 0.0, 0.0, _r2, _r2], [ 0.0, 0.0,-_r2, _r2],
[ 0.0, _r2, _r2, 0.0], [ _r2, 0.0, 0.0,-_r2], [ _r2, 0.0, 0.0, _r2], [ 0.0,-_r2, _r2, 0.0],
[ _r2, 0.0, _r2, 0.0], [ 0.0, _r2, 0.0, _r2], [ _r2, 0.0,-_r2, 0.0], [ 0.0,-_r2, 0.0, _r2]]
def q_rnd_S4():
return np.asarray(_q_S4[np.random.randint(24)], np.float32)
# Quaternion slerp.
def q_slerp(p, q, t):
d = np.dot(p, q)
if d < 0.0:
q = -q
d = -d
if d > 0.999:
a = p + t * (q-p)
return a / np.linalg.norm(a)
t0 = np.arccos(d)
tt = t0 * t
st = np.sin(tt)
st0 = np.sin(t0)
s1 = st / st0
s0 = np.cos(tt) - d*s1
return s0*p + s1*q
# Quaternion scale (slerp vs. identity quaternion).
def q_scale(q, scl):
return q_slerp(q_unit(), q, scl)
# Quaternion product.
def q_mul(p, q):
s1, V1 = p[0], p[1:]
s2, V2 = q[0], q[1:]
s = s1*s2 - np.dot(V1, V2)
V = s1*V2 + s2*V1 + np.cross(V1, V2)
return np.asarray([s, V[0], V[1], V[2]], np.float32)
# Angular difference between two quaternions in degrees.
def q_angle_deg(p, q):
d = np.abs(np.dot(p, q))
d = min(d, 1.0)
return np.degrees(2.0 * np.arccos(d))
# Quaternion product in TensorFlow.
def q_mul_tf(p, q):
a = p[0]*q[0] - p[1]*q[1] - p[2]*q[2] - p[3]*q[3]
b = p[0]*q[1] + p[1]*q[0] + p[2]*q[3] - p[3]*q[2]
c = p[0]*q[2] + p[2]*q[0] + p[3]*q[1] - p[1]*q[3]
d = p[0]*q[3] + p[3]*q[0] + p[1]*q[2] - p[2]*q[1]
return tf.stack([a, b, c, d])
# Convert quaternion to 4x4 rotation matrix. TensorFlow.
def q_to_mtx_tf(q):
r0 = tf.stack([1.0-2.0*q[1]**2 - 2.0*q[2]**2, 2.0*q[0]*q[1] - 2.0*q[2]*q[3], 2.0*q[0]*q[2] + 2.0*q[1]*q[3]])
r1 = tf.stack([2.0*q[0]*q[1] + 2.0*q[2]*q[3], 1.0 - 2.0*q[0]**2 - 2.0*q[2]**2, 2.0*q[1]*q[2] - 2.0*q[0]*q[3]])
r2 = tf.stack([2.0*q[0]*q[2] - 2.0*q[1]*q[3], 2.0*q[1]*q[2] + 2.0*q[0]*q[3], 1.0 - 2.0*q[0]**2 - 2.0*q[1]**2])
rr = tf.transpose(tf.stack([r0, r1, r2]), [1, 0])
rr = tf.concat([rr, tf.convert_to_tensor([[0], [0], [0]], tf.float32)], axis=1) # Pad right column.
rr = tf.concat([rr, tf.convert_to_tensor([[0, 0, 0, 1]], tf.float32)], axis=0) # Pad bottom row.
return rr
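# A minimal sanity check for the quaternion helpers above. This is an
# illustrative sketch, not part of the original sample: it assumes only NumPy
# and the functions defined in this section. Composing a random unit
# quaternion with its conjugate should give the identity rotation, and
# slerp-scaling by zero should return the unit quaternion.
def _quaternion_selftest():
    p = q_rnd()
    p_conj = np.asarray([p[0], -p[1], -p[2], -p[3]], np.float32) # Conjugate = inverse for unit quaternions.
    assert q_angle_deg(q_mul(p, p_conj), q_unit()) < 1e-3        # p * p^-1 is the identity rotation.
    assert np.allclose(q_scale(p, 0.0), q_unit(), atol=1e-6)     # Slerp towards p by t=0 stays at identity.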
#----------------------------------------------------------------------------
# Cube pose fitter.
#----------------------------------------------------------------------------
def fit_pose(max_iter = 10000,
repeats = 1,
log_interval = 10,
display_interval = None,
display_res = 512,
lr_base = 0.01,
lr_falloff = 1.0,
nr_base = 1.0,
nr_falloff = 1e-4,
grad_phase_start = 0.5,
resolution = 256,
out_dir = '.',
log_fn = None,
imgsave_interval = None,
imgsave_fn = None):
if out_dir:
os.makedirs(out_dir, exist_ok=True)
datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
with np.load(f'{datadir}/cube_p.npz') as f:
pos_idx, pos, col_idx, col = f.values()
print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0]))
# Transformation matrix input to TF graph.
mtx_in = tf.placeholder(tf.float32, [4, 4])
# Pose matrix input to TF graph.
pose_in = tf.placeholder(tf.float32, [4]) # Quaternion.
noise_in = tf.placeholder(tf.float32, [4]) # Mollification noise.
# Setup TF graph for reference.
mtx_total = tf.matmul(mtx_in, q_to_mtx_tf(pose_in))
pos_clip = tf.matmul(pos, mtx_total, transpose_b=True)[tf.newaxis, ...]
rast_out, _ = dr.rasterize(pos_clip, pos_idx, resolution=[resolution, resolution], output_db=False)
color, _ = dr.interpolate(col[tf.newaxis, ...], rast_out, col_idx)
color = dr.antialias(color, rast_out, pos_clip, pos_idx)
# Setup TF graph for optimization candidate.
pose_var = tf.get_variable('pose', initializer=tf.zeros_initializer(), shape=[4])
pose_var_in = tf.placeholder(tf.float32, [4])
pose_set = tf.assign(pose_var, pose_var_in)
pose_norm_op = tf.assign(pose_var, pose_var / tf.reduce_sum(pose_var**2)**0.5) # Normalization operation.
pose_total = q_mul_tf(pose_var, noise_in)
mtx_total_opt = tf.matmul(mtx_in, q_to_mtx_tf(pose_total))
pos_clip_opt = tf.matmul(pos, mtx_total_opt, transpose_b=True)[tf.newaxis, ...]
rast_out_opt, _ = dr.rasterize(pos_clip_opt, pos_idx, resolution=[resolution, resolution], output_db=False)
color_opt, _ = dr.interpolate(col[tf.newaxis, ...], rast_out_opt, col_idx)
color_opt = dr.antialias(color_opt, rast_out_opt, pos_clip_opt, pos_idx)
# Image-space loss.
diff = (color_opt - color)**2 # L2 norm.
diff = tf.tanh(5.0 * tf.reduce_max(diff, axis=-1)) # Add some oomph to the loss.
loss = tf.reduce_mean(diff)
lr_in = tf.placeholder(tf.float32, [])
train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.999).minimize(loss, var_list=[pose_var])
# Open log file.
log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None
# Repeats.
for rep in range(repeats):
# Optimize.
util.init_uninitialized_vars()
loss_best = np.inf
pose_best = None
for it in range(max_iter + 1):
# Modelview + projection matrix.
mvp = np.matmul(util.projection(x=0.4), util.translate(0, 0, -3.5)).astype(np.float32)
# Learning and noise rate scheduling.
itf = 1.0 * it / max_iter
lr = lr_base * lr_falloff**itf
nr = nr_base * nr_falloff**itf
            # Noise input. During the initial greedy phase, the candidate pose is
            # mollified with random orientation noise; in the gradient phase the
            # noise is disabled (identity quaternion).
if itf >= grad_phase_start:
noise = q_unit()
else:
noise = q_scale(q_rnd(), nr)
noise = q_mul(noise, q_rnd_S4()) # Orientation noise.
# Initialize optimization.
if it == 0:
pose_target = q_rnd()
util.run(pose_set, {pose_var_in: q_rnd()})
util.run(pose_norm_op)
util.run(loss, {mtx_in: mvp, pose_in: pose_target, noise_in: noise}) # Pipecleaning pass.
# Run gradient training step.
if itf >= grad_phase_start:
util.run(train_op, {mtx_in: mvp, pose_in: pose_target, noise_in: noise, lr_in: lr})
util.run(pose_norm_op)
# Measure image-space loss and update best found pose.
loss_val = util.run(loss, {mtx_in: mvp, pose_in: pose_target, noise_in: noise, lr_in: lr})
if loss_val < loss_best:
pose_best = util.run(pose_total, {noise_in: noise})
if loss_val > 0.0:
loss_best = loss_val
else:
# Return to best pose in the greedy phase.
if itf < grad_phase_start:
util.run(pose_set, {pose_var_in: pose_best})
# Print/save log.
if log_interval and (it % log_interval == 0):
err = q_angle_deg(util.run(pose_var), pose_target)
ebest = q_angle_deg(pose_best, pose_target)
s = "rep=%d,iter=%d,err=%f,err_best=%f,loss=%f,loss_best=%f,lr=%f,nr=%f" % (rep, it, err, ebest, loss_val, loss_best, lr, nr)
print(s)
if log_file:
log_file.write(s + "\n")
# Show/save image.
display_image = display_interval and (it % display_interval == 0)
save_image = imgsave_interval and (it % imgsave_interval == 0)
if display_image or save_image:
img_ref, img_opt = util.run([color, color_opt], {mtx_in: mvp, pose_in: pose_target, noise_in: noise})
img_best, = util.run([color_opt], {mtx_in: mvp, pose_in: pose_best, noise_in: q_unit()})
img_ref = img_ref[0]
img_opt = img_opt[0]
img_best = img_best[0]
result_image = np.concatenate([img_ref, img_best, img_opt], axis=1)
if display_image:
util.display_image(result_image, size=display_res, title='(%d) %d / %d' % (rep, it, max_iter))
if save_image:
util.save_image(out_dir + '/' + (imgsave_fn % (rep, it)), result_image)
# All repeats done.
if log_file:
log_file.close()
#----------------------------------------------------------------------------
# Main function.
#----------------------------------------------------------------------------
def main():
display_interval = 0
repeats = 1
def usage():
print("Usage: python pose.py [-v] [repeats]")
exit()
for a in sys.argv[1:]:
if a == '-v':
display_interval = 10
elif a.isascii() and a.isdecimal():
repeats = int(a)
else:
usage()
if repeats <= 0:
usage()
# Initialize TensorFlow.
util.init_tf()
# Run.
fit_pose(max_iter=1000, repeats=repeats, log_interval=100, display_interval=display_interval, out_dir='out/pose', log_fn='log.txt', imgsave_interval=1000, imgsave_fn='img_%03d_%06d.png')
# Done.
print("Done.")
#----------------------------------------------------------------------------
if __name__ == "__main__":
main()
#----------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import imageio
import logging
import os
import numpy as np
import tensorflow as tf
import nvdiffrast.tensorflow as dr
# Silence deprecation warnings and debug level logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)
os.environ.setdefault('TF_CPP_MIN_LOG_LEVEL', '1')
pos = tf.convert_to_tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=tf.float32)
col = tf.convert_to_tensor([[[1, 0, 0], [0, 1, 0], [0, 0, 1]]], dtype=tf.float32)
tri = tf.convert_to_tensor([[0, 1, 2]], dtype=tf.int32)
rast, _ = dr.rasterize(pos, tri, resolution=[256, 256])
out, _ = dr.interpolate(col, rast, tri)
with tf.Session() as sess:
img = sess.run(out)
img = img[0, ::-1, :, :] # Flip vertically.
img = np.clip(np.rint(img * 255), 0, 255).astype(np.uint8) # Quantize to np.uint8
print("Saving to 'tri.png'.")
imageio.imsave('tri.png', img)
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import os
import numpy as np
import tensorflow as tf
# Silence deprecation warnings from TensorFlow 1.13 onwards
import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)
from typing import Any, List
#----------------------------------------------------------------------------
# Projection and transformation matrix helpers.
#----------------------------------------------------------------------------
def projection(x=0.1, n=1.0, f=50.0):
return np.array([[n/x, 0, 0, 0],
[ 0, n/-x, 0, 0],
[ 0, 0, -(f+n)/(f-n), -(2*f*n)/(f-n)],
[ 0, 0, -1, 0]]).astype(np.float32)
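# Worked example for the projection matrix above (an illustrative sketch, not
# part of the original utilities): with n=1.0 and x=0.1, a view-space point on
# the near-plane frustum corner should land on the edge of normalized device
# coordinates, i.e. x = y = 1 after the w-divide (note the y flip in row 1).
def _projection_selftest():
    p = projection(x=0.1, n=1.0, f=50.0)
    corner = np.array([0.1, -0.1, -1.0, 1.0], np.float32) # Near-plane corner in view space.
    clip = np.matmul(p, corner)
    ndc = clip[:3] / clip[3]
    assert np.allclose(ndc[:2], [1.0, 1.0], atol=1e-6)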
def translate(x, y, z):
return np.array([[1, 0, 0, x],
[0, 1, 0, y],
[0, 0, 1, z],
[0, 0, 0, 1]]).astype(np.float32)
def rotate_x(a):
s, c = np.sin(a), np.cos(a)
return np.array([[1, 0, 0, 0],
[0, c, s, 0],
[0, -s, c, 0],
[0, 0, 0, 1]]).astype(np.float32)
def rotate_y(a):
s, c = np.sin(a), np.cos(a)
return np.array([[ c, 0, s, 0],
[ 0, 1, 0, 0],
[-s, 0, c, 0],
[ 0, 0, 0, 1]]).astype(np.float32)
def random_rotation_translation(t):
m = np.random.normal(size=[3, 3])
m[1] = np.cross(m[0], m[2])
m[2] = np.cross(m[0], m[1])
m = m / np.linalg.norm(m, axis=1, keepdims=True)
m = np.pad(m, [[0, 1], [0, 1]], mode='constant')
m[3, 3] = 1.0
m[:3, 3] = np.random.uniform(-t, t, size=[3])
return m
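# Quick illustrative check (an addition, not in the original file): the upper
# 3x3 block built above is orthonormal by construction (two cross products
# plus row normalization), so R @ R^T should be the identity.
def _rotation_selftest():
    m = random_rotation_translation(0.25)
    r = m[:3, :3]
    assert np.allclose(np.matmul(r, r.T), np.eye(3), atol=1e-5)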
#----------------------------------------------------------------------------
# Bilinear downsample by 2x.
#----------------------------------------------------------------------------
def bilinear_downsample(x):
w = tf.constant([[1, 3, 3, 1], [3, 9, 9, 3], [3, 9, 9, 3], [1, 3, 3, 1]], dtype=tf.float32) / 64.0
    w = w[..., tf.newaxis, tf.newaxis] * tf.eye(x.shape[-1].value, batch_shape=[1, 1]) # [4, 4, C, C] kernel: spatial filter per channel, identity across channels.
x = tf.nn.conv2d(x, w, strides=2, padding='SAME')
return x
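# Illustrative helper (an addition, not part of the original utilities): the
# 4x4 kernel above is the outer product of the binomial filter [1, 3, 3, 1]/8
# with itself, so each call halves both spatial dimensions. Repeated
# application shrinks an image to a target resolution, mirroring the
# reference-reduction loop in the earth.py sample.
def downsample_to(x, res):
    while x.shape[1] > res:
        x = bilinear_downsample(x) # Halve height and width per call.
    return x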
#----------------------------------------------------------------------------
# Image display function using OpenGL.
#----------------------------------------------------------------------------
_glfw_window = None
def display_image(image, zoom=None, size=None, title=None): # HWC
# Import OpenGL and glfw.
import OpenGL.GL as gl
import glfw
# Zoom image if requested.
image = np.asarray(image)
if size is not None:
assert zoom is None
zoom = max(1, size // image.shape[0])
if zoom is not None:
image = image.repeat(zoom, axis=0).repeat(zoom, axis=1)
height, width, channels = image.shape
# Initialize window.
if title is None:
title = 'Debug window'
global _glfw_window
if _glfw_window is None:
glfw.init()
_glfw_window = glfw.create_window(width, height, title, None, None)
glfw.make_context_current(_glfw_window)
glfw.show_window(_glfw_window)
glfw.swap_interval(0)
else:
glfw.make_context_current(_glfw_window)
glfw.set_window_title(_glfw_window, title)
glfw.set_window_size(_glfw_window, width, height)
# Update window.
glfw.poll_events()
gl.glClearColor(0, 0, 0, 1)
gl.glClear(gl.GL_COLOR_BUFFER_BIT)
gl.glWindowPos2f(0, 0)
gl.glPixelStorei(gl.GL_UNPACK_ALIGNMENT, 1)
gl_format = {3: gl.GL_RGB, 2: gl.GL_RG, 1: gl.GL_LUMINANCE}[channels]
gl_dtype = {'uint8': gl.GL_UNSIGNED_BYTE, 'float32': gl.GL_FLOAT}[image.dtype.name]
gl.glDrawPixels(width, height, gl_format, gl_dtype, image[::-1])
glfw.swap_buffers(_glfw_window)
if glfw.window_should_close(_glfw_window):
return False
return True
#----------------------------------------------------------------------------
# Image save helper.
#----------------------------------------------------------------------------
def save_image(fn, x):
import imageio
x = np.rint(x * 255.0)
x = np.clip(x, 0, 255).astype(np.uint8)
imageio.imsave(fn, x)
#----------------------------------------------------------------------------
# TensorFlow utilities
#----------------------------------------------------------------------------
def _sanitize_tf_config(config_dict: dict = None) -> dict:
# Defaults.
cfg = dict()
cfg["rnd.np_random_seed"] = None # Random seed for NumPy. None = keep as is.
cfg["rnd.tf_random_seed"] = "auto" # Random seed for TensorFlow. 'auto' = derive from NumPy random state. None = keep as is.
cfg["env.TF_CPP_MIN_LOG_LEVEL"] = "1" # 0 = Print all available debug info from TensorFlow. 1 = Print warnings and errors, but disable debug info.
cfg["env.HDF5_USE_FILE_LOCKING"] = "FALSE" # Disable HDF5 file locking to avoid concurrency issues with network shares.
cfg["graph_options.place_pruned_graph"] = True # False = Check that all ops are available on the designated device. True = Skip the check for ops that are not used.
cfg["gpu_options.allow_growth"] = True # False = Allocate all GPU memory at the beginning. True = Allocate only as much GPU memory as needed.
# Remove defaults for environment variables that are already set.
for key in list(cfg):
fields = key.split(".")
if fields[0] == "env":
assert len(fields) == 2
if fields[1] in os.environ:
del cfg[key]
# User overrides.
if config_dict is not None:
cfg.update(config_dict)
return cfg
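# Override sketch (illustration only): user-supplied keys replace the defaults
# above, so e.g.
#   _sanitize_tf_config({"gpu_options.allow_growth": False})
# returns a dict with allow_growth disabled and all other defaults intact.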
def init_tf(config_dict: dict = None) -> None:
"""Initialize TensorFlow session using good default settings."""
# Skip if already initialized.
if tf.get_default_session() is not None:
return
# Setup config dict and random seeds.
cfg = _sanitize_tf_config(config_dict)
np_random_seed = cfg["rnd.np_random_seed"]
if np_random_seed is not None:
np.random.seed(np_random_seed)
tf_random_seed = cfg["rnd.tf_random_seed"]
if tf_random_seed == "auto":
tf_random_seed = np.random.randint(1 << 31)
if tf_random_seed is not None:
tf.set_random_seed(tf_random_seed)
# Setup environment variables.
for key, value in cfg.items():
fields = key.split(".")
if fields[0] == "env":
assert len(fields) == 2
os.environ[fields[1]] = str(value)
# Create default TensorFlow session.
create_session(cfg, force_as_default=True)
def assert_tf_initialized():
"""Check that TensorFlow session has been initialized."""
if tf.get_default_session() is None:
raise RuntimeError("No default TensorFlow session found. Please call util.init_tf().")
def create_session(config_dict: dict = None, force_as_default: bool = False) -> tf.Session:
"""Create tf.Session based on config dict."""
# Setup TensorFlow config proto.
cfg = _sanitize_tf_config(config_dict)
config_proto = tf.ConfigProto()
for key, value in cfg.items():
fields = key.split(".")
if fields[0] not in ["rnd", "env"]:
obj = config_proto
for field in fields[:-1]:
obj = getattr(obj, field)
setattr(obj, fields[-1], value)
# Create session.
session = tf.Session(config=config_proto)
if force_as_default:
# pylint: disable=protected-access
session._default_session = session.as_default()
session._default_session.enforce_nesting = False
session._default_session.__enter__()
return session
def is_tf_expression(x: Any) -> bool:
"""Check whether the input is a valid Tensorflow expression, i.e., Tensorflow Tensor, Variable, or Operation."""
return isinstance(x, (tf.Tensor, tf.Variable, tf.Operation))
def absolute_name_scope(scope: str) -> tf.name_scope:
"""Forcefully enter the specified name scope, ignoring any surrounding scopes."""
return tf.name_scope(scope + "/")
def init_uninitialized_vars(target_vars: List[tf.Variable] = None) -> None:
"""Initialize all tf.Variables that have not already been initialized.
Equivalent to the following, but more efficient and does not bloat the tf graph:
tf.variables_initializer(tf.report_uninitialized_variables()).run()
"""
assert_tf_initialized()
if target_vars is None:
target_vars = tf.global_variables()
test_vars = []
test_ops = []
with tf.control_dependencies(None): # ignore surrounding control_dependencies
for var in target_vars:
assert is_tf_expression(var)
try:
tf.get_default_graph().get_tensor_by_name(var.name.replace(":0", "/IsVariableInitialized:0"))
except KeyError:
# Op does not exist => variable may be uninitialized.
test_vars.append(var)
with absolute_name_scope(var.name.split(":")[0]):
test_ops.append(tf.is_variable_initialized(var))
init_vars = [var for var, inited in zip(test_vars, run(test_ops)) if not inited]
run([var.initializer for var in init_vars])
def run(*args, **kwargs) -> Any:
"""Run the specified ops in the default session."""
assert_tf_initialized()
return tf.get_default_session().run(*args, **kwargs)
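#----------------------------------------------------------------------------
# Minimal round-trip sketch (illustration only, never invoked by the samples;
# assumes TensorFlow 1.x graph-mode semantics). `_demo_session_roundtrip` is a
# hypothetical helper showing how init_tf() and run() fit together.
def _demo_session_roundtrip():
    init_tf()                     # create/install the default session once
    c = tf.constant(41.0) + 1.0   # build a tiny graph...
    return run(c)                 # ...and evaluate it; returns 42.0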
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import argparse
import os
import pathlib
import sys
import numpy as np
import torch
import imageio
import util
import nvdiffrast.torch as dr
# Transform vertex positions to clip space
def transform_pos(mtx, pos):
t_mtx = torch.from_numpy(mtx).cuda() if isinstance(mtx, np.ndarray) else mtx
# (x,y,z) -> (x,y,z,1)
posw = torch.cat([pos, torch.ones([pos.shape[0], 1]).cuda()], axis=1)
return torch.matmul(posw, t_mtx.t())[None, ...]
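# Note: positions are treated as row vectors, so right-multiplying by
# t_mtx.t() is equivalent to the usual column-vector product mtx @ pos_i;
# [None, ...] expands the result to a minibatch of one for the rasterizer.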
def render(glctx, mtx, pos, pos_idx, vtx_col, col_idx, resolution: int):
pos_clip = transform_pos(mtx, pos)
rast_out, _ = dr.rasterize(glctx, pos_clip, pos_idx, resolution=[resolution, resolution])
color, _ = dr.interpolate(vtx_col[None, ...], rast_out, col_idx)
color = dr.antialias(color, rast_out, pos_clip, pos_idx)
return color
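# Pipeline sketch (illustration only): rasterize -> interpolate -> antialias
# is the full differentiable path used in this sample. A hypothetical call,
# reusing the tensors set up in fit_cube() below:
#   img = render(glctx, r_mvp, vtx_pos, pos_idx, vtx_col, col_idx, 256)
#   # img: [1, 256, 256, C] tensor, differentiable w.r.t. positions and colors.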
def make_grid(arr, ncols=2):
n, height, width, nc = arr.shape
nrows = n//ncols
assert n == nrows*ncols
return arr.reshape(nrows, ncols, height, width, nc).swapaxes(1,2).reshape(height*nrows, width*ncols, nc)
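# Shape sketch (illustration only): make_grid() tiles a batch into one mosaic,
# e.g. four 64x64 RGB frames with the default ncols=2 become a single 128x128
# image:
#   grid = make_grid(np.zeros([4, 64, 64, 3]))   # -> shape (128, 128, 3)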
def fit_cube(max_iter = 5000,
resolution = 4,
discontinuous = False,
repeats = 1,
log_interval = 10,
display_interval = None,
display_res = 512,
out_dir = None,
log_fn = None,
mp4save_interval = None,
mp4save_fn = None):
log_file = None
writer = None
if out_dir:
os.makedirs(out_dir, exist_ok=True)
if log_fn:
log_file = open(f'{out_dir}/{log_fn}', 'wt')
if mp4save_interval != 0:
writer = imageio.get_writer(f'{out_dir}/{mp4save_fn}', mode='I', fps=30, codec='libx264', bitrate='16M')
else:
mp4save_interval = None
datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
fn = 'cube_%s.npz' % ('d' if discontinuous else 'c')
with np.load(f'{datadir}/{fn}') as f:
pos_idx, vtxp, col_idx, vtxc = f.values()
print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], vtxp.shape[0]))
# Create position/triangle index tensors
pos_idx = torch.from_numpy(pos_idx.astype(np.int32)).cuda()
col_idx = torch.from_numpy(col_idx.astype(np.int32)).cuda()
vtx_pos = torch.from_numpy(vtxp.astype(np.float32)).cuda()
vtx_col = torch.from_numpy(vtxc.astype(np.float32)).cuda()
glctx = dr.RasterizeGLContext()
# Repeats.
for rep in range(repeats):
ang = 0.0
gl_avg = []
vtx_pos_rand = np.random.uniform(-0.5, 0.5, size=vtxp.shape) + vtxp
vtx_col_rand = np.random.uniform(0.0, 1.0, size=vtxc.shape)
vtx_pos_opt = torch.tensor(vtx_pos_rand, dtype=torch.float32, device='cuda', requires_grad=True)
vtx_col_opt = torch.tensor(vtx_col_rand, dtype=torch.float32, device='cuda', requires_grad=True)
# Adam optimizer for vertex position and color with a learning rate ramp.
optimizer = torch.optim.Adam([vtx_pos_opt, vtx_col_opt], lr=1e-2)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: max(0.01, 10**(-x*0.0005))) # Decay lr by one decade per 2000 iterations, floored at 1% of base.
for it in range(max_iter + 1):
# Random rotation/translation matrix for optimization.
r_rot = util.random_rotation_translation(0.25)
# Smooth rotation for display.
a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang))
# Modelview and modelview + projection matrices.
proj = util.projection(x=0.4)
r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot)
r_mvp = np.matmul(proj, r_mv).astype(np.float32)
a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot)
a_mvp = np.matmul(proj, a_mv).astype(np.float32)
# Compute geometric error for logging.
with torch.no_grad():
geom_loss = torch.mean(torch.sum((torch.abs(vtx_pos_opt) - .5)**2, dim=1)**0.5) # Mean distance of each vertex from the nearest corner of the +/-0.5 reference cube.
gl_avg.append(float(geom_loss))
# Print/save log.
if log_interval and (it % log_interval == 0):
gl_val = np.mean(np.asarray(gl_avg))
gl_avg = []
s = ("rep=%d," % rep) if repeats > 1 else ""
s += "iter=%d,err=%f" % (it, gl_val)
print(s)
if log_file:
log_file.write(s + "\n")
color = render(glctx, r_mvp, vtx_pos, pos_idx, vtx_col, col_idx, resolution)
color_opt = render(glctx, r_mvp, vtx_pos_opt, pos_idx, vtx_col_opt, col_idx, resolution)
# Compute loss and train.
loss = torch.mean((color - color_opt)**2) # L2 pixel loss.
optimizer.zero_grad()
loss.backward()
optimizer.step()
scheduler.step()
# Show/save image.
display_image = display_interval and (it % display_interval == 0)
save_mp4 = mp4save_interval and (it % mp4save_interval == 0)
if display_image or save_mp4:
ang = ang + 0.01
img_b = color[0].cpu().numpy()
img_o = color_opt[0].detach().cpu().numpy()
img_d = render(glctx, a_mvp, vtx_pos_opt, pos_idx, vtx_col_opt, col_idx, display_res)[0]
img_r = render(glctx, a_mvp, vtx_pos, pos_idx, vtx_col, col_idx, display_res)[0]
scl = display_res // img_o.shape[0]
img_b = np.repeat(np.repeat(img_b, scl, axis=0), scl, axis=1)
img_o = np.repeat(np.repeat(img_o, scl, axis=0), scl, axis=1)
result_image = make_grid(np.stack([img_o, img_b, img_d.detach().cpu().numpy(), img_r.cpu().numpy()]))
if display_image:
util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter))
if save_mp4:
writer.append_data(np.clip(np.rint(result_image*255.0), 0, 255).astype(np.uint8))
# Done.
if writer is not None:
writer.close()
if log_file:
log_file.close()
#----------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(description='Cube fit example')
parser.add_argument('--outdir', help='Specify output directory', default='')
parser.add_argument('--discontinuous', action='store_true', default=False)
parser.add_argument('--resolution', type=int, required=True)
parser.add_argument('--display-interval', type=int, default=0)
parser.add_argument('--mp4save-interval', type=int, default=100)
parser.add_argument('--max-iter', type=int, default=1000)
args = parser.parse_args()
# Set up logging.
if args.outdir:
ds = 'd' if args.discontinuous else 'c'
out_dir = f'{args.outdir}/cube_{ds}_{args.resolution}'
print(f'Saving results under {out_dir}')
else:
out_dir = None
print('No output directory specified, not saving log or images')
# Run.
fit_cube(
max_iter=args.max_iter,
resolution=args.resolution,
discontinuous=args.discontinuous,
log_interval=10,
display_interval=args.display_interval,
out_dir=out_dir,
log_fn='log.txt',
mp4save_interval=args.mp4save_interval,
mp4save_fn='progress.mp4'
)
# Done.
print("Done.")
#----------------------------------------------------------------------------
if __name__ == "__main__":
main()
#----------------------------------------------------------------------------
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import argparse
import os
import pathlib
import sys
import numpy as np
import torch
import util
import nvdiffrast.torch as dr
#----------------------------------------------------------------------------
# Helpers.
def transform_pos(mtx, pos):
t_mtx = torch.from_numpy(mtx).cuda() if isinstance(mtx, np.ndarray) else mtx
posw = torch.cat([pos, torch.ones([pos.shape[0], 1]).cuda()], axis=1)
return torch.matmul(posw, t_mtx.t())[None, ...]
def render(glctx, mtx, pos, pos_idx, uv, uv_idx, tex, resolution, enable_mip, max_mip_level):
pos_clip = transform_pos(mtx, pos)
rast_out, rast_out_db = dr.rasterize(glctx, pos_clip, pos_idx, resolution=[resolution, resolution])
if enable_mip:
texc, texd = dr.interpolate(uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all')
color = dr.texture(tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level)
else:
texc, _ = dr.interpolate(uv[None, ...], rast_out, uv_idx)
color = dr.texture(tex[None, ...], texc, filter_mode='linear')
color = color * torch.clamp(rast_out[..., -1:], 0, 1) # Mask out background.
return color
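# Note on the two paths above: with enable_mip, dr.interpolate() also returns
# image-space derivatives of the texture coordinates (texd), which
# dr.texture() uses to select mipmap levels; the non-mip path falls back to
# plain bilinear lookups, which alias when the globe is rendered far away.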
#----------------------------------------------------------------------------
def fit_earth(max_iter = 20000,
log_interval = 10,
display_interval = None,
display_res = 1024,
enable_mip = True,
res = 512,
ref_res = 4096,
lr_base = 1e-2,
lr_ramp = 0.1,
out_dir = None,
log_fn = None,
texsave_interval = None,
texsave_fn = None,
imgsave_interval = None,
imgsave_fn = None):
log_file = None
if out_dir:
os.makedirs(out_dir, exist_ok=True)
if log_fn:
log_file = open(out_dir + '/' + log_fn, 'wt')
else:
imgsave_interval, texsave_interval = None, None
# Mesh and texture adapted from "3D Earth Photorealistic 2K" model at
# https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125
datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
with np.load(f'{datadir}/earth.npz') as f:
pos_idx, pos, uv_idx, uv, tex = f.values()
tex = tex.astype(np.float32)/255.0
max_mip_level = 9 # Texture is a 4x3 atlas of 512x512 maps.
print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0]))
# Some input geometry contains vertex positions in (N, 4) (with v[:,3]==1). Drop
# the last column in that case.
if pos.shape[1] == 4: pos = pos[:, 0:3]
# Create position/triangle index tensors
pos_idx = torch.from_numpy(pos_idx.astype(np.int32)).cuda()
vtx_pos = torch.from_numpy(pos.astype(np.float32)).cuda()
uv_idx = torch.from_numpy(uv_idx.astype(np.int32)).cuda()
vtx_uv = torch.from_numpy(uv.astype(np.float32)).cuda()
tex = torch.from_numpy(tex.astype(np.float32)).cuda()
tex_opt = torch.full(tex.shape, 0.2, device='cuda', requires_grad=True)
glctx = dr.RasterizeGLContext()
# Adam optimizer for texture with a learning rate ramp.
optimizer = torch.optim.Adam([tex_opt], lr=lr_base)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: lr_ramp**(float(x)/float(max_iter))) # Exponential ramp from lr_base to lr_base*lr_ramp over max_iter iterations.
# Render.
ang = 0.0
texloss_avg = []
for it in range(max_iter + 1):
# Random rotation/translation matrix for optimization.
r_rot = util.random_rotation_translation(0.25)
# Smooth rotation for display.
a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang))
dist = np.random.uniform(0.0, 48.5)
# Modelview and modelview + projection matrices.
proj = util.projection(x=0.4, n=1.0, f=200.0)
r_mv = np.matmul(util.translate(0, 0, -1.5-dist), r_rot)
r_mvp = np.matmul(proj, r_mv).astype(np.float32)
a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot)
a_mvp = np.matmul(proj, a_mv).astype(np.float32)
# Measure texture-space RMSE loss
with torch.no_grad():
texmask = torch.zeros_like(tex)
tr = tex.shape[1]//4
texmask[tr+13:2*tr-13, 25:-25, :] += 1.0
texmask[25:-25, tr+13:2*tr-13, :] += 1.0
# Measure only relevant portions of texture when calculating texture
# PSNR.
texloss = (torch.sum(texmask * (tex - tex_opt)**2)/torch.sum(texmask))**0.5 # RMSE within masked area.
texloss_avg.append(float(texloss))
# Render reference and optimized frames. Always enable mipmapping for reference.
color = render(glctx, r_mvp, vtx_pos, pos_idx, vtx_uv, uv_idx, tex, ref_res, True, max_mip_level)
color_opt = render(glctx, r_mvp, vtx_pos, pos_idx, vtx_uv, uv_idx, tex_opt, res, enable_mip, max_mip_level)
# Reduce the reference to correct size.
while color.shape[1] > res:
color = util.bilinear_downsample(color)
# Compute loss and perform a training step.
loss = torch.mean((color - color_opt)**2) # L2 pixel loss.
optimizer.zero_grad()
loss.backward()
optimizer.step()
scheduler.step()
# Print/save log.
if log_interval and (it % log_interval == 0):
texloss_val = np.mean(np.asarray(texloss_avg))
texloss_avg = []
psnr = -10.0 * np.log10(texloss_val**2) # PSNR based on average RMSE.
s = "iter=%d,loss=%f,psnr=%f" % (it, texloss_val, psnr)
print(s)
if log_file:
log_file.write(s + '\n')
# Show/save image.
display_image = display_interval and (it % display_interval == 0)
save_image = imgsave_interval and (it % imgsave_interval == 0)
save_texture = texsave_interval and (it % texsave_interval) == 0
if display_image or save_image:
ang = ang + 0.1
with torch.no_grad():
result_image = render(glctx, a_mvp, vtx_pos, pos_idx, vtx_uv, uv_idx, tex_opt, res, enable_mip, max_mip_level)[0].cpu().numpy()
if display_image:
util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter))
if save_image:
util.save_image(out_dir + '/' + (imgsave_fn % it), result_image)
if save_texture:
texture = tex_opt.cpu().numpy()[::-1]
util.save_image(out_dir + '/' + (texsave_fn % it), texture)
# Done.
if log_file:
log_file.close()
#----------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(description='Earth texture fitting example')
parser.add_argument('--outdir', help='Specify output directory', default='')
parser.add_argument('--mip', action='store_true', default=False)
parser.add_argument('--display-interval', type=int, default=0)
parser.add_argument('--max-iter', type=int, default=10000)
args = parser.parse_args()
# Set up logging.
if args.outdir:
ms = 'mip' if args.mip else 'nomip'
out_dir = f'{args.outdir}/earth_{ms}'
print(f'Saving results under {out_dir}')
else:
out_dir = None
print('No output directory specified, not saving log or images')
# Run.
fit_earth(
max_iter=args.max_iter,
log_interval=10,
display_interval=args.display_interval,
enable_mip=args.mip,
out_dir=out_dir,
log_fn='log.txt',
texsave_interval=1000,
texsave_fn='tex_%06d.png',
imgsave_interval=1000,
imgsave_fn='img_%06d.png'
)
# Done.
print("Done.")
#----------------------------------------------------------------------------
if __name__ == "__main__":
main()
#----------------------------------------------------------------------------