Commit a1ec436b authored by Samuli Laine

Add CUDA rasterizer

parent 78528e68
@@ -6,7 +6,7 @@
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
-#include "rasterize.h"
+#include "rasterize_gl.h"
#include "glutil.h"
#include <vector>
#define STRINGIFY_SHADER_SOURCE(x) #x
@@ -210,8 +210,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 1) out vec4 out_db;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
-in vec4 gl_FragCoord;
-out float gl_FragDepth;
)
void main()
{
@@ -233,8 +231,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 1) out vec4 out_db;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
-in vec4 gl_FragCoord;
-out float gl_FragDepth;
)
void main()
{
@@ -280,8 +276,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 0) out vec4 out_raster;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
-in vec4 gl_FragCoord;
-out float gl_FragDepth;
)
void main()
{
@@ -300,8 +294,6 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
layout(location = 0) out vec4 out_raster;
IF_ZMODIFY(
layout(location = 1) uniform float in_dummy;
-in vec4 gl_FragCoord;
-out float gl_FragDepth;
)
void main()
{
@@ -364,9 +356,9 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceId
NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer));
}
-bool rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth)
+void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth)
{
-    bool changes = false;
+    changes = false;
// Resize vertex buffer?
if (posCount > s.posCount)
@@ -435,8 +427,6 @@ bool rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, in
changes = true;
}
-    return changes;
}
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx)
...
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#pragma once
//------------------------------------------------------------------------
// Do not try to include OpenGL stuff when compiling CUDA kernels for torch.
#if !(defined(NVDR_TORCH) && defined(__CUDACC__))
#include "framework.h"
#include "glutil.h"
//------------------------------------------------------------------------
// OpenGL-related persistent state for forward op.
struct RasterizeGLState // Must be initializable by memset to zero.
{
int width; // Allocated frame buffer width.
int height; // Allocated frame buffer height.
int depth; // Allocated frame buffer depth.
int posCount; // Allocated position buffer in floats.
int triCount; // Allocated triangle buffer in ints.
GLContext glctx;
GLuint glFBO;
GLuint glColorBuffer[2];
GLuint glPrevOutBuffer;
GLuint glDepthStencilBuffer;
GLuint glVAO;
GLuint glTriBuffer;
GLuint glPosBuffer;
GLuint glProgram;
GLuint glProgramDP;
GLuint glVertexShader;
GLuint glGeometryShader;
GLuint glFragmentShader;
GLuint glFragmentShaderDP;
cudaGraphicsResource_t cudaColorBuffer[2];
cudaGraphicsResource_t cudaPrevOutBuffer;
cudaGraphicsResource_t cudaPosBuffer;
cudaGraphicsResource_t cudaTriBuffer;
int enableDB;
int enableZModify; // Modify depth in shader, workaround for a rasterization issue on A100.
};
//------------------------------------------------------------------------
// Shared C++ code prototypes.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth);
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx);
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s);
//------------------------------------------------------------------------
#endif // !(defined(NVDR_TORCH) && defined(__CUDACC__))
@@ -56,13 +56,25 @@ verbose = True # Print status messages to stdout.
# Internal helper funcs.

def _find_compiler_bindir():
+    hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+    if hostx64_paths != []:
+        return hostx64_paths[0]
    hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
    if hostx64_paths != []:
        return hostx64_paths[0]
+    hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+    if hostx64_paths != []:
+        return hostx64_paths[0]
    hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
    if hostx64_paths != []:
        return hostx64_paths[0]
+    hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+    if hostx64_paths != []:
+        return hostx64_paths[0]
    hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
    if hostx64_paths != []:
        return hostx64_paths[0]
+    hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+    if hostx64_paths != []:
+        return hostx64_paths[0]
    hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
...
@@ -21,7 +21,7 @@
#include "../common/common.cpp"
#include "../common/rasterize.h"
-#include "../common/rasterize.cpp"
+#include "../common/rasterize_gl.cpp"
#include "../common/rasterize.cu"
#include "tf_rasterize.cu"
...
@@ -74,14 +74,15 @@ struct RasterizeFwdOp : public OpKernel
setGLContext(m_glState.glctx); // (Re-)Activate GL context.

// Resize all buffers.
-rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.cpp
-// Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround.
-if (initCtx)
+bool changes = false;
+rasterizeResizeBuffers(ctx, m_glState, changes, posCount, triCount, width, height, depth); // In common/rasterize_gl.cpp
+if (changes)
{
-    // On first execution, do a bonus context swap.
+#ifdef _WIN32
+    // Workaround for occasional blank first frame on Windows.
    releaseGLContext();
    setGLContext(m_glState.glctx);
+#endif
}

// Copy input data to GL and render.
...
@@ -6,5 +6,5 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

-from .ops import RasterizeGLContext, get_log_level, set_log_level, rasterize, DepthPeeler, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash
+from .ops import RasterizeCudaContext, RasterizeGLContext, get_log_level, set_log_level, rasterize, DepthPeeler, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash
-__all__ = ["RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "DepthPeeler", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"]
+__all__ = ["RasterizeCudaContext", "RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "DepthPeeler", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"]
@@ -6,22 +6,23 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

+import importlib
import logging
import numpy as np
import os
-import sys
import torch
import torch.utils.cpp_extension

#----------------------------------------------------------------------------
# C++/Cuda plugin compiler/loader.

-_cached_plugin = None
-def _get_plugin():
+_cached_plugin = {}
+def _get_plugin(gl=False):
+    assert isinstance(gl, bool)

    # Return cached plugin if already loaded.
-    global _cached_plugin
-    if _cached_plugin is not None:
-        return _cached_plugin
+    if _cached_plugin.get(gl, None) is not None:
+        return _cached_plugin[gl]

    # Make sure we can find the necessary compiler and libary binaries.
    if os.name == 'nt':
@@ -29,7 +30,9 @@ def _get_plugin():
        def find_cl_path():
            import glob
            for edition in ['Enterprise', 'Professional', 'BuildTools', 'Community']:
-                paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True)
+                vs_relative_path = r"\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition
+                paths = sorted(glob.glob(r"C:\Program Files" + vs_relative_path), reverse=True)
+                paths += sorted(glob.glob(r"C:\Program Files (x86)" + vs_relative_path), reverse=True)
                if paths:
                    return paths[0]
@@ -43,35 +46,52 @@ def _get_plugin():
    # Compiler options.
    opts = ['-DNVDR_TORCH']

-    # Linker options.
-    if os.name == 'posix':
-        ldflags = ['-lGL', '-lEGL']
-    elif os.name == 'nt':
-        libs = ['gdi32', 'opengl32', 'user32', 'setgpu']
-        ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]
+    # Linker options for the GL-interfacing plugin.
+    ldflags = []
+    if gl:
+        if os.name == 'posix':
+            ldflags = ['-lGL', '-lEGL']
+        elif os.name == 'nt':
+            libs = ['gdi32', 'opengl32', 'user32', 'setgpu']
+            ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]

    # List of source files.
-    source_files = [
-        '../common/common.cpp',
-        '../common/glutil.cpp',
-        '../common/rasterize.cu',
-        '../common/rasterize.cpp',
-        '../common/interpolate.cu',
-        '../common/texture.cu',
-        '../common/texture.cpp',
-        '../common/antialias.cu',
-        'torch_bindings.cpp',
-        'torch_rasterize.cpp',
-        'torch_interpolate.cpp',
-        'torch_texture.cpp',
-        'torch_antialias.cpp',
-    ]
+    if gl:
+        source_files = [
+            '../common/common.cpp',
+            '../common/glutil.cpp',
+            '../common/rasterize_gl.cpp',
+            'torch_bindings_gl.cpp',
+            'torch_rasterize_gl.cpp',
+        ]
+    else:
+        source_files = [
+            '../common/cudaraster/impl/Buffer.cpp',
+            '../common/cudaraster/impl/CudaRaster.cpp',
+            '../common/cudaraster/impl/RasterImpl.cu',
+            '../common/cudaraster/impl/RasterImpl.cpp',
+            '../common/common.cpp',
+            '../common/rasterize.cu',
+            '../common/interpolate.cu',
+            '../common/texture.cu',
+            '../common/texture.cpp',
+            '../common/antialias.cu',
+            'torch_bindings.cpp',
+            'torch_rasterize.cpp',
+            'torch_interpolate.cpp',
+            'torch_texture.cpp',
+            'torch_antialias.cpp',
+        ]

    # Some containers set this to contain old architectures that won't compile. We only need the one installed in the machine.
    os.environ['TORCH_CUDA_ARCH_LIST'] = ''

+    # On Linux, show a warning if GLEW is being forcibly loaded when compiling the GL plugin.
+    if gl and (os.name == 'posix') and ('libGLEW' in os.environ.get('LD_PRELOAD', '')):
+        logging.getLogger('nvdiffrast').warning("Warning: libGLEW is being loaded via LD_PRELOAD, and will probably conflict with the OpenGL plugin")

    # Try to detect if a stray lock file is left in cache directory and show a warning. This sometimes happens on Windows if the build is interrupted at just the right moment.
-    plugin_name = 'nvdiffrast_plugin'
+    plugin_name = 'nvdiffrast_plugin' + ('_gl' if gl else '')
    try:
        lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory(plugin_name, False), 'lock')
        if os.path.exists(lock_fn):
@@ -79,14 +99,27 @@ def _get_plugin():
    except:
        pass

+    # Speed up compilation on Windows.
+    if os.name == 'nt':
+        # Skip telemetry sending step in vcvarsall.bat
+        os.environ['VSCMD_SKIP_SENDTELEMETRY'] = '1'

+        # Opportunistically patch distutils to cache MSVC environments.
+        try:
+            import distutils._msvccompiler
+            import functools
+            if not hasattr(distutils._msvccompiler._get_vc_env, '__wrapped__'):
+                distutils._msvccompiler._get_vc_env = functools.lru_cache()(distutils._msvccompiler._get_vc_env)
+        except:
+            pass

    # Compile and load.
    source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files]
-    torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=opts, extra_cuda_cflags=opts, extra_ldflags=ldflags, with_cuda=True, verbose=False)
+    torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=opts, extra_cuda_cflags=opts+['-lineinfo'], extra_ldflags=ldflags, with_cuda=True, verbose=False)

    # Import, cache, and return the compiled module.
-    import nvdiffrast_plugin
-    _cached_plugin = nvdiffrast_plugin
-    return _cached_plugin
+    _cached_plugin[gl] = importlib.import_module(plugin_name)
+    return _cached_plugin[gl]

#----------------------------------------------------------------------------
# Log level.
@@ -118,7 +151,35 @@ def set_log_level(level):
    _get_plugin().set_log_level(level)

#----------------------------------------------------------------------------
-# GL State wrapper.
+# CudaRaster state wrapper.
#----------------------------------------------------------------------------
class RasterizeCudaContext:
def __init__(self, device=None):
'''Create a new Cuda rasterizer context.
The context is deleted and internal storage is released when the object is
destroyed.
Args:
device (Optional): Cuda device on which the context is created. Type can be
`torch.device`, string (e.g., `'cuda:1'`), or int. If not
specified, context will be created on currently active Cuda
device.
Returns:
The newly created Cuda rasterizer context.
'''
if device is None:
cuda_device_idx = torch.cuda.current_device()
else:
with torch.cuda.device(device):
cuda_device_idx = torch.cuda.current_device()
self.cpp_wrapper = _get_plugin().RasterizeCRStateWrapper(cuda_device_idx)
self.output_db = True
self.active_depth_peeler = None
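A minimal usage sketch for the new context, following the docstring above (an illustration, not part of the commit; assumes at least one CUDA device is visible):

import nvdiffrast.torch as dr

# The device argument is optional; omit it to use the currently active CUDA device.
ctx = dr.RasterizeCudaContext(device='cuda:0')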
#----------------------------------------------------------------------------
# GL state wrapper.
#----------------------------------------------------------------------------

class RasterizeGLContext:
@@ -157,8 +218,8 @@ class RasterizeGLContext:
        else:
            with torch.cuda.device(device):
                cuda_device_idx = torch.cuda.current_device()
-        self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)
+        self.cpp_wrapper = _get_plugin(gl=True).RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)
-        self.active_depth_peeler = None # For error checking only
+        self.active_depth_peeler = None # For error checking only.

    def set_context(self):
        '''Set (activate) OpenGL context in the current CPU thread.
@@ -180,8 +241,11 @@ class RasterizeGLContext:

class _rasterize_func(torch.autograd.Function):
    @staticmethod
-    def forward(ctx, glctx, pos, tri, resolution, ranges, grad_db, peeling_idx):
-        out, out_db = _get_plugin().rasterize_fwd(glctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx)
+    def forward(ctx, raster_ctx, pos, tri, resolution, ranges, grad_db, peeling_idx):
+        if isinstance(raster_ctx, RasterizeGLContext):
+            out, out_db = _get_plugin(gl=True).rasterize_fwd_gl(raster_ctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx)
+        else:
+            out, out_db = _get_plugin().rasterize_fwd_cuda(raster_ctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx)
        ctx.save_for_backward(pos, tri, out)
        ctx.saved_grad_db = grad_db
        return out, out_db
@@ -204,7 +268,7 @@ def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
    output tensors will be contiguous and reside in GPU memory.

    Args:
-        glctx: OpenGL context of type `RasterizeGLContext`.
+        glctx: Rasterizer context of type `RasterizeGLContext` or `RasterizeCudaContext`.
        pos: Vertex position tensor with dtype `torch.float32`. To enable range
             mode, this tensor should have a 2D shape [num_vertices, 4]. To enable
             instanced mode, use a 3D shape [minibatch_size, num_vertices, 4].
@@ -214,8 +278,8 @@ def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
                `torch.int32`, specifying start indices and counts into `tri`.
                Ignored in instanced mode.
        grad_db: Propagate gradients of image-space derivatives of barycentrics
-                 into `pos` in backward pass. Ignored if OpenGL context was
-                 not configured to output image-space derivatives.
+                 into `pos` in backward pass. Ignored if using an OpenGL context that
+                 was not configured to output image-space derivatives.

    Returns:
        A tuple of two tensors. The first output tensor has shape [minibatch_size,
@@ -226,7 +290,7 @@ def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True):
        (du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape
        [minibatch_size, height, width, 0].
    '''
-    assert isinstance(glctx, RasterizeGLContext)
+    assert isinstance(glctx, (RasterizeGLContext, RasterizeCudaContext))
    assert grad_db is True or grad_db is False
    grad_db = grad_db and glctx.output_db
@@ -258,7 +322,7 @@ class DepthPeeler:
        Returns:
            The newly created depth peeler.
        '''
-        assert isinstance(glctx, RasterizeGLContext)
+        assert isinstance(glctx, (RasterizeGLContext, RasterizeCudaContext))
        assert grad_db is True or grad_db is False
        grad_db = grad_db and glctx.output_db
@@ -271,7 +335,7 @@ class DepthPeeler:
        assert isinstance(ranges, torch.Tensor)

        # Store all the parameters.
-        self.glctx = glctx
+        self.raster_ctx = glctx
        self.pos = pos
        self.tri = tri
        self.resolution = resolution
@@ -280,18 +344,18 @@ class DepthPeeler:
        self.peeling_idx = None

    def __enter__(self):
-        if self.glctx is None:
+        if self.raster_ctx is None:
            raise RuntimeError("Cannot re-enter a terminated depth peeling operation")
-        if self.glctx.active_depth_peeler is not None:
-            raise RuntimeError("Cannot have multiple depth peelers active simultaneously in a RasterizeGLContext")
-        self.glctx.active_depth_peeler = self
+        if self.raster_ctx.active_depth_peeler is not None:
+            raise RuntimeError("Cannot have multiple depth peelers active simultaneously in a rasterization context")
+        self.raster_ctx.active_depth_peeler = self
        self.peeling_idx = 0
        return self

    def __exit__(self, *args):
-        assert self.glctx.active_depth_peeler is self
-        self.glctx.active_depth_peeler = None
-        self.glctx = None # Remove all references to input tensor so they're not left dangling.
+        assert self.raster_ctx.active_depth_peeler is self
+        self.raster_ctx.active_depth_peeler = None
+        self.raster_ctx = None # Remove all references to input tensor so they're not left dangling.
        self.pos = None
        self.tri = None
        self.resolution = None
@@ -309,9 +373,9 @@ class DepthPeeler:
        Returns:
            A tuple of two tensors as in `rasterize()`.
        '''
-        assert self.glctx.active_depth_peeler is self
+        assert self.raster_ctx.active_depth_peeler is self
        assert self.peeling_idx >= 0
-        result = _rasterize_func.apply(self.glctx, self.pos, self.tri, self.resolution, self.ranges, self.grad_db, self.peeling_idx)
+        result = _rasterize_func.apply(self.raster_ctx, self.pos, self.tri, self.resolution, self.ranges, self.grad_db, self.peeling_idx)
        self.peeling_idx += 1
        return result
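The depth peeler is used the same way with either backend; a minimal sketch of the context-manager pattern (illustrative only; `pos`, `tri`, and the resolution are assumed to be set up as for `rasterize()`, and `dr` is `nvdiffrast.torch`):

# Peel the two nearest depth layers with the new CUDA context.
ctx = dr.RasterizeCudaContext()
with dr.DepthPeeler(ctx, pos, tri, resolution=[256, 256]) as peeler:
    rast_first,  db_first  = peeler.rasterize_next_layer()  # nearest surfaces
    rast_second, db_second = peeler.rasterize_next_layer()  # next-nearest surfaces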
@@ -604,6 +668,14 @@ def antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0)
All input tensors must be contiguous and reside in GPU memory. The output tensor
will be contiguous and reside in GPU memory.
Note that silhouette edge determination is based on vertex indices in the triangle
tensor. For it to work properly, a vertex belonging to multiple triangles must be
referred to using the same vertex index in each triangle. Otherwise, nvdiffrast will always
classify the adjacent edges as silhouette edges, which leads to bad performance and
potentially incorrect gradients. If you are unsure whether your data is good, check
which pixels are modified by the antialias operation and compare to the example in the
documentation.
Args:
    color: Input image to antialias with shape [minibatch_size, height, width, num_channels].
    rast: Main output tensor from `rasterize()`.
...
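Relating to the silhouette-edge note added to the antialias docstring above: a quick way to see which pixels the operation actually touched is to compare its output against the input color buffer. A rough sketch (variable names are illustrative; `color` is assumed to come from `interpolate()` and `rast`, `pos`, `tri` from `rasterize()`):

# Visualize antialias coverage to sanity-check vertex index sharing.
color_aa = dr.antialias(color, rast, pos, tri)
modified = (color_aa != color).any(dim=-1)          # [minibatch, height, width] boolean mask
print('antialiased pixels:', int(modified.sum()))   # should hug silhouette edges only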
@@ -20,7 +20,7 @@
#define OP_RETURN_TTV std::tuple<torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >
#define OP_RETURN_TTTTV std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::vector<torch::Tensor> >

-OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int depth_idx);
+OP_RETURN_TT rasterize_fwd_cuda (RasterizeCRStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx);
OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
OP_RETURN_T rasterize_grad_db (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb);
OP_RETURN_TT interpolate_fwd (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri);
@@ -42,9 +42,7 @@ OP_RETURN_TT antialias_grad (torch::Tensor color, to
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// State classes.
-pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
-    .def("set_context", &RasterizeGLStateWrapper::setContext)
-    .def("release_context", &RasterizeGLStateWrapper::releaseContext);
+pybind11::class_<RasterizeCRStateWrapper>(m, "RasterizeCRStateWrapper").def(pybind11::init<int>());
pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper").def(pybind11::init<>());
pybind11::class_<TopologyHashWrapper>(m, "TopologyHashWrapper");
@@ -53,7 +51,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("set_log_level", [](int level){ FLAGS_caffe2_log_level = level; }, "set log level");

// Ops.
-m.def("rasterize_fwd", &rasterize_fwd, "rasterize forward op");
+m.def("rasterize_fwd_cuda", &rasterize_fwd_cuda, "rasterize forward op (cuda)");
m.def("rasterize_grad", &rasterize_grad, "rasterize gradient op ignoring db gradients");
m.def("rasterize_grad_db", &rasterize_grad_db, "rasterize gradient op with db gradients");
m.def("interpolate_fwd", &interpolate_fwd, "interpolate forward op with attribute derivatives");
...
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include <tuple>
//------------------------------------------------------------------------
// Op prototypes.
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_gl(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx);
//------------------------------------------------------------------------
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// State classes.
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
.def("set_context", &RasterizeGLStateWrapper::setContext)
.def("release_context", &RasterizeGLStateWrapper::releaseContext);
// Ops.
m.def("rasterize_fwd_gl", &rasterize_fwd_gl, "rasterize forward op (opengl)");
}
//------------------------------------------------------------------------
@@ -10,55 +10,41 @@
#include "torch_types.h"
#include "../common/common.h"
#include "../common/rasterize.h"
+#include "../common/cudaraster/CudaRaster.hpp"
+#include "../common/cudaraster/impl/Constants.hpp"
#include <tuple>

//------------------------------------------------------------------------
// Kernel prototypes.

+void RasterizeCudaFwdShaderKernel(const RasterizeCudaFwdShaderParams p);
void RasterizeGradKernel(const RasterizeGradParams p);
void RasterizeGradKernelDb(const RasterizeGradParams p);

//------------------------------------------------------------------------
-// Python GL state wrapper methods.
+// Python CudaRaster state wrapper methods.

-RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
+RasterizeCRStateWrapper::RasterizeCRStateWrapper(int cudaDeviceIdx_)
{
-    pState = new RasterizeGLState();
-    automatic = automatic_;
+    const at::cuda::OptionalCUDAGuard device_guard(cudaDeviceIdx_);
    cudaDeviceIdx = cudaDeviceIdx_;
-    memset(pState, 0, sizeof(RasterizeGLState));
-    pState->enableDB = enableDB ? 1 : 0;
-    rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
-    releaseGLContext();
+    cr = new CR::CudaRaster();
}

-RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void)
+RasterizeCRStateWrapper::~RasterizeCRStateWrapper(void)
{
-    setGLContext(pState->glctx);
-    rasterizeReleaseBuffers(NVDR_CTX_PARAMS, *pState);
-    releaseGLContext();
-    destroyGLContext(pState->glctx);
-    delete pState;
-}
-
-void RasterizeGLStateWrapper::setContext(void)
-{
-    setGLContext(pState->glctx);
-}
-
-void RasterizeGLStateWrapper::releaseContext(void)
-{
-    releaseGLContext();
+    const at::cuda::OptionalCUDAGuard device_guard(cudaDeviceIdx);
+    delete cr;
}

//------------------------------------------------------------------------
-// Forward op.
+// Forward op (Cuda).

-std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx)
+std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_cuda(RasterizeCRStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx)
{
    const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    RasterizeGLState& s = *stateWrapper.pState;
+    CR::CudaRaster* cr = stateWrapper.cr;

    // Check inputs.
    NVDR_CHECK_DEVICE(pos, tri);
@@ -67,11 +53,8 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
    NVDR_CHECK_F32(pos);
    NVDR_CHECK_I32(tri, ranges);

-    // Check that GL context was created for the correct GPU.
-    NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must must reside on the same device as input tensors");
-
-    // Determine number of outputs
-    int num_outputs = s.enableDB ? 2 : 1;
+    // Check that CudaRaster context was created for the correct GPU.
+    NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "CudaRaster context must must reside on the same device as input tensors");

    // Determine instance mode and check input dimensions.
    bool instance_mode = pos.sizes().size() > 2;
@@ -87,49 +70,75 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
    // Get output shape.
    int height = std::get<0>(resolution);
    int width = std::get<1>(resolution);
-    int depth = instance_mode ? pos.size(0) : ranges.size(0);
+    int depth = instance_mode ? pos.size(0) : ranges.size(0); // Depth of tensor, not related to depth buffering.
    NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]");

-    // Get position and triangle buffer sizes in int32/float32.
-    int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1);
-    int triCount = 3 * tri.size(0);
+    // Check resolution compatibility with CudaRaster.
+    TORCH_CHECK(height <= CR_MAXVIEWPORT_SIZE && width <= CR_MAXVIEWPORT_SIZE, "resolution must be [<=", CR_MAXVIEWPORT_SIZE, ", <=", CR_MAXVIEWPORT_SIZE, "]");
+    TORCH_CHECK(((height | width) & (CR_TILE_SIZE - 1)) == 0, "width and height must be divisible by ", CR_TILE_SIZE);

-    // Set the GL context unless manual context.
-    if (stateWrapper.automatic)
-        setGLContext(s.glctx);
+    // Get position and triangle buffer sizes in vertices / triangles.
+    int posCount = instance_mode ? pos.size(1) : pos.size(0);
+    int triCount = tri.size(0);

-    // Resize all buffers.
-    if (rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, posCount, triCount, width, height, depth))
-    {
-#ifdef _WIN32
-        // Workaround for occasional blank first frame on Windows.
-        releaseGLContext();
-        setGLContext(s.glctx);
-#endif
-    }
-
-    // Copy input data to GL and render.
+    // Render.
    const float* posPtr = pos.data_ptr<float>();
    const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr<int32_t>(); // This is in CPU memory.
    const int32_t* triPtr = tri.data_ptr<int32_t>();
-    int vtxPerInstance = instance_mode ? pos.size(1) : 0;
-    rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, peeling_idx);

+    // Set up CudaRaster.
+    cr->setViewportSize(width, height, depth);
+    cr->setVertexBuffer((void*)posPtr, posCount);
+    cr->setIndexBuffer((void*)triPtr, triCount);

+    // Enable depth peeling?
+    bool enablePeel = (peeling_idx > 0);
+    cr->setRenderModeFlags(enablePeel ? CR::CudaRaster::RenderModeFlag_EnableDepthPeeling : 0); // No backface culling.
+    if (enablePeel)
+        cr->swapDepthAndPeel(); // Use previous depth buffer as peeling depth input.

+    // Run CudaRaster in one large batch. In case of error, the workload could be split into smaller batches - maybe do that in the future.
+    cr->deferredClear(0u);
+    bool success = cr->drawTriangles(rangesPtr, stream);
+    NVDR_CHECK(success, "subtriangle count overflow");

    // Allocate output tensors.
    torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
    torch::Tensor out = torch::empty({depth, height, width, 4}, opts);
-    torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts);
-    float* outputPtr[2];
-    outputPtr[0] = out.data_ptr<float>();
-    outputPtr[1] = s.enableDB ? out_db.data_ptr<float>() : NULL;
+    torch::Tensor out_db = torch::empty({depth, height, width, 4}, opts);

-    // Copy rasterized results into CUDA buffers.
-    rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth);
+    // Populate pixel shader kernel parameters.
+    RasterizeCudaFwdShaderParams p;
+    p.pos = posPtr;
+    p.tri = triPtr;
+    p.in_idx = (const int*)cr->getColorBuffer();
+    p.out = out.data_ptr<float>();
+    p.out_db = out_db.data_ptr<float>();
+    p.numTriangles = triCount;
+    p.numVertices = posCount;
+    p.width = width;
+    p.height = height;
+    p.depth = depth;
+    p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
+    p.xs = 2.f / (float)p.width;
+    p.xo = 1.f / (float)p.width - 1.f;
+    p.ys = 2.f / (float)p.height;
+    p.yo = 1.f / (float)p.height - 1.f;

-    // Done. Release GL context and return.
-    if (stateWrapper.automatic)
-        releaseGLContext();
+    // Verify that buffers are aligned to allow float2/float4 operations.
+    NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4");
+    NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4");
+    NVDR_CHECK(!((uintptr_t)p.out_db & 15), "out_db output tensor not aligned to float4");

+    // Choose launch parameters.
+    dim3 blockSize = getLaunchBlockSize(RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH, RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT, p.width, p.height);
+    dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);

+    // Launch CUDA kernel.
+    void* args[] = {&p};
+    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)RasterizeCudaFwdShaderKernel, gridSize, blockSize, args, 0, stream));

+    // Return.
    return std::tuple<torch::Tensor, torch::Tensor>(out, out_db);
}
...
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "torch_common.inl"
#include "torch_types.h"
#include "../common/common.h"
#include "../common/rasterize_gl.h"
#include <tuple>
//------------------------------------------------------------------------
// Python GL state wrapper methods.
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
{
pState = new RasterizeGLState();
automatic = automatic_;
cudaDeviceIdx = cudaDeviceIdx_;
memset(pState, 0, sizeof(RasterizeGLState));
pState->enableDB = enableDB ? 1 : 0;
rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
releaseGLContext();
}
RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void)
{
setGLContext(pState->glctx);
rasterizeReleaseBuffers(NVDR_CTX_PARAMS, *pState);
releaseGLContext();
destroyGLContext(pState->glctx);
delete pState;
}
void RasterizeGLStateWrapper::setContext(void)
{
setGLContext(pState->glctx);
}
void RasterizeGLStateWrapper::releaseContext(void)
{
releaseGLContext();
}
//------------------------------------------------------------------------
// Forward op (OpenGL).
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd_gl(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges, int peeling_idx)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGLState& s = *stateWrapper.pState;
// Check inputs.
NVDR_CHECK_DEVICE(pos, tri);
NVDR_CHECK_CPU(ranges);
NVDR_CHECK_CONTIGUOUS(pos, tri, ranges);
NVDR_CHECK_F32(pos);
NVDR_CHECK_I32(tri, ranges);
// Check that GL context was created for the correct GPU.
NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must must reside on the same device as input tensors");
// Determine number of outputs
int num_outputs = s.enableDB ? 2 : 1;
// Determine instance mode and check input dimensions.
bool instance_mode = pos.sizes().size() > 2;
if (instance_mode)
NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "instance mode - pos must have shape [>0, >0, 4]");
else
{
NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "range mode - pos must have shape [>0, 4]");
NVDR_CHECK(ranges.sizes().size() == 2 && ranges.size(0) > 0 && ranges.size(1) == 2, "range mode - ranges must have shape [>0, 2]");
}
NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]");
// Get output shape.
int height = std::get<0>(resolution);
int width = std::get<1>(resolution);
int depth = instance_mode ? pos.size(0) : ranges.size(0);
NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]");
// Get position and triangle buffer sizes in int32/float32.
int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1);
int triCount = 3 * tri.size(0);
// Set the GL context unless manual context.
if (stateWrapper.automatic)
setGLContext(s.glctx);
// Resize all buffers.
bool changes = false;
rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, changes, posCount, triCount, width, height, depth);
if (changes)
{
#ifdef _WIN32
// Workaround for occasional blank first frame on Windows.
releaseGLContext();
setGLContext(s.glctx);
#endif
}
// Copy input data to GL and render.
const float* posPtr = pos.data_ptr<float>();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr<int32_t>(); // This is in CPU memory.
const int32_t* triPtr = tri.data_ptr<int32_t>();
int vtxPerInstance = instance_mode ? pos.size(1) : 0;
rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, peeling_idx);
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({depth, height, width, 4}, opts);
torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts);
float* outputPtr[2];
outputPtr[0] = out.data_ptr<float>();
outputPtr[1] = s.enableDB ? out_db.data_ptr<float>() : NULL;
// Copy rasterized results into CUDA buffers.
rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth);
// Done. Release GL context and return.
if (stateWrapper.automatic)
releaseGLContext();
return std::tuple<torch::Tensor, torch::Tensor>(out, out_db);
}
//------------------------------------------------------------------------
@@ -26,6 +26,20 @@ public:
int cudaDeviceIdx;
};
//------------------------------------------------------------------------
// Python CudaRaster state wrapper.
namespace CR { class CudaRaster; }
class RasterizeCRStateWrapper
{
public:
RasterizeCRStateWrapper (int cudaDeviceIdx);
~RasterizeCRStateWrapper (void);
CR::CudaRaster* cr;
int cudaDeviceIdx;
};
//------------------------------------------------------------------------
// Mipmap wrapper to prevent intrusion from Python side.
......
@@ -48,7 +48,8 @@ def fit_cube(max_iter = 5000,
             out_dir = None,
             log_fn = None,
             mp4save_interval = None,
-             mp4save_fn = None):
+             mp4save_fn = None,
+             use_opengl = False):

    log_file = None
    writer = None
@@ -73,7 +74,8 @@ def fit_cube(max_iter = 5000,
    vtx_pos = torch.from_numpy(vtxp.astype(np.float32)).cuda()
    vtx_col = torch.from_numpy(vtxc.astype(np.float32)).cuda()

-    glctx = dr.RasterizeGLContext()
+    # Rasterizer context
+    glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()

    # Repeats.
    for rep in range(repeats):
@@ -161,7 +163,8 @@ def fit_cube(max_iter = 5000,
def main():
    parser = argparse.ArgumentParser(description='Cube fit example')
-    parser.add_argument('--outdir', help='Specify output directory', default='')
+    parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
+    parser.add_argument('--outdir', help='specify output directory', default='')
    parser.add_argument('--discontinuous', action='store_true', default=False)
    parser.add_argument('--resolution', type=int, default=0, required=True)
    parser.add_argument('--display-interval', type=int, default=0)
@@ -188,7 +191,8 @@ def main():
        out_dir=out_dir,
        log_fn='log.txt',
        mp4save_interval=args.mp4save_interval,
-        mp4save_fn='progress.mp4'
+        mp4save_fn='progress.mp4',
+        use_opengl=args.opengl
    )

    # Done.
...
@@ -47,7 +47,7 @@ def fit_earth(max_iter = 20000,
              display_res = 1024,
              enable_mip = True,
              res = 512,
-              ref_res = 4096,
+              ref_res = 2048, # Dropped from 4096 to 2048 to allow using the Cuda rasterizer.
              lr_base = 1e-2,
              lr_ramp = 0.1,
              out_dir = None,
@@ -55,7 +55,8 @@ def fit_earth(max_iter = 20000,
              texsave_interval = None,
              texsave_fn = None,
              imgsave_interval = None,
-              imgsave_fn = None):
+              imgsave_fn = None,
+              use_opengl = False):

    log_file = None
    if out_dir:
@@ -64,7 +65,7 @@ def fit_earth(max_iter = 20000,
        log_file = open(out_dir + '/' + log_fn, 'wt')
    else:
        imgsave_interval, texsave_interval = None, None

    # Mesh and texture adapted from "3D Earth Photorealistic 2K" model at
    # https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125
    datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
@@ -86,7 +87,7 @@ def fit_earth(max_iter = 20000,
    tex = torch.from_numpy(tex.astype(np.float32)).cuda()
    tex_opt = torch.full(tex.shape, 0.2, device='cuda', requires_grad=True)
-    glctx = dr.RasterizeGLContext()
+    glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()

    ang = 0.0
@@ -177,8 +178,9 @@ def fit_earth(max_iter = 20000,
def main():
    parser = argparse.ArgumentParser(description='Earth texture fitting example')
-    parser.add_argument('--outdir', help='Specify output directory', default='')
-    parser.add_argument('--mip', action='store_true', default=False)
+    parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
+    parser.add_argument('--outdir', help='specify output directory', default='')
+    parser.add_argument('--mip', help='enable mipmapping', action='store_true', default=False)
    parser.add_argument('--display-interval', type=int, default=0)
    parser.add_argument('--max-iter', type=int, default=10000)
    args = parser.parse_args()
@@ -193,7 +195,7 @@ def main():
        print ('No output directory specified, not saving log or images')

    # Run.
-    fit_earth(max_iter=args.max_iter, log_interval=10, display_interval=args.display_interval, enable_mip=args.mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png')
+    fit_earth(max_iter=args.max_iter, log_interval=10, display_interval=args.display_interval, enable_mip=args.mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png', use_opengl=args.opengl)

    # Done.
    print("Done.")
...
@@ -32,7 +32,8 @@ def fit_env_phong(max_iter = 1000,
                  out_dir = None,
                  log_fn = None,
                  mp4save_interval = None,
-                  mp4save_fn = None):
+                  mp4save_fn = None,
+                  use_opengl = False):

    log_file = None
    writer = None
@@ -74,7 +75,7 @@ def fit_env_phong(max_iter = 1000,
    # Render.
    ang = 0.0
    imgloss_avg, phong_avg = [], []
-    glctx = dr.RasterizeGLContext()
+    glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()
    zero_tensor = torch.as_tensor(0.0, dtype=torch.float32, device='cuda')
    one_tensor = torch.as_tensor(1.0, dtype=torch.float32, device='cuda')
@@ -162,7 +163,7 @@ def fit_env_phong(max_iter = 1000,
        if log_file:
            log_file.write(s + '\n')

        # Show/save result image.
        display_image = display_interval and (it % display_interval == 0)
        save_mp4 = mp4save_interval and (it % mp4save_interval == 0)
@@ -193,7 +194,8 @@ def main():
def main():
    parser = argparse.ArgumentParser(description='Environment map fitting example')
-    parser.add_argument('--outdir', help='Specify output directory', default='')
+    parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
+    parser.add_argument('--outdir', help='specify output directory', default='')
    parser.add_argument('--display-interval', type=int, default=0)
    parser.add_argument('--mp4save-interval', type=int, default=10)
    parser.add_argument('--max-iter', type=int, default=5000)
@@ -214,7 +216,8 @@ def main():
        display_interval=args.display_interval,
        out_dir=out_dir,
        mp4save_interval=args.mp4save_interval,
-        mp4save_fn='progress.mp4'
+        mp4save_fn='progress.mp4',
+        use_opengl=args.opengl
    )

    # Done.
...
@@ -132,7 +132,8 @@ def fit_pose(max_iter = 10000,
             out_dir = None,
             log_fn = None,
             mp4save_interval = None,
-             mp4save_fn = None):
+             mp4save_fn = None,
+             use_opengl = False):

    log_file = None
    writer = None
@@ -160,7 +161,7 @@ def fit_pose(max_iter = 10000,
    col_idx = torch.from_numpy(col_idx.astype(np.int32)).cuda()
    vtx_col = torch.from_numpy(col.astype(np.float32)).cuda()
-    glctx = dr.RasterizeGLContext()
+    glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()

    for rep in range(repeats):
        pose_target = torch.tensor(q_rnd(), device='cuda')
@@ -253,7 +254,8 @@
def main():
    parser = argparse.ArgumentParser(description='Cube pose fitting example')
-    parser.add_argument('--outdir', help='Specify output directory', default='')
+    parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
+    parser.add_argument('--outdir', help='specify output directory', default='')
    parser.add_argument('--display-interval', type=int, default=0)
    parser.add_argument('--mp4save-interval', type=int, default=10)
    parser.add_argument('--max-iter', type=int, default=1000)
@@ -277,7 +279,8 @@ def main():
        out_dir=out_dir,
        log_fn='log.txt',
        mp4save_interval=args.mp4save_interval,
-        mp4save_fn='progress.mp4'
+        mp4save_fn='progress.mp4',
+        use_opengl=args.opengl
    )

    # Done.
...
@@ -10,15 +10,23 @@ import imageio
import numpy as np
import torch
import nvdiffrast.torch as dr
+import sys

def tensor(*args, **kwargs):
    return torch.tensor(*args, device='cuda', **kwargs)

+if sys.argv[1:] == ['--cuda']:
+    glctx = dr.RasterizeCudaContext()
+elif sys.argv[1:] == ['--opengl']:
+    glctx = dr.RasterizeGLContext()
+else:
+    print("Specify either --cuda or --opengl")
+    exit(1)

pos = tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=torch.float32)
col = tensor([[[1, 0, 0], [0, 1, 0], [0, 0, 1]]], dtype=torch.float32)
tri = tensor([[0, 1, 2]], dtype=torch.int32)

-glctx = dr.RasterizeGLContext()
rast, _ = dr.rasterize(glctx, pos, tri, resolution=[256, 256])
out, _ = dr.interpolate(col, rast, tri)
...