Commit ce7063f1 authored by Samuli Laine

Support for multiple GPUs, mip bias input for texture op

parent 2468e2a0
......@@ -6,4 +6,4 @@
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
__version__ = '0.2.0'
__version__ = '0.2.1'
......@@ -185,6 +185,8 @@ template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
//------------------------------------------------------------------------
// Coalesced atomics. These are all done via macros.
#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
#define CA_TEMP _ca_temp
#define CA_TEMP_PARAM float* CA_TEMP
#define CA_DECLARE_TEMP(threads_per_block) \
......@@ -228,5 +230,24 @@ template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
caAtomicAdd((ptr)+(idx), (value)); \
} while(0)
//------------------------------------------------------------------------
// Disable atomic coalescing for compute capability lower than 7.x
#else // __CUDA_ARCH__ >= 700
#define CA_TEMP _ca_temp
#define CA_TEMP_PARAM float CA_TEMP
#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
#define CA_SET_GROUP_MASK(group, thread_mask)
#define CA_SET_GROUP(group)
#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
#define caAtomicAdd3_xyw(ptr, x, y, w) \
do { \
atomicAdd((ptr), (x)); \
atomicAdd((ptr)+1, (y)); \
atomicAdd((ptr)+3, (w)); \
} while(0)
#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
#endif // __CUDA_ARCH__ >= 700
//------------------------------------------------------------------------
#endif // __CUDACC__
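For background, these macros implement warp-aggregated atomics: on compute capability 7.x and higher, __match_any_sync() groups the active lanes of a warp by the address they are about to update, and a single lane then performs one atomicAdd() on behalf of its whole group. A minimal sketch of the same idea, shown here as the classic aggregated-increment pattern rather than the per-address float accumulation the macros perform (the function name is illustrative, not part of the library):
// Sketch: one atomic per warp-group of lanes that target the same counter.
// Requires __CUDA_ARCH__ >= 700 for __match_any_sync(); assumes a 1D thread block.
static __device__ __forceinline__ int atomicAggInc(int* ctr)
{
    unsigned int active = __activemask();                                    // lanes currently executing
    unsigned int group  = __match_any_sync(active, (unsigned long long)ctr); // lanes hitting the same address
    int lane   = threadIdx.x & 31;
    int leader = __ffs(group) - 1;                                           // lowest-numbered lane of the group
    int base = 0;
    if (lane == leader)
        base = atomicAdd(ctr, __popc(group));                                // one atomic for the whole group
    base = __shfl_sync(group, base, leader);                                 // broadcast the group's base value
    return base + __popc(group & ((1u << lane) - 1));                        // this lane's offset within the group
}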
......@@ -36,6 +36,7 @@ using namespace tensorflow::shape_inference;
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>
#include <c10/cuda/CUDAGuard.h>
#include <pybind11/numpy.h>
#endif
#define NVDR_CTX_ARGS int _nvdr_ctx_dummy
......
......@@ -37,26 +37,43 @@ struct GLContext
static void setGLContext(GLContext& glctx)
{
if (!glctx.hglrc)
LOG(ERROR) << "setGLContext() called with null glctx";
LOG(FATAL) << "setGLContext() called with null glctx";
if (!wglMakeCurrent(glctx.hdc, glctx.hglrc))
LOG(ERROR) << "wglMakeCurrent() failed when setting GL context";
LOG(FATAL) << "wglMakeCurrent() failed when setting GL context";
if (glctx.glewInitialized)
return;
GLenum result = glewInit();
if (result != GLEW_OK)
LOG(ERROR) << "glewInit() failed, return value = " << result;
LOG(FATAL) << "glewInit() failed, return value = " << result;
glctx.glewInitialized = 1;
}
static void releaseGLContext(void)
{
if (!wglMakeCurrent(NULL, NULL))
LOG(ERROR) << "wglMakeCurrent() failed when releasing GL context";
LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context";
}
static GLContext createGLContext(void)
extern "C" int set_gpu(const char*);
static GLContext createGLContext(int cudaDeviceIdx)
{
if (cudaDeviceIdx >= 0)
{
char pciBusId[256] = "";
LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx) != CUDA_SUCCESS)
{
LOG(INFO) << "PCI bus id query failed";
}
else
{
int res = set_gpu(pciBusId);
LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? "failed, expect crash or major slowdown" : "success");
}
}
HINSTANCE hInstance = GetModuleHandle(NULL);
WNDCLASS wc = {};
wc.style = CS_OWNDC;
......@@ -101,7 +118,7 @@ static GLContext createGLContext(void)
static void destroyGLContext(GLContext& glctx)
{
if (!glctx.hglrc)
LOG(ERROR) << "destroyGLContext() called with null glctx";
LOG(FATAL) << "destroyGLContext() called with null glctx";
// If this is the current context, release it.
if (wglGetCurrentContext() == glctx.hglrc)
......@@ -109,13 +126,13 @@ static void destroyGLContext(GLContext& glctx)
HWND hwnd = WindowFromDC(glctx.hdc);
if (!hwnd)
LOG(ERROR) << "WindowFromDC() failed";
LOG(FATAL) << "WindowFromDC() failed";
if (!ReleaseDC(hwnd, glctx.hdc))
LOG(ERROR) << "ReleaseDC() failed";
LOG(FATAL) << "ReleaseDC() failed";
if (!wglDeleteContext(glctx.hglrc))
LOG(ERROR) << "wglDeleteContext() failed";
LOG(FATAL) << "wglDeleteContext() failed";
if (!DestroyWindow(hwnd))
LOG(ERROR) << "DestroyWindow() failed";
LOG(FATAL) << "DestroyWindow() failed";
LOG(INFO) << std::hex << std::setfill('0')
<< "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc
......@@ -140,6 +157,7 @@ static void destroyGLContext(GLContext& glctx)
# include <GL/glew.h> // Use system-supplied glew.h
#endif
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include <GL/gl.h>
#include <cuda_gl_interop.h>
......@@ -148,7 +166,6 @@ static void destroyGLContext(GLContext& glctx)
struct GLContext
{
EGLDisplay display;
EGLSurface surface;
EGLContext context;
int glewInitialized;
};
......@@ -158,9 +175,9 @@ struct GLContext
static void setGLContext(GLContext& glctx)
{
if (!glctx.context)
LOG(ERROR) << "setGLContext() called with null glctx";
LOG(FATAL) << "setGLContext() called with null glctx";
if (!eglMakeCurrent(glctx.display, glctx.surface, glctx.surface, glctx.context))
if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context))
LOG(ERROR) << "eglMakeCurrent() failed when setting GL context";
if (glctx.glewInitialized)
......@@ -168,7 +185,7 @@ static void setGLContext(GLContext& glctx)
GLenum result = glewInit();
if (result != GLEW_OK)
LOG(ERROR) << "glewInit() failed, return value = " << result;
LOG(FATAL) << "glewInit() failed, return value = " << result;
glctx.glewInitialized = 1;
}
......@@ -178,21 +195,83 @@ static void releaseGLContext(void)
if (display == EGL_NO_DISPLAY)
LOG(WARNING) << "releaseGLContext() called with no active display";
if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT))
LOG(ERROR) << "eglMakeCurrent() failed when releasing GL context";
LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context";
}
static GLContext createGLContext(void)
static EGLDisplay getCudaDisplay(int cudaDeviceIdx)
{
// Initialize.
typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*);
typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*);
typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*);
EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
if (display == EGL_NO_DISPLAY)
LOG(ERROR) << "eglGetDisplay() failed";
eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT");
if (!eglQueryDevicesEXT)
{
LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed";
return 0;
}
eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT");
if (!eglQueryDeviceAttribEXT)
{
LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed";
return 0;
}
eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT");
if (!eglGetPlatformDisplayEXT)
{
LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed";
return 0;
}
int num_devices = 0;
eglQueryDevicesEXT(0, 0, &num_devices);
if (!num_devices)
return 0;
EGLDisplay display = 0;
EGLDeviceEXT* devices = (EGLDeviceEXT*)malloc(num_devices * sizeof(void*));
eglQueryDevicesEXT(num_devices, devices, &num_devices);
for (int i=0; i < num_devices; i++)
{
EGLDeviceEXT device = devices[i];
intptr_t value = -1;
if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx)
{
display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0);
break;
}
}
free(devices);
return display;
}
static GLContext createGLContext(int cudaDeviceIdx)
{
EGLDisplay display = 0;
if (cudaDeviceIdx >= 0)
{
char pciBusId[256] = "";
LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
display = getCudaDisplay(cudaDeviceIdx);
if (!display)
LOG(INFO) << "Failed, falling back to default display";
}
if (!display)
{
display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
if (display == EGL_NO_DISPLAY)
LOG(FATAL) << "eglGetDisplay() failed";
}
EGLint major;
EGLint minor;
if (!eglInitialize(display, &major, &minor))
LOG(ERROR) << "eglInitialize() failed";
LOG(FATAL) << "eglInitialize() failed";
// Choose configuration.
......@@ -211,45 +290,32 @@ static GLContext createGLContext(void)
EGLConfig config;
EGLint num_config;
if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config))
LOG(ERROR) << "eglChooseConfig() failed";
// Create dummy pbuffer surface.
const EGLint surface_attribs[] = {
EGL_WIDTH, 1,
EGL_HEIGHT, 1,
EGL_NONE
};
EGLSurface surface = eglCreatePbufferSurface(display, config, surface_attribs);
if (surface == EGL_NO_SURFACE)
LOG(ERROR) << "eglCreatePbufferSurface() failed";
LOG(FATAL) << "eglChooseConfig() failed";
// Create GL context.
if (!eglBindAPI(EGL_OPENGL_API))
LOG(ERROR) << "eglBindAPI() failed";
LOG(FATAL) << "eglBindAPI() failed";
EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL);
if (context == EGL_NO_CONTEXT)
LOG(ERROR) << "eglCreateContext() failed";
LOG(FATAL) << "eglCreateContext() failed";
// Done.
LOG(INFO) << "EGL " << (int)major << "." << (int)minor << " OpenGL context created (disp: 0x"
<< std::hex << std::setfill('0')
<< std::setw(16) << (uintptr_t)display
<< ", surf: 0x" << std::setw(16) << (uintptr_t)surface
<< ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")";
GLContext glctx = {display, surface, context, 0};
GLContext glctx = {display, context, 0};
return glctx;
}
static void destroyGLContext(GLContext& glctx)
{
if (!glctx.context)
LOG(ERROR) << "destroyGLContext() called with null glctx";
LOG(FATAL) << "destroyGLContext() called with null glctx";
// If this is the current context, release it.
if (eglGetCurrentContext() == glctx.context)
......@@ -257,13 +323,10 @@ static void destroyGLContext(GLContext& glctx)
if (!eglDestroyContext(glctx.display, glctx.context))
LOG(ERROR) << "eglDestroyContext() failed";
if (!eglDestroySurface(glctx.display, glctx.surface))
LOG(ERROR) << "eglDestroySurface() failed";
LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x"
<< std::hex << std::setfill('0')
<< std::setw(16) << (uintptr_t)glctx.display
<< ", surf: 0x" << std::setw(16) << (uintptr_t)glctx.surface
<< ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")";
memset(&glctx, 0, sizeof(GLContext));
......
......@@ -76,12 +76,12 @@ static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexS
//------------------------------------------------------------------------
// Shared C++ functions.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s)
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx)
{
// Create GL context and set it current.
s.glctx = createGLContext();
s.glctx = createGLContext(cudaDeviceIdx);
setGLContext(s.glctx);
// Version check.
GLint vMajor = 0;
GLint vMinor = 0;
......@@ -90,7 +90,7 @@ void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s)
glGetError(); // Clear possible GL_INVALID_ENUM error in version query.
LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor;
NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required");
// Number of output buffers.
int num_outputs = s.enableDB ? 2 : 1;
......@@ -319,7 +319,7 @@ void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, in
s.width = ROUND_UP(s.width, 32);
s.height = ROUND_UP(s.height, 32);
LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")";
// Allocate color buffers.
for (int i=0; i < num_outputs; i++)
{
......
......@@ -83,7 +83,7 @@ struct RasterizeGLState
//------------------------------------------------------------------------
// Shared C++ code prototypes.
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s);
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth);
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth);
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
......
......@@ -40,7 +40,8 @@ struct TextureKernelParams
{
const float* tex; // Incoming texture buffer.
const float* uv; // Incoming texcoord buffer.
const float* uvDA; // Incoming uv pixel diffs. NULL if mips disabled.
const float* uvDA; // Incoming uv pixel diffs or NULL.
const float* mipLevelBias; // Incoming mip level bias or NULL.
const float* dy; // Incoming output gradient.
float* mip; // Mip data buffer.
float* out; // Outgoing texture data.
......@@ -48,7 +49,8 @@ struct TextureKernelParams
float* gradTexMip; // Temporary texture gradients for mip levels > 0.
float* gradUV; // Outgoing texcoord gradient.
float* gradUVDA; // Outgoing texcoord pixel differential gradient.
int enableMip; // If true, we have uv_da input and mip output tensor.
float* gradMipLevelBias; // Outgoing mip level bias gradient.
int enableMip; // If true, we have uv_da and/or mip_level_bias input(s), and a mip tensor.
int filterMode; // One of the TEX_MODE_ constants.
int boundaryMode; // One of the TEX_BOUNDARY_MODE_ constants.
int texConst; // If true, texture is known to be constant.
......
......@@ -19,7 +19,7 @@ from . import plugin_loader
def _get_gl_opts():
libs = {
'posix': ['GL', 'GLEW'],
'nt': ['gdi32', 'glew32s', 'opengl32', 'user32'],
'nt': ['gdi32', 'glew32s', 'opengl32', 'user32', 'setgpu'],
}
return ['-l' + x for x in libs[os.name]]
......
......@@ -12,11 +12,11 @@
struct RasterizeFwdOp : public OpKernel
{
RasterizeGLState m_glState; // OpenGL-related persistent state.
int m_tri_const; // 1 if triangle array is known to be constant.
int m_tri_const; // 1 if triangle array is known to be constant.
RasterizeFwdOp(OpKernelConstruction* ctx):
OpKernel(ctx)
{
{
memset(&m_glState, 0, sizeof(RasterizeGLState));
OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB));
OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const));
......@@ -48,7 +48,7 @@ struct RasterizeFwdOp : public OpKernel
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
}
// Get output shape.
......@@ -65,12 +65,16 @@ struct RasterizeFwdOp : public OpKernel
// Init context and GL?
bool initCtx = !m_glState.glFBO;
if (initCtx)
rasterizeInitGLContext(ctx, m_glState); // In common/rasterize.inl
{
const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info();
int cudaDeviceIdx = g ? g->gpu_id : -1;
rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp
}
else
setGLContext(m_glState.glctx); // (Re-)Activate GL context.
// Resize all buffers.
rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.inl
rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.cpp
// Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround.
if (initCtx)
......@@ -79,7 +83,7 @@ struct RasterizeFwdOp : public OpKernel
releaseGLContext();
setGLContext(m_glState.glctx);
}
// Copy input data to GL and render.
const float* posPtr = pos.flat<float>().data();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat<int32_t>().data(); // This is in CPU memory.
......@@ -178,7 +182,7 @@ struct RasterizeGradOp : public OpKernel
p.out = out.flat<float>().data();
p.dy = dy.flat<float>().data();
p.ddb = ENABLE_DB ? ddb.flat<float>().data() : 0;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
......
......@@ -45,9 +45,9 @@ def _get_plugin():
# Linker options.
if os.name == 'posix':
ldflags = ['-lGL', '-lGLEW']
ldflags = ['-lGL', '-lGLEW', '-lEGL']
elif os.name == 'nt':
libs = ['gdi32', 'glew32s', 'opengl32', 'user32']
libs = ['gdi32', 'glew32s', 'opengl32', 'user32', 'setgpu']
ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]
# List of source files.
......@@ -103,9 +103,9 @@ def set_log_level(level):
'''Set log level.
Log levels follow the convention on the C++ side of Torch:
0 = Info,
1 = Warning,
2 = Error,
0 = Info,
1 = Warning,
2 = Error,
3 = Fatal.
The default log level is 1.
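A minimal usage sketch of the log level setting on the PyTorch side (the module alias is illustrative):
import nvdiffrast.torch as dr
dr.set_log_level(2)   # 2 = Error: silence info and warning messages from the C++/CUDA side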
......@@ -121,7 +121,7 @@ def set_log_level(level):
#----------------------------------------------------------------------------
class RasterizeGLContext:
def __init__(self, output_db=True, mode='automatic'):
def __init__(self, output_db=True, mode='automatic', device=None):
'''Create a new OpenGL rasterizer context.
Creating an OpenGL context is a slow operation so you should reuse the same
......@@ -131,7 +131,10 @@ class RasterizeGLContext:
Args:
output_db (bool): Compute and output image-space derivatives of barycentrics.
mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'.
device (Optional): Cuda device on which the context is created. Type can be
`torch.device`, string (e.g., `'cuda:1'`), or int. If not
specified, the context will be created on the currently active Cuda
device.
Returns:
The newly created OpenGL rasterizer context.
'''
......@@ -139,11 +142,16 @@ class RasterizeGLContext:
assert mode in ['automatic', 'manual']
self.output_db = output_db
self.mode = mode
self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic')
if device is None:
cuda_device_idx = torch.cuda.current_device()
else:
with torch.cuda.device(device):
cuda_device_idx = torch.cuda.current_device()
self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)
def set_context(self):
'''Set (activate) OpenGL context in the current CPU thread.
Only available if context was created in manual mode.
Only available if context was created in manual mode.
'''
assert self.mode == 'manual'
self.cpp_wrapper.set_context()
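As a hedged usage sketch of the new device argument (assumes a machine with at least two GPUs; tensor contents are illustrative), the OpenGL context is now tied to one CUDA device and the rasterizer inputs must reside on that same device:
import torch
import nvdiffrast.torch as dr

glctx = dr.RasterizeGLContext(device='cuda:1')   # create the GL context on the second GPU

# Clip-space positions [minibatch, num_vertices, 4] and int32 triangles [num_triangles, 3],
# allocated on the same device as the context.
pos = torch.tensor([[[-0.8, -0.8, 0.0, 1.0],
                     [ 0.8, -0.8, 0.0, 1.0],
                     [ 0.0,  0.8, 0.0, 1.0]]], device='cuda:1')
tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device='cuda:1')

rast, _ = dr.rasterize(glctx, pos, tri, resolution=[256, 256])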
......@@ -316,22 +324,26 @@ def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None):
# Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled.
class _texture_func_mip(torch.autograd.Function):
@staticmethod
def forward(ctx, filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum):
out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
ctx.save_for_backward(tex, uv, uv_da)
def forward(ctx, filter_mode, tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum):
if uv_da is None:
uv_da = torch.tensor([])
if mip_level_bias is None:
mip_level_bias = torch.tensor([])
out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
ctx.save_for_backward(tex, uv, uv_da, mip_level_bias)
ctx.saved_misc = filter_mode, mip, filter_mode_enum, boundary_mode_enum
return out
@staticmethod
def backward(ctx, dy):
tex, uv, uv_da = ctx.saved_variables
tex, uv, uv_da, mip_level_bias = ctx.saved_variables
filter_mode, mip, filter_mode_enum, boundary_mode_enum = ctx.saved_misc
if filter_mode == 'linear-mipmap-linear':
g_tex, g_uv, g_uv_da = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, g_uv_da, None, None, None
g_tex, g_uv, g_uv_da, g_mip_level_bias = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, g_uv_da, g_mip_level_bias, None, None, None
else: # linear-mipmap-nearest
g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, None, None, None, None
g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
return None, g_tex, g_uv, None, None, None, None, None
# Linear and nearest: Mipmaps disabled.
class _texture_func(torch.autograd.Function):
......@@ -354,7 +366,7 @@ class _texture_func(torch.autograd.Function):
return None, g_tex, None, None, None
# Op wrapper.
def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
"""Perform texture sampling.
All input tensors must be contiguous and reside in GPU memory. The output tensor
......@@ -364,22 +376,24 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
tex: Texture tensor with dtype `torch.float32`. For 2D textures, must have shape
[minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures,
must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where
tex_width and tex_height are equal. Note that `boundary_mode` must also be set
tex_width and tex_height are equal. Note that `boundary_mode` must also be set
to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis.
uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture,
uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture,
must have shape [minibatch_size, height, width, 2]. When sampling a cube map
texture, must have shape [minibatch_size, height, width, 3].
uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates.
Must have same shape as `uv` except for the last dimension that is to be twice
as long.
mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted,
determines mip level directly. Must have shape [minibatch_size, height, width].
mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call. If not
specified, the mipmap stack is constructed internally and discarded afterwards.
filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
selects 'linear' if `uv_da` is not specified, and 'linear-mipmap-linear'
when `uv_da` is specified, these being the highest-quality modes possible
depending on the availability of the image-space derivatives of the texture
coordinates.
selects 'linear' if neither `uv_da` nor `mip_level_bias` is specified, and
'linear-mipmap-linear' when at least one of them is specified, these being
the highest-quality modes possible depending on the availability of the
image-space derivatives of the texture coordinates or direct mip level information.
boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a
cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional
part of texture coordinates. Mode 'clamp' clamps texture coordinates to the
......@@ -395,7 +409,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
# Default filter mode.
if filter_mode == 'auto':
filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear'
filter_mode = 'linear-mipmap-linear' if (uv_da is not None or mip_level_bias is not None) else 'linear'
# Sanitize inputs.
if max_mip_level is None:
......@@ -407,7 +421,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
# Check inputs.
assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor)
if 'mipmap' in filter_mode:
assert isinstance(uv_da, torch.Tensor)
assert isinstance(uv_da, torch.Tensor) or isinstance(mip_level_bias, torch.Tensor)
# If mipping disabled via max level=0, we may as well use simpler filtering internally.
if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']:
......@@ -430,10 +444,10 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
# Choose stub.
if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest':
return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
else:
return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum)
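A hedged usage sketch of the new mip_level_bias input (shapes follow the docstring above; values are illustrative). With no uv_da, the bias selects the mip level directly, and filter_mode 'auto' resolves to 'linear-mipmap-linear':
import torch
import nvdiffrast.torch as dr

tex  = torch.rand(1, 256, 256, 3, device='cuda')        # [minibatch, tex_height, tex_width, channels]
uv   = torch.rand(1, 128, 128, 2, device='cuda')        # per-pixel texture coordinates
bias = torch.full((1, 128, 128), 2.0, device='cuda')    # [minibatch, height, width]: sample two levels down

color = dr.texture(tex, uv, mip_level_bias=bias, max_mip_level=4)   # [1, 128, 128, 3]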
# Mipmap precalculation for cases where the texture stays constant.
def texture_construct_mip(tex, max_mip_level=None, cube_mode=False):
"""Construct a mipmap stack for a texture.
......
......@@ -24,6 +24,7 @@ void AntialiasGradKernel (const AntialiasKernelParams p);
TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(tri));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
......@@ -66,6 +67,7 @@ TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(color));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
......@@ -112,10 +114,10 @@ std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torc
p.xh = .5f * (float)p.width;
p.yh = .5f * (float)p.height;
p.allocTriangles = topology_hash.size(0) / (4 * AA_HASH_ELEMENTS_PER_TRIANGLE);
// Allocate output tensors.
torch::Tensor out = color.detach().clone(); // Use color as base.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor work_buffer = torch::empty({p.n * p.width * p.height * 8 + 4}, opts); // 8 int for a maximum of two work items per pixel.
p.output = out.data_ptr<float>();
p.workBuffer = (int4*)(work_buffer.data_ptr<float>());
......@@ -153,6 +155,7 @@ std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torc
std::tuple<torch::Tensor, torch::Tensor> antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(color));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AntialiasKernelParams p = {}; // Initialize all fields to zero.
p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
......
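The pair of lines added at the top of each op above is the standard multi-GPU pattern for PyTorch C++ extensions: a device guard switches the current CUDA device to that of an input tensor, and the kernel is then launched on the stream PyTorch associates with that device. A minimal sketch of the pattern in a hypothetical out-of-tree op (the op and the kernel launch are illustrative):
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

torch::Tensor my_op(torch::Tensor x)
{
    const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); // make x's GPU the current device
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();       // PyTorch's stream for that device
    torch::Tensor y = torch::empty_like(x);                       // allocated on the same device as x
    // ... enqueue the CUDA kernel on `stream`, reading x and writing y ...
    return y;
}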
......@@ -13,9 +13,10 @@
//------------------------------------------------------------------------
// Op prototypes. Return type macros for readability.
#define OP_RETURN_T torch::Tensor
#define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
#define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
#define OP_RETURN_T torch::Tensor
#define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
#define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
#define OP_RETURN_TTTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges);
OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
......@@ -26,11 +27,11 @@ OP_RETURN_TT interpolate_grad (torch::Tensor attr, tor
OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode);
OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode);
OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
OP_RETURN_TTTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri);
OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash);
OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer);
......@@ -39,7 +40,7 @@ OP_RETURN_TT antialias_grad (torch::Tensor color, to
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// State classes.
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool>())
pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
.def("set_context", &RasterizeGLStateWrapper::setContext)
.def("release_context", &RasterizeGLStateWrapper::releaseContext);
pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper");
......@@ -58,8 +59,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op without attribute derivatives");
m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op with attribute derivatives");
m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction");
m.def("texture_fwd", &texture_fwd, "texture forward op with mipmapping and texcoord derivatives");
m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op without mipmapping and texcoord derivatives");
m.def("texture_fwd", &texture_fwd, "texture forward op without mipmapping");
m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op with mipmapping");
m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode");
m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode");
m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode");
......
......@@ -17,7 +17,7 @@
#define __func__ __FUNCTION__
#endif
#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on current GPU device") } while(0)
#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on the same GPU device") } while(0)
#define NVDR_CHECK_CPU(...) do { nvdr_check_cpu({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must reside on CPU"); } while(0)
#define NVDR_CHECK_CONTIGUOUS(...) do { nvdr_check_contiguous({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be contiguous tensors"); } while(0)
#define NVDR_CHECK_F32(...) do { nvdr_check_f32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be float32 tensors"); } while(0)
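A hedged example of how these checks are typically invoked at the top of an op; the tensor names are illustrative:
NVDR_CHECK_DEVICE(tex, uv, uv_da, mip_level_bias);     // all inputs on the same GPU device
NVDR_CHECK_CONTIGUOUS(tex, uv, uv_da, mip_level_bias); // all inputs contiguous
NVDR_CHECK_F32(tex, uv, uv_da, mip_level_bias);        // all inputs float32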
......
......@@ -41,6 +41,7 @@ static void set_diff_attrs(InterpolateKernelParams& p, bool diff_attrs_all, std:
std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(attr));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
InterpolateKernelParams p = {}; // Initialize all fields to zero.
bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
......@@ -86,6 +87,8 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr,
// Set attribute pixel differential info if enabled, otherwise leave as zero.
if (enable_da)
set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
else
p.numDiffAttr = 0;
// Get input pointers.
p.attr = attr.data_ptr<float>();
......@@ -95,7 +98,7 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr,
p.attrBC = (p.instance_mode && attr.size(0) == 1) ? 1 : 0;
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({p.depth, p.height, p.width, p.numAttr}, opts);
torch::Tensor out_da = torch::empty({p.depth, p.height, p.width, p.numDiffAttr * 2}, opts);
......@@ -133,6 +136,7 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd(torch::Tensor attr, tor
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(attr));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
InterpolateKernelParams p = {}; // Initialize all fields to zero.
bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
......@@ -190,6 +194,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torc
// Set attribute pixel differential info if enabled, otherwise leave as zero.
if (enable_da)
set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
else
p.numDiffAttr = 0;
// Get input pointers.
p.attr = attr.data_ptr<float>();
......@@ -201,7 +207,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torc
p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0;
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor gradAttr = torch::zeros_like(attr);
torch::Tensor gradRaster = torch::empty_like(rast);
torch::Tensor gradRasterDB;
......
......@@ -21,13 +21,14 @@ void RasterizeGradKernelDb(const RasterizeGradParams p);
//------------------------------------------------------------------------
// Python GL state wrapper methods.
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_)
RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
{
pState = new RasterizeGLState();
automatic = automatic_;
cudaDeviceIdx = cudaDeviceIdx_;
memset(pState, 0, sizeof(RasterizeGLState));
pState->enableDB = enableDB ? 1 : 0;
rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState);
rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
releaseGLContext();
}
......@@ -52,6 +53,7 @@ void RasterizeGLStateWrapper::releaseContext(void)
std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGLState& s = *stateWrapper.pState;
......@@ -62,6 +64,9 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
NVDR_CHECK_F32(pos);
NVDR_CHECK_I32(tri, ranges);
// Check that GL context was created for the correct GPU.
NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must reside on the same device as input tensors");
// Determine number of outputs
int num_outputs = s.enableDB ? 2 : 1;
......@@ -101,7 +106,7 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth);
// Allocate output tensors.
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
torch::Tensor out = torch::empty({depth, height, width, 4}, opts);
torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts);
float* outputPtr[2];
......@@ -123,6 +128,7 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb)
{
const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
RasterizeGradParams p;
bool enable_db = ddb.defined();
......@@ -178,7 +184,7 @@ torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Ten
p.out = out.data_ptr<float>();
p.dy = dy_.data_ptr<float>();
p.ddb = enable_db ? ddb_.data_ptr<float>() : NULL;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
......@@ -209,7 +215,7 @@ torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Ten
// Version without derivatives.
torch::Tensor rasterize_grad(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy)
{
{
torch::Tensor empty_tensor;
return rasterize_grad_db(pos, tri, out, dy, empty_tensor);
}
......
......@@ -15,7 +15,7 @@ class RasterizeGLState;
class RasterizeGLStateWrapper
{
public:
RasterizeGLStateWrapper (bool enableDB, bool automatic);
RasterizeGLStateWrapper (bool enableDB, bool automatic, int cudaDeviceIdx);
~RasterizeGLStateWrapper (void);
void setContext (void);
......@@ -23,6 +23,7 @@ public:
RasterizeGLState* pState;
bool automatic;
int cudaDeviceIdx;
};
//------------------------------------------------------------------------
......