Commit ce7063f1 authored by Samuli Laine

Support for multiple GPUs, mip bias input for texture op

parent 2468e2a0
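As a quick orientation before the diff, here is a minimal usage sketch of the two additions, assuming the PyTorch bindings (`nvdiffrast.torch`) built from this commit; shapes and device names are illustrative, not from the source:

    import torch
    import nvdiffrast.torch as dr

    # Multiple GPUs: create the OpenGL context on a chosen CUDA device via the new 'device' argument.
    glctx = dr.RasterizeGLContext(output_db=True, device='cuda:1')

    pos = torch.randn(1, 3, 4, device='cuda:1')  # Clip-space vertices, one triangle (illustrative).
    tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device='cuda:1')
    rast, rast_db = dr.rasterize(glctx, pos, tri, resolution=[256, 256])

    # Mip bias: drive mip level selection without uv_da via the new 'mip_level_bias' argument.
    tex = torch.rand(1, 64, 64, 3, device='cuda:1')
    uv = torch.rand(1, 256, 256, 2, device='cuda:1')
    bias = torch.full((1, 256, 256), 1.5, device='cuda:1')  # Per-pixel mip level.
    color = dr.texture(tex, uv, mip_level_bias=bias, filter_mode='linear-mipmap-linear', max_mip_level=4)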
@@ -6,4 +6,4 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
-__version__ = '0.2.0'
+__version__ = '0.2.1'
@@ -185,6 +185,8 @@ template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
 //------------------------------------------------------------------------
 // Coalesced atomics. These are all done via macros.

+#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
+
 #define CA_TEMP _ca_temp
 #define CA_TEMP_PARAM float* CA_TEMP
 #define CA_DECLARE_TEMP(threads_per_block) \
@@ -228,5 +230,24 @@ template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
         caAtomicAdd((ptr)+(idx), (value)); \
     } while(0)

+//------------------------------------------------------------------------
+// Disable atomic coalescing for compute capability lower than 7.x
+
+#else // __CUDA_ARCH__ >= 700
+
+#define CA_TEMP _ca_temp
+#define CA_TEMP_PARAM float CA_TEMP
+#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
+#define CA_SET_GROUP_MASK(group, thread_mask)
+#define CA_SET_GROUP(group)
+#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
+#define caAtomicAdd3_xyw(ptr, x, y, w) \
+    do { \
+        atomicAdd((ptr), (x)); \
+        atomicAdd((ptr)+1, (y)); \
+        atomicAdd((ptr)+3, (w)); \
+    } while(0)
+#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
+
+#endif // __CUDA_ARCH__ >= 700

 //------------------------------------------------------------------------
 #endif // __CUDACC__
@@ -36,6 +36,7 @@ using namespace tensorflow::shape_inference;
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAUtils.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <pybind11/numpy.h>
 #endif

 #define NVDR_CTX_ARGS int _nvdr_ctx_dummy
...
@@ -37,26 +37,43 @@ struct GLContext
 static void setGLContext(GLContext& glctx)
 {
     if (!glctx.hglrc)
-        LOG(ERROR) << "setGLContext() called with null gltcx";
+        LOG(FATAL) << "setGLContext() called with null gltcx";
     if (!wglMakeCurrent(glctx.hdc, glctx.hglrc))
-        LOG(ERROR) << "wglMakeCurrent() failed when setting GL context";
+        LOG(FATAL) << "wglMakeCurrent() failed when setting GL context";
     if (glctx.glewInitialized)
         return;
     GLenum result = glewInit();
     if (result != GLEW_OK)
-        LOG(ERROR) << "glewInit() failed, return value = " << result;
+        LOG(FATAL) << "glewInit() failed, return value = " << result;
     glctx.glewInitialized = 1;
 }

 static void releaseGLContext(void)
 {
     if (!wglMakeCurrent(NULL, NULL))
-        LOG(ERROR) << "wglMakeCurrent() failed when releasing GL context";
+        LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context";
 }

-static GLContext createGLContext(void)
+extern "C" int set_gpu(const char*);
+
+static GLContext createGLContext(int cudaDeviceIdx)
 {
+    if (cudaDeviceIdx >= 0)
+    {
+        char pciBusId[256] = "";
+        LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
+        if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx) != CUDA_SUCCESS)
+        {
+            LOG(INFO) << "PCI bus id query failed";
+        }
+        else
+        {
+            int res = set_gpu(pciBusId);
+            LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? "failed, expect crash or major slowdown" : "success");
+        }
+    }
+
     HINSTANCE hInstance = GetModuleHandle(NULL);
     WNDCLASS wc = {};
     wc.style = CS_OWNDC;
@@ -101,7 +118,7 @@ static GLContext createGLContext(void)
 static void destroyGLContext(GLContext& glctx)
 {
     if (!glctx.hglrc)
-        LOG(ERROR) << "destroyGLContext() called with null gltcx";
+        LOG(FATAL) << "destroyGLContext() called with null gltcx";

     // If this is the current context, release it.
     if (wglGetCurrentContext() == glctx.hglrc)
@@ -109,13 +126,13 @@ static void destroyGLContext(GLContext& glctx)
     HWND hwnd = WindowFromDC(glctx.hdc);
     if (!hwnd)
-        LOG(ERROR) << "WindowFromDC() failed";
+        LOG(FATAL) << "WindowFromDC() failed";
     if (!ReleaseDC(hwnd, glctx.hdc))
-        LOG(ERROR) << "ReleaseDC() failed";
+        LOG(FATAL) << "ReleaseDC() failed";
     if (!wglDeleteContext(glctx.hglrc))
-        LOG(ERROR) << "wglDeleteContext() failed";
+        LOG(FATAL) << "wglDeleteContext() failed";
     if (!DestroyWindow(hwnd))
-        LOG(ERROR) << "DestroyWindow() failed";
+        LOG(FATAL) << "DestroyWindow() failed";

     LOG(INFO) << std::hex << std::setfill('0')
               << "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc
@@ -140,6 +157,7 @@ static void destroyGLContext(GLContext& glctx)
 # include <GL/glew.h> // Use system-supplied glew.h
 #endif
 #include <EGL/egl.h>
+#include <EGL/eglext.h>
 #include <GL/gl.h>
 #include <cuda_gl_interop.h>
@@ -148,7 +166,6 @@ static void destroyGLContext(GLContext& glctx)
 struct GLContext
 {
     EGLDisplay display;
-    EGLSurface surface;
     EGLContext context;
     int glewInitialized;
 };
@@ -158,9 +175,9 @@ struct GLContext
 static void setGLContext(GLContext& glctx)
 {
     if (!glctx.context)
-        LOG(ERROR) << "setGLContext() called with null gltcx";
+        LOG(FATAL) << "setGLContext() called with null gltcx";
-    if (!eglMakeCurrent(glctx.display, glctx.surface, glctx.surface, glctx.context))
+    if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context))
         LOG(ERROR) << "eglMakeCurrent() failed when setting GL context";
     if (glctx.glewInitialized)
@@ -168,7 +185,7 @@ static void setGLContext(GLContext& glctx)
     GLenum result = glewInit();
     if (result != GLEW_OK)
-        LOG(ERROR) << "glewInit() failed, return value = " << result;
+        LOG(FATAL) << "glewInit() failed, return value = " << result;
     glctx.glewInitialized = 1;
 }
@@ -178,21 +195,83 @@ static void releaseGLContext(void)
     if (display == EGL_NO_DISPLAY)
         LOG(WARNING) << "releaseGLContext() called with no active display";
     if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT))
-        LOG(ERROR) << "eglMakeCurrent() failed when releasing GL context";
+        LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context";
 }

-static GLContext createGLContext(void)
+static EGLDisplay getCudaDisplay(int cudaDeviceIdx)
 {
-    // Initialize.
+    typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*);
+    typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*);
+    typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*);

-    EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
+    eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT");
+    if (!eglQueryDevicesEXT)
+    {
+        LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed";
+        return 0;
+    }
+
+    eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT");
+    if (!eglQueryDeviceAttribEXT)
+    {
+        LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed";
+        return 0;
+    }
+
+    eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT");
+    if (!eglGetPlatformDisplayEXT)
+    {
+        LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed";
+        return 0;
+    }
+
+    int num_devices = 0;
+    eglQueryDevicesEXT(0, 0, &num_devices);
+    if (!num_devices)
+        return 0;
+
+    EGLDisplay display = 0;
+    EGLDeviceEXT* devices = (EGLDeviceEXT*)malloc(num_devices * sizeof(void*));
+    eglQueryDevicesEXT(num_devices, devices, &num_devices);
+    for (int i=0; i < num_devices; i++)
+    {
+        EGLDeviceEXT device = devices[i];
+        intptr_t value = -1;
+        if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx)
+        {
+            display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0);
+            break;
+        }
+    }
+
+    free(devices);
+    return display;
+}
+
+static GLContext createGLContext(int cudaDeviceIdx)
+{
+    EGLDisplay display = 0;
+
+    if (cudaDeviceIdx >= 0)
+    {
+        char pciBusId[256] = "";
+        LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
+        display = getCudaDisplay(cudaDeviceIdx);
+        if (!display)
+            LOG(INFO) << "Failed, falling back to default display";
+    }
+
+    if (!display)
+    {
+        display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
         if (display == EGL_NO_DISPLAY)
-            LOG(ERROR) << "eglGetDisplay() failed";
+            LOG(FATAL) << "eglGetDisplay() failed";
+    }

     EGLint major;
     EGLint minor;
     if (!eglInitialize(display, &major, &minor))
-        LOG(ERROR) << "eglInitialize() failed";
+        LOG(FATAL) << "eglInitialize() failed";

     // Choose configuration.
@@ -211,45 +290,32 @@ static GLContext createGLContext(void)
     EGLConfig config;
     EGLint num_config;
     if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config))
-        LOG(ERROR) << "eglChooseConfig() failed";
+        LOG(FATAL) << "eglChooseConfig() failed";

-    // Create dummy pbuffer surface.
-    const EGLint surface_attribs[] = {
-        EGL_WIDTH, 1,
-        EGL_HEIGHT, 1,
-        EGL_NONE
-    };
-    EGLSurface surface = eglCreatePbufferSurface(display, config, surface_attribs);
-    if (surface == EGL_NO_SURFACE)
-        LOG(ERROR) << "eglCreatePbufferSurface() failed";

     // Create GL context.
     if (!eglBindAPI(EGL_OPENGL_API))
-        LOG(ERROR) << "eglBindAPI() failed";
+        LOG(FATAL) << "eglBindAPI() failed";
     EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL);
     if (context == EGL_NO_CONTEXT)
-        LOG(ERROR) << "eglCreateContext() failed";
+        LOG(FATAL) << "eglCreateContext() failed";

     // Done.
     LOG(INFO) << "EGL " << (int)minor << "." << (int)major << " OpenGL context created (disp: 0x"
               << std::hex << std::setfill('0')
               << std::setw(16) << (uintptr_t)display
-              << ", surf: 0x" << std::setw(16) << (uintptr_t)surface
               << ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")";

-    GLContext glctx = {display, surface, context, 0};
+    GLContext glctx = {display, context, 0};
     return glctx;
 }

 static void destroyGLContext(GLContext& glctx)
 {
     if (!glctx.context)
-        LOG(ERROR) << "destroyGLContext() called with null gltcx";
+        LOG(FATAL) << "destroyGLContext() called with null gltcx";

     // If this is the current context, release it.
     if (eglGetCurrentContext() == glctx.context)
@@ -257,13 +323,10 @@ static void destroyGLContext(GLContext& glctx)
     if (!eglDestroyContext(glctx.display, glctx.context))
         LOG(ERROR) << "eglDestroyContext() failed";
-    if (!eglDestroySurface(glctx.display, glctx.surface))
-        LOG(ERROR) << "eglDestroySurface() failed";

     LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x"
               << std::hex << std::setfill('0')
               << std::setw(16) << (uintptr_t)glctx.display
-              << ", surf: 0x" << std::setw(16) << (uintptr_t)glctx.surface
               << ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")";

     memset(&glctx, 0, sizeof(GLContext));
...
@@ -76,10 +76,10 @@ static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexS
 //------------------------------------------------------------------------
 // Shared C++ functions.

-void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s)
+void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx)
 {
     // Create GL context and set it current.
-    s.glctx = createGLContext();
+    s.glctx = createGLContext(cudaDeviceIdx);
     setGLContext(s.glctx);

     // Version check.
...
@@ -83,7 +83,7 @@ struct RasterizeGLState
 //------------------------------------------------------------------------
 // Shared C++ code prototypes.

-void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s);
+void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);
 void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, int posCount, int triCount, int width, int height, int depth);
 void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth);
 void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
...
@@ -448,13 +448,16 @@ static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelP
 //------------------------------------------------------------------------
 // Mip level calculation.

-template <bool CUBE_MODE, int FILTER_MODE>
+template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
 static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv)
 {
     // Do nothing if mips not in use.
     if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR)
         return;

+    // Determine mip level based on UV pixel derivatives. If no derivatives are given (mip level bias only), leave as zero.
+    if (!BIAS_ONLY)
+    {
         // Get pixel derivatives of texture coordinates.
         float4 uvDA;
         float3 dvdX, dvdY; // Gradients use these later.
@@ -531,25 +534,24 @@ static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level
             }
         }

-    // Calculate true mip level and clamp.
+        // Finally, calculate mip level.
         flevel = .5f * __log2f(lenMajorSqr);
+    }

+    // Bias the mip level and clamp.
+    if (p.mipLevelBias)
+        flevel += p.mipLevelBias[pidx];
     flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax);

-    if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
-    {
-        // Linear-mipmap-nearest.
-        level0 = __float2int_rn(flevel);
-    }
-    else
-    {
-        // Linear-mipmap-linear.
-        if (flevel > 0.f) // Leave everything at zero if flevel == 0 (magnification)
-        {
+    // Calculate levels depending on filter mode.
     level0 = __float2int_rd(flevel);
+
+    // Leave everything else at zero if flevel == 0 (magnification) or when in linear-mipmap-nearest mode.
+    if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR && flevel > 0.f)
+    {
         level1 = min(level0 + 1, p.mipLevelMax);
         flevel -= level0; // Fractional part. Zero if clamped on last level.
     }
-    }
 }

 //------------------------------------------------------------------------
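Read as scalar math, the updated level selection is: `flevel` from the log2 of the UV-derivative footprint (or zero in bias-only mode), plus the new per-pixel bias, clamped to the valid range, then split into a discrete level and a fraction. A hedged Python sketch of that control flow; function and argument names are illustrative, not from the source:

    import math

    def mip_levels(len_major_sqr, bias, mip_level_max, trilinear):
        # Scalar mirror of the updated calculateMipLevel() logic (sketch).
        flevel = 0.5 * math.log2(len_major_sqr) if len_major_sqr > 0.0 else 0.0  # Zero in bias-only mode.
        flevel += bias                                        # New: additive per-pixel mip level bias.
        flevel = min(max(flevel, 0.0), float(mip_level_max))  # Clamp to [0, mipLevelMax].
        level0 = int(math.floor(flevel))                      # __float2int_rd, now used by both mipmap modes.
        level1, frac = 0, 0.0
        if trilinear and flevel > 0.0:                        # Only linear-mipmap-linear blends two levels.
            level1 = min(level0 + 1, mip_level_max)
            frac = flevel - level0                            # Fractional part; zero if clamped on last level.
        return level0, level1, frac

    # E.g., bias-only mode with bias 1.75 and 4 mip levels:
    # mip_levels(0.0, 1.75, 4, trilinear=True) -> (1, 2, 0.75)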
@@ -672,7 +674,7 @@ __global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTem
 //------------------------------------------------------------------------
 // Forward kernel.

-template <class T, int C, bool CUBE_MODE, int FILTER_MODE>
+template <class T, int C, bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
 static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p)
 {
     // Calculate pixel position.
@@ -714,7 +716,7 @@ static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKer
     float flevel = 0.f; // Fractional level.
     int level0 = 0; // Discrete level 0.
     int level1 = 0; // Discrete level 1.
-    calculateMipLevel<CUBE_MODE, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, 0, 0);
+    calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, 0, 0);

     // Get texel indices and pointer for level 0.
     int4 tc0 = make_int4(0, 0, 0, 0);
@@ -766,30 +768,42 @@ static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKer
 }

 // Template specializations.
-__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }

 //------------------------------------------------------------------------
 // Gradient mip puller kernel.
@@ -856,7 +870,7 @@ __global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTempl
 //------------------------------------------------------------------------
 // Gradient kernel.

-template <bool CUBE_MODE, int FILTER_MODE>
+template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
 static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p)
 {
     // Temporary space for coalesced atomics.
@@ -898,18 +912,28 @@ static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKe
         if (FILTER_MODE != TEX_MODE_NEAREST)
             ((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f);
         if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
+        {
+            if (p.gradUVDA)
             {
                 ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f);
                 ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f);
                 ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f);
             }
+            if (p.gradMipLevelBias)
+                p.gradMipLevelBias[pidx] = 0.f;
+        }
     }
     else
     {
         if (FILTER_MODE != TEX_MODE_NEAREST)
             ((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f);
         if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
+        {
+            if (p.gradUVDA)
                 ((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
+            if (p.gradMipLevelBias)
+                p.gradMipLevelBias[pidx] = 0.f;
+        }
     }
     return;
 }
@@ -944,7 +968,7 @@ static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKe
     float flevel = 0.f; // Fractional level.
     int level0 = 0; // Discrete level 0.
     int level1 = 0; // Discrete level 1.
-    calculateMipLevel<CUBE_MODE, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, &dw, &dfdv);
+    calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, &dw, &dfdv);

     // UV gradient accumulators.
     float gu = 0.f;
@@ -1058,7 +1082,14 @@ static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKe
     else
         ((float2*)p.gradUV)[pidx] = make_float2(gu, gv);

-    // Final UV pixel differential gradients.
+    // Store mip level bias gradient.
+    if (p.gradMipLevelBias)
+        p.gradMipLevelBias[pidx] = df;
+
+    // Store UV pixel differential gradients.
+    if (!BIAS_ONLY)
+    {
+        // Final gradients.
         dw *= df; // dL/(d{s,y}/d{X,Y}) = df/(d{s,y}/d{X,Y}) * dL/df.

         // Store them.
@@ -1073,16 +1104,21 @@ static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKe
         }
         else
             ((float4*)p.gradUVDA)[pidx] = dw;
+    }
 }
 // Template specializations.
-__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
-__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_NEAREST>(p); }
+__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_NEAREST>(p); }
-__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR>(p); }
+__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR>(p); }
-__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
-__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate<false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate<false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate<true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate<true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }

 //------------------------------------------------------------------------
@@ -40,7 +40,8 @@ struct TextureKernelParams
 {
     const float* tex; // Incoming texture buffer.
     const float* uv; // Incoming texcoord buffer.
-    const float* uvDA; // Incoming uv pixel diffs. NULL if mips disabled.
+    const float* uvDA; // Incoming uv pixel diffs or NULL.
+    const float* mipLevelBias; // Incoming mip level bias or NULL.
     const float* dy; // Incoming output gradient.
     float* mip; // Mip data buffer.
     float* out; // Outgoing texture data.
@@ -48,7 +49,8 @@ struct TextureKernelParams
     float* gradTexMip; // Temporary texture gradients for mip levels > 0.
     float* gradUV; // Outgoing texcoord gradient.
     float* gradUVDA; // Outgoing texcoord pixel differential gradient.
-    int enableMip; // If true, we have uv_da input and mip output tensor.
+    float* gradMipLevelBias; // Outgoing mip level bias gradient.
+    int enableMip; // If true, we have uv_da and/or mip_level_bias input(s), and a mip tensor.
     int filterMode; // One of the TEX_MODE_ constants.
     int boundaryMode; // One of the TEX_BOUNDARY_MODE_ contants.
     int texConst; // If true, texture is known to be constant.
...
@@ -19,7 +19,7 @@ from . import plugin_loader
 def _get_gl_opts():
     libs = {
         'posix': ['GL', 'GLEW'],
-        'nt': ['gdi32', 'glew32s', 'opengl32', 'user32'],
+        'nt': ['gdi32', 'glew32s', 'opengl32', 'user32', 'setgpu'],
     }
     return ['-l' + x for x in libs[os.name]]
...
@@ -65,12 +65,16 @@ struct RasterizeFwdOp : public OpKernel
     // Init context and GL?
     bool initCtx = !m_glState.glFBO;
     if (initCtx)
-        rasterizeInitGLContext(ctx, m_glState); // In common/rasterize.inl
+    {
+        const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info();
+        int cudaDeviceIdx = g ? g->gpu_id : -1;
+        rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp
+    }
     else
         setGLContext(m_glState.glctx); // (Re-)Activate GL context.

     // Resize all buffers.
-    rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.inl
+    rasterizeResizeBuffers(ctx, m_glState, posCount, triCount, width, height, depth); // In common/rasterize.cpp
     // Newly created GL objects sometimes don't map properly to CUDA until after first context swap. Workaround.
     if (initCtx)
...
@@ -45,9 +45,9 @@ def _get_plugin():
     # Linker options.
     if os.name == 'posix':
-        ldflags = ['-lGL', '-lGLEW']
+        ldflags = ['-lGL', '-lGLEW', '-lEGL']
     elif os.name == 'nt':
-        libs = ['gdi32', 'glew32s', 'opengl32', 'user32']
+        libs = ['gdi32', 'glew32s', 'opengl32', 'user32', 'setgpu']
         ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs]

     # List of source files.
@@ -121,7 +121,7 @@ def set_log_level(level):
 #----------------------------------------------------------------------------

 class RasterizeGLContext:
-    def __init__(self, output_db=True, mode='automatic'):
+    def __init__(self, output_db=True, mode='automatic', device=None):
         '''Create a new OpenGL rasterizer context.

         Creating an OpenGL context is a slow operation so you should reuse the same
@@ -131,7 +131,10 @@ class RasterizeGLContext:
         Args:
           output_db (bool): Compute and output image-space derivates of barycentrics.
           mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'.
+          device (Optional): Cuda device on which the context is created. Type can be
+                             `torch.device`, string (e.g., `'cuda:1'`), or int. If not
+                             specified, context will be created on currently active Cuda
+                             device.

         Returns:
           The newly created OpenGL rasterizer context.
         '''
@@ -139,7 +142,12 @@ class RasterizeGLContext:
         assert mode in ['automatic', 'manual']
         self.output_db = output_db
         self.mode = mode
-        self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic')
+        if device is None:
+            cuda_device_idx = torch.cuda.current_device()
+        else:
+            with torch.cuda.device(device):
+                cuda_device_idx = torch.cuda.current_device()
+        self.cpp_wrapper = _get_plugin().RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx)

     def set_context(self):
         '''Set (activate) OpenGL context in the current CPU thread.
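The `device` handling above normalizes any accepted device spec to a plain CUDA index by briefly entering that device's scope; a sketch of the same idiom, with a hypothetical helper name:

    import torch

    def _cuda_device_index(device=None):
        # Hypothetical helper mirroring the normalization above: torch.device,
        # 'cuda:N', or int N all resolve to the integer index N; None resolves
        # to the currently active CUDA device.
        if device is None:
            return torch.cuda.current_device()
        with torch.cuda.device(device):
            return torch.cuda.current_device()

    # On a multi-GPU machine, these are all equal:
    # _cuda_device_index(torch.device('cuda:1')) == _cuda_device_index('cuda:1') == _cuda_device_index(1) == 1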
@@ -316,22 +324,26 @@ def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None):
 # Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled.
 class _texture_func_mip(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum):
-        out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
-        ctx.save_for_backward(tex, uv, uv_da)
+    def forward(ctx, filter_mode, tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum):
+        if uv_da is None:
+            uv_da = torch.tensor([])
+        if mip_level_bias is None:
+            mip_level_bias = torch.tensor([])
+        out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
+        ctx.save_for_backward(tex, uv, uv_da, mip_level_bias)
         ctx.saved_misc = filter_mode, mip, filter_mode_enum, boundary_mode_enum
         return out

     @staticmethod
     def backward(ctx, dy):
-        tex, uv, uv_da = ctx.saved_variables
+        tex, uv, uv_da, mip_level_bias = ctx.saved_variables
         filter_mode, mip, filter_mode_enum, boundary_mode_enum = ctx.saved_misc
         if filter_mode == 'linear-mipmap-linear':
-            g_tex, g_uv, g_uv_da = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
-            return None, g_tex, g_uv, g_uv_da, None, None, None
+            g_tex, g_uv, g_uv_da, g_mip_level_bias = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
+            return None, g_tex, g_uv, g_uv_da, g_mip_level_bias, None, None, None
         else: # linear-mipmap-nearest
-            g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum)
-            return None, g_tex, g_uv, None, None, None, None
+            g_tex, g_uv = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
+            return None, g_tex, g_uv, None, None, None, None, None

 # Linear and nearest: Mipmaps disabled.
 class _texture_func(torch.autograd.Function):
@@ -354,7 +366,7 @@ class _texture_func(torch.autograd.Function):
         return None, g_tex, None, None, None

 # Op wrapper.
-def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
+def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None):
     """Perform texture sampling.

     All input tensors must be contiguous and reside in GPU memory. The output tensor
@@ -372,14 +384,16 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
         uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates.
                Must have same shape as `uv` except for the last dimension that is to be twice
               as long.
+        mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted,
+                        determines mip level directly. Must have shape [minibatch_size, height, width].
         mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call. If not
              specified, the mipmap stack is constructed internally and discarded afterwards.
         filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest',
                      'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
-                     selects 'linear' if `uv_da` is not specified, and 'linear-mipmap-linear'
-                     when `uv_da` is specified, these being the highest-quality modes possible
-                     depending on the availability of the image-space derivatives of the texture
-                     coordinates.
+                     selects 'linear' if neither `uv_da` or `mip_level_bias` is specified, and
+                     'linear-mipmap-linear' when at least one of them is specified, these being
+                     the highest-quality modes possible depending on the availability of the
+                     image-space derivatives of the texture coordinates or direct mip level information.
         boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a
                        cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional
                        part of texture coordinates. Mode 'clamp' clamps texture coordinates to the
@@ -395,7 +409,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
     # Default filter mode.
     if filter_mode == 'auto':
-        filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear'
+        filter_mode = 'linear-mipmap-linear' if (uv_da is not None or mip_level_bias is not None) else 'linear'

     # Sanitize inputs.
     if max_mip_level is None:
@@ -407,7 +421,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
     # Check inputs.
     assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor)
     if 'mipmap' in filter_mode:
-        assert isinstance(uv_da, torch.Tensor)
+        assert isinstance(uv_da, torch.Tensor) or isinstance(mip_level_bias, torch.Tensor)

     # If mipping disabled via max level=0, we may as well use simpler filtering internally.
     if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']:
@@ -430,7 +444,7 @@ def texture(tex, uv, uv_da=None, mip=None, filter_mode='auto', boundary_mode='wr
     # Choose stub.
     if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest':
-        return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip, filter_mode_enum, boundary_mode_enum)
+        return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip_level_bias, mip, filter_mode_enum, boundary_mode_enum)
     else:
         return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum)
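A hedged usage sketch of the new `mip_level_bias` path, with tensors shaped per the docstring above (all values illustrative); the bias works alone, or combined with `uv_da`, in which case it is added on top of the derivative-based level:

    import torch
    import nvdiffrast.torch as dr

    tex = torch.rand(1, 128, 128, 3, device='cuda')
    uv = torch.rand(1, 256, 256, 2, device='cuda')
    uv_da = torch.zeros(1, 256, 256, 4, device='cuda')
    bias = torch.zeros(1, 256, 256, device='cuda')

    # Bias-only: filter_mode='auto' now resolves to 'linear-mipmap-linear'.
    color = dr.texture(tex, uv, mip_level_bias=bias, max_mip_level=5)

    # Combined: the bias is added on top of the derivative-based mip level.
    color2 = dr.texture(tex, uv, uv_da=uv_da, mip_level_bias=bias, max_mip_level=5)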
...
@@ -24,6 +24,7 @@ void AntialiasGradKernel (const AntialiasKernelParams p);
 TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(tri));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();

     AntialiasKernelParams p = {}; // Initialize all fields to zero.
@@ -66,6 +67,7 @@ TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri)
 std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(color));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();

     AntialiasKernelParams p = {}; // Initialize all fields to zero.
     p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
@@ -153,6 +155,7 @@ std::tuple<torch::Tensor, torch::Tensor> antialias_fwd(torch::Tensor color, torc
 std::tuple<torch::Tensor, torch::Tensor> antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(color));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();

     AntialiasKernelParams p = {}; // Initialize all fields to zero.
     p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0;
...
...@@ -16,6 +16,7 @@
 #define OP_RETURN_T torch::Tensor
 #define OP_RETURN_TT std::tuple<torch::Tensor, torch::Tensor>
 #define OP_RETURN_TTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
+#define OP_RETURN_TTTT std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
 OP_RETURN_TT rasterize_fwd (RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges);
 OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy);
...@@ -26,11 +27,11 @@ OP_RETURN_TT interpolate_grad (torch::Tensor attr, tor
 OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec);
 TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode);
 OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode);
-OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
+OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
 OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
 OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode);
-OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
+OP_RETURN_TT texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
-OP_RETURN_TTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode);
+OP_RETURN_TTTT texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode);
 TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri);
 OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash);
 OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer);
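These declarations thread the new `mip_level_bias` tensor through every mip-aware texture op; an undefined tensor stands for "not supplied". A hedged caller-side sketch; the shapes are illustrative and the filter/boundary constants (3 for linear-mipmap-linear, 1 for wrap) are assumptions, not taken from this diff:

    #include <torch/torch.h>

    void texture_with_bias_example()
    {
        torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
        torch::Tensor tex  = torch::rand({1, 256, 256, 4}, opts); // [minibatch, H, W, channels]
        torch::Tensor uv   = torch::rand({1, 128, 128, 2}, opts);
        torch::Tensor bias = torch::full({1, 128, 128}, 0.5f, opts); // per-pixel mip level bias
        torch::Tensor none;                                          // undefined == not supplied
        TextureMipWrapper mip = texture_construct_mip(tex, /*max_mip_level=*/-1, /*cube_mode=*/false);
        // Bias-only mip selection: uv_da left undefined, mip_level_bias given.
        torch::Tensor out = texture_fwd_mip(tex, uv, /*uv_da=*/none, /*mip_level_bias=*/bias,
                                            mip, /*filter_mode=*/3, /*boundary_mode=*/1);
    }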
...@@ -39,7 +40,7 @@ OP_RETURN_TT antialias_grad (torch::Tensor color, to
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     // State classes.
-    pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool>())
+    pybind11::class_<RasterizeGLStateWrapper>(m, "RasterizeGLStateWrapper").def(pybind11::init<bool, bool, int>())
         .def("set_context", &RasterizeGLStateWrapper::setContext)
         .def("release_context", &RasterizeGLStateWrapper::releaseContext);
     pybind11::class_<TextureMipWrapper>(m, "TextureMipWrapper");
...@@ -58,8 +59,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op with attribute derivatives");
     m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op without attribute derivatives");
     m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction");
-    m.def("texture_fwd", &texture_fwd, "texture forward op with mipmapping and texcoord derivatives");
+    m.def("texture_fwd", &texture_fwd, "texture forward op without mipmapping");
-    m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op without mipmapping and texcoord derivatives");
+    m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op with mipmapping");
     m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode");
     m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode");
     m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode");
...
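The wrapper constructor now takes a CUDA device index through `pybind11::init<bool, bool, int>()`. A self-contained sketch of the same binding pattern; `DummyState` is a hypothetical stand-in for the real RasterizeGLStateWrapper bound above:

    #include <pybind11/pybind11.h>

    struct DummyState
    {
        DummyState(bool enableDB, bool automatic, int cudaDeviceIdx)
            : enableDB(enableDB), automatic(automatic), cudaDeviceIdx(cudaDeviceIdx) {}
        bool enableDB;
        bool automatic;
        int  cudaDeviceIdx; // which GPU this state object is pinned to
    };

    PYBIND11_MODULE(dummy_module, m)
    {
        // Binding the three-argument constructor exposes the device index
        // to Python, matching the init<bool, bool, int> change above.
        pybind11::class_<DummyState>(m, "DummyState")
            .def(pybind11::init<bool, bool, int>());
    }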
...@@ -17,7 +17,7 @@
 #define __func__ __FUNCTION__
 #endif
-#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on current GPU device") } while(0)
+#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on the same GPU device") } while(0)
 #define NVDR_CHECK_CPU(...) do { nvdr_check_cpu({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must reside on CPU"); } while(0)
 #define NVDR_CHECK_CONTIGUOUS(...) do { nvdr_check_contiguous({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be contiguous tensors"); } while(0)
 #define NVDR_CHECK_F32(...) do { nvdr_check_f32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be float32 tensors"); } while(0)
...
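All of these check macros wrap their body in `do { ... } while(0)` so the expansion behaves as a single statement. A small standalone illustration of why that matters; `CHECK` is a hypothetical stand-in for the NVDR_CHECK_* family:

    #include <cstdio>
    #include <cstdlib>

    // The do/while(0) wrapper makes the macro one statement, so it composes
    // safely with braceless if/else.
    #define CHECK(cond, msg) do { if (!(cond)) { std::fprintf(stderr, "%s\n", msg); std::exit(1); } } while(0)

    int main()
    {
        bool on_same_device = true;
        if (on_same_device)
            CHECK(on_same_device, "inputs must reside on the same GPU device");
        else
            std::printf("skipped\n"); // without the idiom, this else would bind to the macro's if or fail to parse
        return 0;
    }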
...@@ -41,6 +41,7 @@ static void set_diff_attrs(InterpolateKernelParams& p, bool diff_attrs_all, std:
 std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(attr));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     InterpolateKernelParams p = {}; // Initialize all fields to zero.
     bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
...@@ -86,6 +87,8 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd_da(torch::Tensor attr,
     // Set attribute pixel differential info if enabled, otherwise leave as zero.
     if (enable_da)
         set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
+    else
+        p.numDiffAttr = 0;
     // Get input pointers.
     p.attr = attr.data_ptr<float>();
...@@ -133,6 +136,7 @@ std::tuple<torch::Tensor, torch::Tensor> interpolate_fwd(torch::Tensor attr, tor
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector<int>& diff_attrs_vec)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(attr));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     InterpolateKernelParams p = {}; // Initialize all fields to zero.
     bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty());
...@@ -190,6 +194,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> interpolate_grad_da(torc
     // Set attribute pixel differential info if enabled, otherwise leave as zero.
     if (enable_da)
         set_diff_attrs(p, diff_attrs_all, diff_attrs_vec);
+    else
+        p.numDiffAttr = 0;
     // Get input pointers.
     p.attr = attr.data_ptr<float>();
...
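Attribute differentials are enabled only when a `rast_db` tensor exists and at least one attribute is selected. A compact sketch of that gating logic; the `Params` struct and the body of the enabled branch are assumptions standing in for InterpolateKernelParams and set_diff_attrs, while the enable expression and the zero fallback come from the diff:

    #include <vector>

    struct Params
    {
        int numDiffAttr = 0; // 0 == attribute differentials disabled
    };

    void configure(Params& p, bool has_rast_db, bool diff_attrs_all, const std::vector<int>& diff_attrs_vec)
    {
        // Differentials need both the rast_db input and a nonempty selection.
        bool enable_da = has_rast_db && (diff_attrs_all || !diff_attrs_vec.empty());
        if (enable_da)
            p.numDiffAttr = (int)diff_attrs_vec.size(); // hypothetical stand-in for set_diff_attrs()
        else
            p.numDiffAttr = 0; // redundant after zero-init, but makes the intent explicit
    }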
...@@ -21,13 +21,14 @@ void RasterizeGradKernelDb(const RasterizeGradParams p);
 //------------------------------------------------------------------------
 // Python GL state wrapper methods.
-RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_)
+RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_)
 {
     pState = new RasterizeGLState();
     automatic = automatic_;
+    cudaDeviceIdx = cudaDeviceIdx_;
     memset(pState, 0, sizeof(RasterizeGLState));
     pState->enableDB = enableDB ? 1 : 0;
-    rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState);
+    rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_);
     releaseGLContext();
 }
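Each GL state is now pinned to one CUDA device at construction time. A hedged sketch of the usage this implies, creating one context per visible GPU; the loop and ownership scheme are illustrative, only the constructor signature comes from this commit:

    #include <cuda_runtime.h>
    #include <vector>

    std::vector<RasterizeGLStateWrapper*> create_contexts(bool enableDB)
    {
        int device_count = 0;
        cudaGetDeviceCount(&device_count);
        std::vector<RasterizeGLStateWrapper*> contexts;
        // One GL rasterizer context per GPU; index selects the device the
        // context is created for.
        for (int idx = 0; idx < device_count; ++idx)
            contexts.push_back(new RasterizeGLStateWrapper(enableDB, /*automatic=*/true, idx));
        return contexts;
    }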
...@@ -52,6 +53,7 @@ void RasterizeGLStateWrapper::releaseContext(void)
 std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple<int, int> resolution, torch::Tensor ranges)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     RasterizeGLState& s = *stateWrapper.pState;
...@@ -62,6 +64,9 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
     NVDR_CHECK_F32(pos);
     NVDR_CHECK_I32(tri, ranges);
+    // Check that GL context was created for the correct GPU.
+    NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must reside on the same device as input tensors");
     // Determine number of outputs
     int num_outputs = s.enableDB ? 2 : 1;
...@@ -123,6 +128,7 @@ std::tuple<torch::Tensor, torch::Tensor> rasterize_fwd(RasterizeGLStateWrapper&
 torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(pos));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     RasterizeGradParams p;
     bool enable_db = ddb.defined();
...
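Because rasterize_fwd now rejects tensors living on a different GPU than its context, multi-GPU callers move inputs first. A minimal sketch; `ctx_device_idx` stands for the wrapper's cudaDeviceIdx member:

    #include <torch/torch.h>

    torch::Tensor to_context_device(torch::Tensor pos, int ctx_device_idx)
    {
        torch::Device target(torch::kCUDA, ctx_device_idx);
        // .to() is a no-op when the tensor is already on the target device,
        // so this is cheap in the common single-GPU case.
        return pos.to(target);
    }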
...@@ -42,6 +42,18 @@ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p);
 void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p);
 void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p);
 void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p);
+void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p);
+void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p);
 void MipGradKernel1 (const TextureKernelParams p);
 void MipGradKernel2 (const TextureKernelParams p);
 void MipGradKernel4 (const TextureKernelParams p);
...@@ -53,6 +65,10 @@ void TextureGradKernelCubeNearest (const TextureKernelParams p);
 void TextureGradKernelCubeLinear (const TextureKernelParams p);
 void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p);
 void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p);
+void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p);
+void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p);
+void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p);
+void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p);
 //------------------------------------------------------------------------
 // Modeselektor.
...@@ -81,6 +97,7 @@ static void set_modes(TextureKernelParams& p, int filter_mode, int boundary_mode
 TextureMipWrapper texture_construct_mip(torch::Tensor tex, int max_mip_level, bool cube_mode)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(tex));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     TextureKernelParams p = {}; // Initialize all fields to zero.
     p.mipLevelLimit = max_mip_level;
...@@ -151,31 +168,46 @@ TextureMipWrapper texture_construct_mip(torch::Tensor tex, int max_mip_level, bo
 //------------------------------------------------------------------------
 // Forward op.
-torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, TextureMipWrapper mip_wrap, int filter_mode, int boundary_mode)
+torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrap, int filter_mode, int boundary_mode)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(tex));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     TextureKernelParams p = {}; // Initialize all fields to zero.
     torch::Tensor& mip = mip_wrap.mip; // Unwrap.
     int max_mip_level = mip_wrap.max_mip_level;
     set_modes(p, filter_mode, boundary_mode, max_mip_level);
+    // See if we have these tensors or not.
+    bool has_uv_da = uv_da.defined() && uv_da.nbytes();
+    bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes();
     if (p.enableMip)
     {
-        NVDR_CHECK(uv_da.defined(), "mipmapping filter mode requires uv_da input");
+        NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input");
         NVDR_CHECK(mip.defined(), "mipmapping filter mode requires mip tensor input");
     }
     // Check inputs.
+    NVDR_CHECK_DEVICE(tex, uv);
+    NVDR_CHECK_CONTIGUOUS(tex, uv);
+    NVDR_CHECK_F32(tex, uv);
     if (p.enableMip)
     {
-        NVDR_CHECK_DEVICE(tex, uv, uv_da, mip);
-        NVDR_CHECK_CONTIGUOUS(tex, uv, uv_da, mip);
-        NVDR_CHECK_F32(tex, uv, uv_da, mip);
+        NVDR_CHECK_DEVICE(mip);
+        NVDR_CHECK_CONTIGUOUS(mip);
+        NVDR_CHECK_F32(mip);
+        if (has_uv_da)
+        {
+            NVDR_CHECK_DEVICE(uv_da);
+            NVDR_CHECK_CONTIGUOUS(uv_da);
+            NVDR_CHECK_F32(uv_da);
+        }
-    }
-    else
-    {
-        NVDR_CHECK_DEVICE(tex, uv);
-        NVDR_CHECK_CONTIGUOUS(tex, uv);
-        NVDR_CHECK_F32(tex, uv);
+        if (has_mip_level_bias)
+        {
+            NVDR_CHECK_DEVICE(mip_level_bias);
+            NVDR_CHECK_CONTIGUOUS(mip_level_bias);
+            NVDR_CHECK_F32(mip_level_bias);
+        }
     }
     // Sanity checks and state setters.
...@@ -204,17 +236,23 @@ torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor
     p.imgWidth = uv.size(2);
     p.texDepth = tex.size(0);
     if (p.enableMip)
+    {
+        if (has_uv_da)
         {
             if (!cube_mode)
                 NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]");
             else
                 NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode");
         }
+        if (has_mip_level_bias)
+            NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]");
+    }
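The new per-pixel bias input is a [minibatch_size, height, width] float32 tensor, per the shape check above. A hedged sketch constructing a constant-bias tensor of that shape; sizes are illustrative, and the sign convention noted in the comment is the usual mip-bias interpretation rather than something spelled out in this diff:

    #include <torch/torch.h>

    torch::Tensor make_constant_bias(int64_t n, int64_t h, int64_t w, float bias)
    {
        torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
        // A positive bias selects blurrier (higher) mip levels, a negative
        // one sharper (lower) levels.
        return torch::full({n, h, w}, bias, opts);
    }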
     // Get input pointers.
     p.tex = tex.data_ptr<float>();
     p.uv = uv.data_ptr<float>();
-    p.uvDA = p.enableMip ? uv_da.data_ptr<float>() : NULL;
+    p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr<float>() : NULL;
+    p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr<float>() : NULL;
     // Allocate output tensor.
     torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
...@@ -263,8 +301,8 @@ torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor
     dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
     dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
-    // Choose kernel based on filter mode, cube mode, and datatype.
-    void* func_tbl[TEX_MODE_COUNT * 3 * 2] = {
+    // Choose kernel based on filter mode, cube mode, bias-only mode, and datatype.
+    void* func_tbl[TEX_MODE_COUNT * 2 * 2 * 3] = {
         (void*)TextureFwdKernelNearest1,
         (void*)TextureFwdKernelNearest2,
         (void*)TextureFwdKernelNearest4,
...@@ -289,13 +327,39 @@ torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor
         (void*)TextureFwdKernelCubeLinearMipmapLinear1,
         (void*)TextureFwdKernelCubeLinearMipmapLinear2,
         (void*)TextureFwdKernelCubeLinearMipmapLinear4,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        (void*)TextureFwdKernelLinearMipmapNearestBO1,
+        (void*)TextureFwdKernelLinearMipmapNearestBO2,
+        (void*)TextureFwdKernelLinearMipmapNearestBO4,
+        (void*)TextureFwdKernelLinearMipmapLinearBO1,
+        (void*)TextureFwdKernelLinearMipmapLinearBO2,
+        (void*)TextureFwdKernelLinearMipmapLinearBO4,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        (void*)TextureFwdKernelCubeLinearMipmapNearestBO1,
+        (void*)TextureFwdKernelCubeLinearMipmapNearestBO2,
+        (void*)TextureFwdKernelCubeLinearMipmapNearestBO4,
+        (void*)TextureFwdKernelCubeLinearMipmapLinearBO1,
+        (void*)TextureFwdKernelCubeLinearMipmapLinearBO2,
+        (void*)TextureFwdKernelCubeLinearMipmapLinearBO4,
     };
     // Function index.
     int func_idx = p.filterMode;
     if (cube_mode)
-        func_idx += TEX_MODE_COUNT;
+        func_idx += TEX_MODE_COUNT; // Cube variant.
+    if (p.enableMip && !has_uv_da)
+        func_idx += TEX_MODE_COUNT * 2; // Bias-only variant.
-    func_idx = func_idx * 3 + channel_div_idx;
+    func_idx = func_idx * 3 + channel_div_idx; // Choose vector size.
     // Launch kernel.
     NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
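The table is laid out as [bias-only][cube][filter mode][vector width], so the index arithmetic above selects one of TEX_MODE_COUNT * 2 * 2 * 3 slots. A self-contained check of that arithmetic; TEX_MODE_COUNT = 4 is an assumption matching the four filter modes bound earlier (nearest, linear, linear-mipmap-nearest, linear-mipmap-linear):

    #include <cassert>

    constexpr int TEX_MODE_COUNT = 4;

    int func_index(int filter_mode, bool cube_mode, bool bias_only, int channel_div_idx)
    {
        int func_idx = filter_mode;
        if (cube_mode)
            func_idx += TEX_MODE_COUNT;        // Cube variant.
        if (bias_only)
            func_idx += TEX_MODE_COUNT * 2;    // Bias-only variant.
        return func_idx * 3 + channel_div_idx; // 3 vector widths: 1, 2, 4 channels.
    }

    int main()
    {
        // Cube-mapped, bias-only, linear-mipmap-linear, 4-channel variant:
        // (3 + 4 + 8) * 3 + 2 = 47, the last slot of the 48-entry table.
        assert(func_index(3, true, true, 2) == 47);
        return 0;
    }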
...@@ -308,37 +372,52 @@ torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor
 torch::Tensor texture_fwd(torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode)
 {
     torch::Tensor empty_tensor;
-    return texture_fwd_mip(tex, uv, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
+    return texture_fwd_mip(tex, uv, empty_tensor, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
 }
 //------------------------------------------------------------------------
 // Gradient op.
-std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipmap_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip_wrap, int filter_mode, int boundary_mode)
+std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipmap_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrap, int filter_mode, int boundary_mode)
 {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(tex));
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     TextureKernelParams p = {}; // Initialize all fields to zero.
     torch::Tensor& mip = mip_wrap.mip; // Unwrap.
     int max_mip_level = mip_wrap.max_mip_level;
     set_modes(p, filter_mode, boundary_mode, max_mip_level);
+    // See if we have these tensors or not.
+    bool has_uv_da = uv_da.defined() && uv_da.nbytes();
+    bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes();
     if (p.enableMip)
     {
-        NVDR_CHECK(uv_da.defined(), "mipmapping filter mode requires uv_da input in gradient");
-        NVDR_CHECK(mip.defined(), "mipmapping filter mode requires mip input in gradient");
+        NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input");
+        NVDR_CHECK(mip.defined(), "mipmapping filter mode requires mip tensor input");
     }
     // Check inputs.
+    NVDR_CHECK_DEVICE(tex, uv);
+    NVDR_CHECK_CONTIGUOUS(tex, uv);
+    NVDR_CHECK_F32(tex, uv);
     if (p.enableMip)
     {
-        NVDR_CHECK_DEVICE(tex, uv, dy, uv_da, mip);
-        NVDR_CHECK_CONTIGUOUS(tex, uv, uv_da, mip);
-        NVDR_CHECK_F32(tex, uv, dy, uv_da, mip);
+        NVDR_CHECK_DEVICE(mip);
+        NVDR_CHECK_CONTIGUOUS(mip);
+        NVDR_CHECK_F32(mip);
+        if (has_uv_da)
+        {
+            NVDR_CHECK_DEVICE(uv_da);
+            NVDR_CHECK_CONTIGUOUS(uv_da);
+            NVDR_CHECK_F32(uv_da);
+        }
-    }
-    else
-    {
-        NVDR_CHECK_DEVICE(tex, uv, dy);
-        NVDR_CHECK_CONTIGUOUS(tex, uv);
-        NVDR_CHECK_F32(tex, uv, dy);
+        if (has_mip_level_bias)
+        {
+            NVDR_CHECK_DEVICE(mip_level_bias);
+            NVDR_CHECK_CONTIGUOUS(mip_level_bias);
+            NVDR_CHECK_F32(mip_level_bias);
+        }
     }
     // Sanity checks and state setters.
...@@ -367,12 +446,17 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
     p.imgWidth = uv.size(2);
     p.texDepth = tex.size(0);
     if (p.enableMip)
+    {
+        if (has_uv_da)
         {
             if (!cube_mode)
                 NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]");
             else
                 NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode");
         }
+        if (has_mip_level_bias)
+            NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]");
+    }
     NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) == p.n && dy.size(1) == p.imgHeight && dy.size(2) == p.imgWidth && dy.size(3) == p.channels, "dy must have shape [minibatch_size, height, width, channels]");
     // Get contiguous version of dy.
...@@ -382,7 +466,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
     p.tex = tex.data_ptr<float>();
     p.uv = uv.data_ptr<float>();
     p.dy = dy_.data_ptr<float>();
-    p.uvDA = p.enableMip ? uv_da.data_ptr<float>() : NULL;
+    p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr<float>() : NULL;
+    p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr<float>() : NULL;
     p.mip = p.enableMip ? (float*)mip.data_ptr<float>() : NULL;
     // Allocate output tensor for tex gradient.
...@@ -392,17 +477,29 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
     // Allocate output tensor for uv gradient.
     torch::Tensor grad_uv;
     torch::Tensor grad_uv_da;
+    torch::Tensor grad_mip_level_bias;
     if (p.filterMode != TEX_MODE_NEAREST)
     {
         grad_uv = torch::empty_like(uv);
         p.gradUV = grad_uv.data_ptr<float>();
-        // Allocate output tensor for uv_da gradient.
+        // Gradients for things affecting mip level.
         if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR)
+        {
+            // Allocate output tensor for uv_da gradient.
+            if (has_uv_da)
             {
                 grad_uv_da = torch::empty_like(uv_da);
                 p.gradUVDA = grad_uv_da.data_ptr<float>();
             }
+            // Allocate output tensor for mip_level_bias gradient.
+            if (has_mip_level_bias)
+            {
+                grad_mip_level_bias = torch::empty_like(mip_level_bias);
+                p.gradMipLevelBias = grad_mip_level_bias.data_ptr<float>();
+            }
+        }
     }
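Gradient outputs that do not apply stay as default-constructed (undefined) tensors, so the returned tuple always has the same arity. A small sketch of how a caller can tell the cases apart:

    #include <torch/torch.h>
    #include <iostream>

    // A default-constructed torch::Tensor is "undefined", which is how the
    // gradient op above signals "no gradient produced for this input".
    void report(const torch::Tensor& grad, const char* name)
    {
        if (grad.defined())
            std::cout << name << ": " << grad.sizes() << "\n";
        else
            std::cout << name << ": not computed\n";
    }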
     // Choose kernel variants based on channel count.
...@@ -457,7 +554,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
     dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
     dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
-    void* func_tbl[TEX_MODE_COUNT * 2] = {
+    void* func_tbl[TEX_MODE_COUNT * 2 * 2] = {
         (void*)TextureGradKernelNearest,
         (void*)TextureGradKernelLinear,
         (void*)TextureGradKernelLinearMipmapNearest,
...@@ -466,12 +563,22 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
         (void*)TextureGradKernelCubeLinear,
         (void*)TextureGradKernelCubeLinearMipmapNearest,
         (void*)TextureGradKernelCubeLinearMipmapLinear,
+        NULL,
+        NULL,
+        (void*)TextureGradKernelLinearMipmapNearestBO,
+        (void*)TextureGradKernelLinearMipmapLinearBO,
+        NULL,
+        NULL,
+        (void*)TextureGradKernelCubeLinearMipmapNearestBO,
+        (void*)TextureGradKernelCubeLinearMipmapLinearBO,
     };
     // Function index.
     int func_idx = p.filterMode;
     if (cube_mode)
-        func_idx += TEX_MODE_COUNT;
+        func_idx += TEX_MODE_COUNT; // Cube variant.
+    if (p.enableMip && !has_uv_da)
+        func_idx += TEX_MODE_COUNT * 2; // Bias-only variant.
     // Launch main gradient kernel.
     NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
...@@ -488,14 +595,14 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> texture_grad_linear_mipm
     }
     // Return output tensors.
-    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>(grad_tex, grad_uv, grad_uv_da);
+    return std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>(grad_tex, grad_uv, grad_uv_da, grad_mip_level_bias);
 }
 // Version for nearest filter mode.
 torch::Tensor texture_grad_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode)
 {
     torch::Tensor empty_tensor;
-    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
+    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
     return std::get<0>(result);
 }
...@@ -503,14 +610,14 @@ torch::Tensor texture_grad_nearest(torch::Tensor tex, torch::Tensor uv, torch::T
 std::tuple<torch::Tensor, torch::Tensor> texture_grad_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode)
 {
     torch::Tensor empty_tensor;
-    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
+    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), filter_mode, boundary_mode);
     return std::tuple<torch::Tensor, torch::Tensor>(std::get<0>(result), std::get<1>(result));
 }
 // Version for linear-mipmap-nearest mode.
-std::tuple<torch::Tensor, torch::Tensor> texture_grad_linear_mipmap_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, TextureMipWrapper mip, int filter_mode, int boundary_mode)
+std::tuple<torch::Tensor, torch::Tensor> texture_grad_linear_mipmap_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip, int filter_mode, int boundary_mode)
 {
-    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode, boundary_mode);
+    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> result = texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip, filter_mode, boundary_mode);
     return std::tuple<torch::Tensor, torch::Tensor>(std::get<0>(result), std::get<1>(result));
 }
...
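The full gradient op now returns four tensors. A hedged sketch of consuming the widened tuple with structured bindings; the function is hypothetical, only the tuple shape comes from the diff:

    #include <torch/torch.h>
    #include <tuple>

    void consume(std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> grads)
    {
        auto [grad_tex, grad_uv, grad_uv_da, grad_mip_level_bias] = std::move(grads);
        // grad_uv_da and grad_mip_level_bias are defined only when the
        // corresponding uv_da / mip_level_bias input was supplied; the
        // unused one stays undefined (see the allocation logic above).
    }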
...@@ -15,7 +15,7 @@ class RasterizeGLState;
 class RasterizeGLStateWrapper
 {
 public:
-    RasterizeGLStateWrapper (bool enableDB, bool automatic);
+    RasterizeGLStateWrapper (bool enableDB, bool automatic, int cudaDeviceIdx);
     ~RasterizeGLStateWrapper (void);
     void setContext (void);
...@@ -23,6 +23,7 @@ public:
     RasterizeGLState* pState;
     bool automatic;
+    int cudaDeviceIdx;
 };
 //------------------------------------------------------------------------
...