Commit fba8bde8 authored by bailuo
update
# Copyright (c) SenseTime Research. All rights reserved.
from .fused_act import FusedLeakyReLU, fused_leaky_relu
from .upfirdn2d import upfirdn2d
# Copyright (c) SenseTime Research. All rights reserved.
import os
import torch
from torch import nn
from torch.nn import functional as F
from torch.autograd import Function
from torch.utils.cpp_extension import load
module_path = os.path.dirname(__file__)
fused = load(
"fused",
sources=[
os.path.join(module_path, "fused_bias_act.cpp"),
os.path.join(module_path, "fused_bias_act_kernel.cu"),
],
)
class FusedLeakyReLUFunctionBackward(Function):
@staticmethod
def forward(ctx, grad_output, out, negative_slope, scale):
ctx.save_for_backward(out)
ctx.negative_slope = negative_slope
ctx.scale = scale
empty = grad_output.new_empty(0)
grad_input = fused.fused_bias_act(
grad_output, empty, out, 3, 1, negative_slope, scale
)
dim = [0]
if grad_input.ndim > 2:
dim += list(range(2, grad_input.ndim))
grad_bias = grad_input.sum(dim).detach()
return grad_input, grad_bias
@staticmethod
def backward(ctx, gradgrad_input, gradgrad_bias):
(out,) = ctx.saved_tensors
gradgrad_out = fused.fused_bias_act(
gradgrad_input, gradgrad_bias, out, 3, 1, ctx.negative_slope, ctx.scale
)
return gradgrad_out, None, None, None
class FusedLeakyReLUFunction(Function):
@staticmethod
def forward(ctx, input, bias, negative_slope, scale):
empty = input.new_empty(0)
out = fused.fused_bias_act(input, bias, empty, 3, 0, negative_slope, scale)
ctx.save_for_backward(out)
ctx.negative_slope = negative_slope
ctx.scale = scale
return out
@staticmethod
def backward(ctx, grad_output):
(out,) = ctx.saved_tensors
grad_input, grad_bias = FusedLeakyReLUFunctionBackward.apply(
grad_output, out, ctx.negative_slope, ctx.scale
)
return grad_input, grad_bias, None, None
class FusedLeakyReLU(nn.Module):
def __init__(self, channel, negative_slope=0.2, scale=2 ** 0.5):
super().__init__()
self.bias = nn.Parameter(torch.zeros(channel))
self.negative_slope = negative_slope
self.scale = scale
def forward(self, input):
return fused_leaky_relu(input, self.bias, self.negative_slope, self.scale)
def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5):
if input.device.type == "cpu":
rest_dim = [1] * (input.ndim - bias.ndim - 1)
return (
F.leaky_relu(
input + bias.view(1, bias.shape[0], *rest_dim), negative_slope=negative_slope
)
* scale
)
else:
return FusedLeakyReLUFunction.apply(input, bias, negative_slope, scale)
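# --- Usage sketch (not part of the original file): a minimal, hedged example of how
# fused_leaky_relu / FusedLeakyReLU above are typically called. Assumes the module imported
# successfully (i.e. the CUDA extension compiled); on CPU tensors the fallback branch is used.
if __name__ == "__main__":
    x = torch.randn(4, 8, 16, 16)             # NCHW activations
    act = FusedLeakyReLU(channel=8)           # learnable per-channel bias + leaky ReLU * sqrt(2)
    y = act(x)                                # same shape as x
    y2 = fused_leaky_relu(x, torch.zeros(8))  # functional form with an explicit bias vector
    print(y.shape, y2.shape)                  # torch.Size([4, 8, 16, 16]) twice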
// Copyright (c) SenseTime Research. All rights reserved.
#include <torch/extension.h>
torch::Tensor fused_bias_act_op(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer,
int act, int grad, float alpha, float scale);
#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
torch::Tensor fused_bias_act(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer,
int act, int grad, float alpha, float scale) {
CHECK_CUDA(input);
CHECK_CUDA(bias);
return fused_bias_act_op(input, bias, refer, act, grad, alpha, scale);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("fused_bias_act", &fused_bias_act, "fused bias act (CUDA)");
}
// Copyright (c) SenseTime Research. All rights reserved.
// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
//
// This work is made available under the Nvidia Source Code License-NC.
// To view a copy of this license, visit
// https://nvlabs.github.io/stylegan2/license.html
#include <torch/types.h>
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <cuda.h>
#include <cuda_runtime.h>
template <typename scalar_t>
static __global__ void fused_bias_act_kernel(scalar_t* out, const scalar_t* p_x, const scalar_t* p_b, const scalar_t* p_ref,
int act, int grad, scalar_t alpha, scalar_t scale, int loop_x, int size_x, int step_b, int size_b, int use_bias, int use_ref) {
int xi = blockIdx.x * loop_x * blockDim.x + threadIdx.x;
scalar_t zero = 0.0;
for (int loop_idx = 0; loop_idx < loop_x && xi < size_x; loop_idx++, xi += blockDim.x) {
scalar_t x = p_x[xi];
if (use_bias) {
x += p_b[(xi / step_b) % size_b];
}
scalar_t ref = use_ref ? p_ref[xi] : zero;
scalar_t y;
switch (act * 10 + grad) {
default:
case 10: y = x; break;
case 11: y = x; break;
case 12: y = 0.0; break;
case 30: y = (x > 0.0) ? x : x * alpha; break;
case 31: y = (ref > 0.0) ? x : x * alpha; break;
case 32: y = 0.0; break;
}
out[xi] = y * scale;
}
}
torch::Tensor fused_bias_act_op(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer,
int act, int grad, float alpha, float scale) {
int curDevice = -1;
cudaGetDevice(&curDevice);
cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
auto x = input.contiguous();
auto b = bias.contiguous();
auto ref = refer.contiguous();
int use_bias = b.numel() ? 1 : 0;
int use_ref = ref.numel() ? 1 : 0;
int size_x = x.numel();
int size_b = b.numel();
int step_b = 1;
for (int i = 2; i < x.dim(); i++) {
step_b *= x.size(i);
}
int loop_x = 4;
int block_size = 4 * 32;
int grid_size = (size_x - 1) / (loop_x * block_size) + 1;
auto y = torch::empty_like(x);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "fused_bias_act_kernel", [&] {
fused_bias_act_kernel<scalar_t><<<grid_size, block_size, 0, stream>>>(
y.data_ptr<scalar_t>(),
x.data_ptr<scalar_t>(),
b.data_ptr<scalar_t>(),
ref.data_ptr<scalar_t>(),
act,
grad,
alpha,
scale,
loop_x,
size_x,
step_b,
size_b,
use_bias,
use_ref
);
});
return y;
}
// Copyright (c) SenseTime Research. All rights reserved.
#include <torch/extension.h>
torch::Tensor upfirdn2d_op(const torch::Tensor& input, const torch::Tensor& kernel,
int up_x, int up_y, int down_x, int down_y,
int pad_x0, int pad_x1, int pad_y0, int pad_y1);
#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
torch::Tensor upfirdn2d(const torch::Tensor& input, const torch::Tensor& kernel,
int up_x, int up_y, int down_x, int down_y,
int pad_x0, int pad_x1, int pad_y0, int pad_y1) {
CHECK_CUDA(input);
CHECK_CUDA(kernel);
return upfirdn2d_op(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("upfirdn2d", &upfirdn2d, "upfirdn2d (CUDA)");
}
# Copyright (c) SenseTime Research. All rights reserved.
import os
import torch
from torch.nn import functional as F
from torch.autograd import Function
from torch.utils.cpp_extension import load
module_path = os.path.dirname(__file__)
upfirdn2d_op = load(
"upfirdn2d",
sources=[
os.path.join(module_path, "upfirdn2d.cpp"),
os.path.join(module_path, "upfirdn2d_kernel.cu"),
],
)
class UpFirDn2dBackward(Function):
@staticmethod
def forward(
ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, in_size, out_size
):
up_x, up_y = up
down_x, down_y = down
g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad
grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1)
grad_input = upfirdn2d_op.upfirdn2d(
grad_output,
grad_kernel,
down_x,
down_y,
up_x,
up_y,
g_pad_x0,
g_pad_x1,
g_pad_y0,
g_pad_y1,
)
grad_input = grad_input.view(in_size[0], in_size[1], in_size[2], in_size[3])
ctx.save_for_backward(kernel)
pad_x0, pad_x1, pad_y0, pad_y1 = pad
ctx.up_x = up_x
ctx.up_y = up_y
ctx.down_x = down_x
ctx.down_y = down_y
ctx.pad_x0 = pad_x0
ctx.pad_x1 = pad_x1
ctx.pad_y0 = pad_y0
ctx.pad_y1 = pad_y1
ctx.in_size = in_size
ctx.out_size = out_size
return grad_input
@staticmethod
def backward(ctx, gradgrad_input):
(kernel,) = ctx.saved_tensors
gradgrad_input = gradgrad_input.reshape(-1, ctx.in_size[2], ctx.in_size[3], 1)
gradgrad_out = upfirdn2d_op.upfirdn2d(
gradgrad_input,
kernel,
ctx.up_x,
ctx.up_y,
ctx.down_x,
ctx.down_y,
ctx.pad_x0,
ctx.pad_x1,
ctx.pad_y0,
ctx.pad_y1,
)
# gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.out_size[0], ctx.out_size[1], ctx.in_size[3])
gradgrad_out = gradgrad_out.view(
ctx.in_size[0], ctx.in_size[1], ctx.out_size[0], ctx.out_size[1]
)
return gradgrad_out, None, None, None, None, None, None, None, None
class UpFirDn2d(Function):
@staticmethod
def forward(ctx, input, kernel, up, down, pad):
up_x, up_y = up
down_x, down_y = down
pad_x0, pad_x1, pad_y0, pad_y1 = pad
kernel_h, kernel_w = kernel.shape
batch, channel, in_h, in_w = input.shape
ctx.in_size = input.shape
input = input.reshape(-1, in_h, in_w, 1)
ctx.save_for_backward(kernel, torch.flip(kernel, [0, 1]))
out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
ctx.out_size = (out_h, out_w)
ctx.up = (up_x, up_y)
ctx.down = (down_x, down_y)
ctx.pad = (pad_x0, pad_x1, pad_y0, pad_y1)
g_pad_x0 = kernel_w - pad_x0 - 1
g_pad_y0 = kernel_h - pad_y0 - 1
g_pad_x1 = in_w * up_x - out_w * down_x + pad_x0 - up_x + 1
g_pad_y1 = in_h * up_y - out_h * down_y + pad_y0 - up_y + 1
ctx.g_pad = (g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1)
out = upfirdn2d_op.upfirdn2d(
input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1
)
# out = out.view(major, out_h, out_w, minor)
out = out.view(-1, channel, out_h, out_w)
return out
@staticmethod
def backward(ctx, grad_output):
kernel, grad_kernel = ctx.saved_tensors
grad_input = UpFirDn2dBackward.apply(
grad_output,
kernel,
grad_kernel,
ctx.up,
ctx.down,
ctx.pad,
ctx.g_pad,
ctx.in_size,
ctx.out_size,
)
return grad_input, None, None, None, None
def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
if input.device.type == "cpu":
out = upfirdn2d_native(
input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1]
)
else:
out = UpFirDn2d.apply(
input, kernel, (up, up), (down, down), (pad[0], pad[1], pad[0], pad[1])
)
return out
def upfirdn2d_native(
input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1
):
_, channel, in_h, in_w = input.shape
input = input.reshape(-1, in_h, in_w, 1)
_, in_h, in_w, minor = input.shape
kernel_h, kernel_w = kernel.shape
out = input.view(-1, in_h, 1, in_w, 1, minor)
out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
out = out.view(-1, in_h * up_y, in_w * up_x, minor)
out = F.pad(
out, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]
)
out = out[
:,
max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0),
max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0),
:,
]
out = out.permute(0, 3, 1, 2)
out = out.reshape(
[-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]
)
w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w)
out = F.conv2d(out, w)
out = out.reshape(
-1,
minor,
in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1,
)
out = out.permute(0, 2, 3, 1)
out = out[:, ::down_y, ::down_x, :]
out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
return out.view(-1, channel, out_h, out_w)
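# --- Usage sketch (not part of the original file): a hedged example of 2x upsampling with a
# separable binomial filter, the way StyleGAN2-style resampling typically uses upfirdn2d.
# Per axis the output size is out = (in * up + pad0 + pad1 - kernel) // down + 1, e.g.
# in=16, up=2, pad=(2, 1), kernel=4, down=1 -> (32 + 3 - 4) // 1 + 1 = 32.
# Assumes the module imported successfully; CPU tensors take the upfirdn2d_native path.
if __name__ == "__main__":
    k1d = torch.tensor([1.0, 3.0, 3.0, 1.0])
    kernel = k1d[:, None] * k1d[None, :]
    kernel = kernel / kernel.sum() * 4        # gain of up**2 keeps average brightness after upsampling
    x = torch.randn(1, 3, 16, 16)
    y = upfirdn2d(x, kernel, up=2, down=1, pad=(2, 1))
    print(y.shape)                            # expected: torch.Size([1, 3, 32, 32])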
// Copyright (c) SenseTime Research. All rights reserved.
// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
//
// This work is made available under the Nvidia Source Code License-NC.
// To view a copy of this license, visit
// https://nvlabs.github.io/stylegan2/license.html
#include <torch/types.h>
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
static __host__ __device__ __forceinline__ int floor_div(int a, int b) {
int c = a / b;
if (c * b > a) {
c--;
}
return c;
}
struct UpFirDn2DKernelParams {
int up_x;
int up_y;
int down_x;
int down_y;
int pad_x0;
int pad_x1;
int pad_y0;
int pad_y1;
int major_dim;
int in_h;
int in_w;
int minor_dim;
int kernel_h;
int kernel_w;
int out_h;
int out_w;
int loop_major;
int loop_x;
};
template <typename scalar_t>
__global__ void upfirdn2d_kernel_large(scalar_t *out, const scalar_t *input,
const scalar_t *kernel,
const UpFirDn2DKernelParams p) {
int minor_idx = blockIdx.x * blockDim.x + threadIdx.x;
int out_y = minor_idx / p.minor_dim;
minor_idx -= out_y * p.minor_dim;
int out_x_base = blockIdx.y * p.loop_x * blockDim.y + threadIdx.y;
int major_idx_base = blockIdx.z * p.loop_major;
if (out_x_base >= p.out_w || out_y >= p.out_h ||
major_idx_base >= p.major_dim) {
return;
}
int mid_y = out_y * p.down_y + p.up_y - 1 - p.pad_y0;
int in_y = min(max(floor_div(mid_y, p.up_y), 0), p.in_h);
int h = min(max(floor_div(mid_y + p.kernel_h, p.up_y), 0), p.in_h) - in_y;
int kernel_y = mid_y + p.kernel_h - (in_y + 1) * p.up_y;
for (int loop_major = 0, major_idx = major_idx_base;
loop_major < p.loop_major && major_idx < p.major_dim;
loop_major++, major_idx++) {
for (int loop_x = 0, out_x = out_x_base;
loop_x < p.loop_x && out_x < p.out_w; loop_x++, out_x += blockDim.y) {
int mid_x = out_x * p.down_x + p.up_x - 1 - p.pad_x0;
int in_x = min(max(floor_div(mid_x, p.up_x), 0), p.in_w);
int w = min(max(floor_div(mid_x + p.kernel_w, p.up_x), 0), p.in_w) - in_x;
int kernel_x = mid_x + p.kernel_w - (in_x + 1) * p.up_x;
const scalar_t *x_p =
&input[((major_idx * p.in_h + in_y) * p.in_w + in_x) * p.minor_dim +
minor_idx];
const scalar_t *k_p = &kernel[kernel_y * p.kernel_w + kernel_x];
int x_px = p.minor_dim;
int k_px = -p.up_x;
int x_py = p.in_w * p.minor_dim;
int k_py = -p.up_y * p.kernel_w;
scalar_t v = 0.0f;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
v += static_cast<scalar_t>(*x_p) * static_cast<scalar_t>(*k_p);
x_p += x_px;
k_p += k_px;
}
x_p += x_py - w * x_px;
k_p += k_py - w * k_px;
}
out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim +
minor_idx] = v;
}
}
}
template <typename scalar_t, int up_x, int up_y, int down_x, int down_y,
int kernel_h, int kernel_w, int tile_out_h, int tile_out_w>
__global__ void upfirdn2d_kernel(scalar_t *out, const scalar_t *input,
const scalar_t *kernel,
const UpFirDn2DKernelParams p) {
const int tile_in_h = ((tile_out_h - 1) * down_y + kernel_h - 1) / up_y + 1;
const int tile_in_w = ((tile_out_w - 1) * down_x + kernel_w - 1) / up_x + 1;
__shared__ volatile float sk[kernel_h][kernel_w];
__shared__ volatile float sx[tile_in_h][tile_in_w];
int minor_idx = blockIdx.x;
int tile_out_y = minor_idx / p.minor_dim;
minor_idx -= tile_out_y * p.minor_dim;
tile_out_y *= tile_out_h;
int tile_out_x_base = blockIdx.y * p.loop_x * tile_out_w;
int major_idx_base = blockIdx.z * p.loop_major;
if (tile_out_x_base >= p.out_w | tile_out_y >= p.out_h |
major_idx_base >= p.major_dim) {
return;
}
for (int tap_idx = threadIdx.x; tap_idx < kernel_h * kernel_w;
tap_idx += blockDim.x) {
int ky = tap_idx / kernel_w;
int kx = tap_idx - ky * kernel_w;
scalar_t v = 0.0;
if (kx < p.kernel_w & ky < p.kernel_h) {
v = kernel[(p.kernel_h - 1 - ky) * p.kernel_w + (p.kernel_w - 1 - kx)];
}
sk[ky][kx] = v;
}
for (int loop_major = 0, major_idx = major_idx_base;
loop_major < p.loop_major & major_idx < p.major_dim;
loop_major++, major_idx++) {
for (int loop_x = 0, tile_out_x = tile_out_x_base;
loop_x < p.loop_x & tile_out_x < p.out_w;
loop_x++, tile_out_x += tile_out_w) {
int tile_mid_x = tile_out_x * down_x + up_x - 1 - p.pad_x0;
int tile_mid_y = tile_out_y * down_y + up_y - 1 - p.pad_y0;
int tile_in_x = floor_div(tile_mid_x, up_x);
int tile_in_y = floor_div(tile_mid_y, up_y);
__syncthreads();
for (int in_idx = threadIdx.x; in_idx < tile_in_h * tile_in_w;
in_idx += blockDim.x) {
int rel_in_y = in_idx / tile_in_w;
int rel_in_x = in_idx - rel_in_y * tile_in_w;
int in_x = rel_in_x + tile_in_x;
int in_y = rel_in_y + tile_in_y;
scalar_t v = 0.0;
if (in_x >= 0 & in_y >= 0 & in_x < p.in_w & in_y < p.in_h) {
v = input[((major_idx * p.in_h + in_y) * p.in_w + in_x) *
p.minor_dim +
minor_idx];
}
sx[rel_in_y][rel_in_x] = v;
}
__syncthreads();
for (int out_idx = threadIdx.x; out_idx < tile_out_h * tile_out_w;
out_idx += blockDim.x) {
int rel_out_y = out_idx / tile_out_w;
int rel_out_x = out_idx - rel_out_y * tile_out_w;
int out_x = rel_out_x + tile_out_x;
int out_y = rel_out_y + tile_out_y;
int mid_x = tile_mid_x + rel_out_x * down_x;
int mid_y = tile_mid_y + rel_out_y * down_y;
int in_x = floor_div(mid_x, up_x);
int in_y = floor_div(mid_y, up_y);
int rel_in_x = in_x - tile_in_x;
int rel_in_y = in_y - tile_in_y;
int kernel_x = (in_x + 1) * up_x - mid_x - 1;
int kernel_y = (in_y + 1) * up_y - mid_y - 1;
scalar_t v = 0.0;
#pragma unroll
for (int y = 0; y < kernel_h / up_y; y++)
#pragma unroll
for (int x = 0; x < kernel_w / up_x; x++)
v += sx[rel_in_y + y][rel_in_x + x] *
sk[kernel_y + y * up_y][kernel_x + x * up_x];
if (out_x < p.out_w & out_y < p.out_h) {
out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim +
minor_idx] = v;
}
}
}
}
}
torch::Tensor upfirdn2d_op(const torch::Tensor &input,
const torch::Tensor &kernel, int up_x, int up_y,
int down_x, int down_y, int pad_x0, int pad_x1,
int pad_y0, int pad_y1) {
int curDevice = -1;
cudaGetDevice(&curDevice);
cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
UpFirDn2DKernelParams p;
auto x = input.contiguous();
auto k = kernel.contiguous();
p.major_dim = x.size(0);
p.in_h = x.size(1);
p.in_w = x.size(2);
p.minor_dim = x.size(3);
p.kernel_h = k.size(0);
p.kernel_w = k.size(1);
p.up_x = up_x;
p.up_y = up_y;
p.down_x = down_x;
p.down_y = down_y;
p.pad_x0 = pad_x0;
p.pad_x1 = pad_x1;
p.pad_y0 = pad_y0;
p.pad_y1 = pad_y1;
p.out_h = (p.in_h * p.up_y + p.pad_y0 + p.pad_y1 - p.kernel_h + p.down_y) /
p.down_y;
p.out_w = (p.in_w * p.up_x + p.pad_x0 + p.pad_x1 - p.kernel_w + p.down_x) /
p.down_x;
auto out =
at::empty({p.major_dim, p.out_h, p.out_w, p.minor_dim}, x.options());
int mode = -1;
int tile_out_h = -1;
int tile_out_w = -1;
if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 &&
p.kernel_h <= 4 && p.kernel_w <= 4) {
mode = 1;
tile_out_h = 16;
tile_out_w = 64;
}
if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 &&
p.kernel_h <= 3 && p.kernel_w <= 3) {
mode = 2;
tile_out_h = 16;
tile_out_w = 64;
}
if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 &&
p.kernel_h <= 4 && p.kernel_w <= 4) {
mode = 3;
tile_out_h = 16;
tile_out_w = 64;
}
if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 &&
p.kernel_h <= 2 && p.kernel_w <= 2) {
mode = 4;
tile_out_h = 16;
tile_out_w = 64;
}
if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 &&
p.kernel_h <= 4 && p.kernel_w <= 4) {
mode = 5;
tile_out_h = 8;
tile_out_w = 32;
}
if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 &&
p.kernel_h <= 2 && p.kernel_w <= 2) {
mode = 6;
tile_out_h = 8;
tile_out_w = 32;
}
dim3 block_size;
dim3 grid_size;
if (tile_out_h > 0 && tile_out_w > 0) {
p.loop_major = (p.major_dim - 1) / 16384 + 1;
p.loop_x = 1;
block_size = dim3(32 * 8, 1, 1);
grid_size = dim3(((p.out_h - 1) / tile_out_h + 1) * p.minor_dim,
(p.out_w - 1) / (p.loop_x * tile_out_w) + 1,
(p.major_dim - 1) / p.loop_major + 1);
} else {
p.loop_major = (p.major_dim - 1) / 16384 + 1;
p.loop_x = 4;
block_size = dim3(4, 32, 1);
grid_size = dim3((p.out_h * p.minor_dim - 1) / block_size.x + 1,
(p.out_w - 1) / (p.loop_x * block_size.y) + 1,
(p.major_dim - 1) / p.loop_major + 1);
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] {
switch (mode) {
case 1:
upfirdn2d_kernel<scalar_t, 1, 1, 1, 1, 4, 4, 16, 64>
<<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
x.data_ptr<scalar_t>(),
k.data_ptr<scalar_t>(), p);
break;
case 2:
upfirdn2d_kernel<scalar_t, 1, 1, 1, 1, 3, 3, 16, 64>
<<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
x.data_ptr<scalar_t>(),
k.data_ptr<scalar_t>(), p);
break;
case 3:
upfirdn2d_kernel<scalar_t, 2, 2, 1, 1, 4, 4, 16, 64>
<<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
x.data_ptr<scalar_t>(),
k.data_ptr<scalar_t>(), p);
break;
case 4:
upfirdn2d_kernel<scalar_t, 2, 2, 1, 1, 2, 2, 16, 64>
<<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
x.data_ptr<scalar_t>(),
k.data_ptr<scalar_t>(), p);
break;
case 5:
upfirdn2d_kernel<scalar_t, 1, 1, 2, 2, 4, 4, 8, 32>
<<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
x.data_ptr<scalar_t>(),
k.data_ptr<scalar_t>(), p);
break;
case 6:
upfirdn2d_kernel<scalar_t, 1, 1, 2, 2, 4, 4, 8, 32>
<<<grid_size, block_size, 0, stream>>>(out.data_ptr<scalar_t>(),
x.data_ptr<scalar_t>(),
k.data_ptr<scalar_t>(), p);
break;
default:
upfirdn2d_kernel_large<scalar_t><<<grid_size, block_size, 0, stream>>>(
out.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(),
k.data_ptr<scalar_t>(), p);
}
});
return out;
}
# Copyright (c) SenseTime Research. All rights reserved.
#empty
// Copyright (c) SenseTime Research. All rights reserved.
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "bias_act.h"
//------------------------------------------------------------------------
static bool has_same_layout(torch::Tensor x, torch::Tensor y)
{
if (x.dim() != y.dim())
return false;
for (int64_t i = 0; i < x.dim(); i++)
{
if (x.size(i) != y.size(i))
return false;
if (x.size(i) >= 2 && x.stride(i) != y.stride(i))
return false;
}
return true;
}
//------------------------------------------------------------------------
static torch::Tensor bias_act(torch::Tensor x, torch::Tensor b, torch::Tensor xref, torch::Tensor yref, torch::Tensor dy, int grad, int dim, int act, float alpha, float gain, float clamp)
{
// Validate arguments.
TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
TORCH_CHECK(b.numel() == 0 || (b.dtype() == x.dtype() && b.device() == x.device()), "b must have the same dtype and device as x");
TORCH_CHECK(xref.numel() == 0 || (xref.sizes() == x.sizes() && xref.dtype() == x.dtype() && xref.device() == x.device()), "xref must have the same shape, dtype, and device as x");
TORCH_CHECK(yref.numel() == 0 || (yref.sizes() == x.sizes() && yref.dtype() == x.dtype() && yref.device() == x.device()), "yref must have the same shape, dtype, and device as x");
TORCH_CHECK(dy.numel() == 0 || (dy.sizes() == x.sizes() && dy.dtype() == x.dtype() && dy.device() == x.device()), "dy must have the same shape, dtype, and device as x");
TORCH_CHECK(x.numel() <= INT_MAX, "x is too large");
TORCH_CHECK(b.dim() == 1, "b must have rank 1");
TORCH_CHECK(b.numel() == 0 || (dim >= 0 && dim < x.dim()), "dim is out of bounds");
TORCH_CHECK(b.numel() == 0 || b.numel() == x.size(dim), "b has wrong number of elements");
TORCH_CHECK(grad >= 0, "grad must be non-negative");
// Validate layout.
TORCH_CHECK(x.is_non_overlapping_and_dense(), "x must be non-overlapping and dense");
TORCH_CHECK(b.is_contiguous(), "b must be contiguous");
TORCH_CHECK(xref.numel() == 0 || has_same_layout(xref, x), "xref must have the same layout as x");
TORCH_CHECK(yref.numel() == 0 || has_same_layout(yref, x), "yref must have the same layout as x");
TORCH_CHECK(dy.numel() == 0 || has_same_layout(dy, x), "dy must have the same layout as x");
// Create output tensor.
const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
torch::Tensor y = torch::empty_like(x);
TORCH_CHECK(has_same_layout(y, x), "y must have the same layout as x");
// Initialize CUDA kernel parameters.
bias_act_kernel_params p;
p.x = x.data_ptr();
p.b = (b.numel()) ? b.data_ptr() : NULL;
p.xref = (xref.numel()) ? xref.data_ptr() : NULL;
p.yref = (yref.numel()) ? yref.data_ptr() : NULL;
p.dy = (dy.numel()) ? dy.data_ptr() : NULL;
p.y = y.data_ptr();
p.grad = grad;
p.act = act;
p.alpha = alpha;
p.gain = gain;
p.clamp = clamp;
p.sizeX = (int)x.numel();
p.sizeB = (int)b.numel();
p.stepB = (b.numel()) ? (int)x.stride(dim) : 1;
// Choose CUDA kernel.
void* kernel;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "bias_act_cuda", [&]
{
kernel = choose_bias_act_kernel<scalar_t>(p);
});
TORCH_CHECK(kernel, "no CUDA kernel found for the specified activation func");
// Launch CUDA kernel.
p.loopX = 4;
int blockSize = 4 * 32;
int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1;
void* args[] = {&p};
AT_CUDA_CHECK(cudaLaunchKernel(kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream()));
return y;
}
//------------------------------------------------------------------------
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("bias_act", &bias_act);
}
//------------------------------------------------------------------------
// Copyright (c) SenseTime Research. All rights reserved.
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <c10/util/Half.h>
#include "bias_act.h"
//------------------------------------------------------------------------
// Helpers.
template <class T> struct InternalType;
template <> struct InternalType<double> { typedef double scalar_t; };
template <> struct InternalType<float> { typedef float scalar_t; };
template <> struct InternalType<c10::Half> { typedef float scalar_t; };
//------------------------------------------------------------------------
// CUDA kernel.
template <class T, int A>
__global__ void bias_act_kernel(bias_act_kernel_params p)
{
typedef typename InternalType<T>::scalar_t scalar_t;
int G = p.grad;
scalar_t alpha = (scalar_t)p.alpha;
scalar_t gain = (scalar_t)p.gain;
scalar_t clamp = (scalar_t)p.clamp;
scalar_t one = (scalar_t)1;
scalar_t two = (scalar_t)2;
scalar_t expRange = (scalar_t)80;
scalar_t halfExpRange = (scalar_t)40;
scalar_t seluScale = (scalar_t)1.0507009873554804934193349852946;
scalar_t seluAlpha = (scalar_t)1.6732632423543772848170429916717;
// Loop over elements.
int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x;
for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX; loopIdx++, xi += blockDim.x)
{
// Load.
scalar_t x = (scalar_t)((const T*)p.x)[xi];
scalar_t b = (p.b) ? (scalar_t)((const T*)p.b)[(xi / p.stepB) % p.sizeB] : 0;
scalar_t xref = (p.xref) ? (scalar_t)((const T*)p.xref)[xi] : 0;
scalar_t yref = (p.yref) ? (scalar_t)((const T*)p.yref)[xi] : 0;
scalar_t dy = (p.dy) ? (scalar_t)((const T*)p.dy)[xi] : one;
scalar_t yy = (gain != 0) ? yref / gain : 0;
scalar_t y = 0;
// Apply bias.
((G == 0) ? x : xref) += b;
// linear
if (A == 1)
{
if (G == 0) y = x;
if (G == 1) y = x;
}
// relu
if (A == 2)
{
if (G == 0) y = (x > 0) ? x : 0;
if (G == 1) y = (yy > 0) ? x : 0;
}
// lrelu
if (A == 3)
{
if (G == 0) y = (x > 0) ? x : x * alpha;
if (G == 1) y = (yy > 0) ? x : x * alpha;
}
// tanh
if (A == 4)
{
if (G == 0) { scalar_t c = exp(x); scalar_t d = one / c; y = (x < -expRange) ? -one : (x > expRange) ? one : (c - d) / (c + d); }
if (G == 1) y = x * (one - yy * yy);
if (G == 2) y = x * (one - yy * yy) * (-two * yy);
}
// sigmoid
if (A == 5)
{
if (G == 0) y = (x < -expRange) ? 0 : one / (exp(-x) + one);
if (G == 1) y = x * yy * (one - yy);
if (G == 2) y = x * yy * (one - yy) * (one - two * yy);
}
// elu
if (A == 6)
{
if (G == 0) y = (x >= 0) ? x : exp(x) - one;
if (G == 1) y = (yy >= 0) ? x : x * (yy + one);
if (G == 2) y = (yy >= 0) ? 0 : x * (yy + one);
}
// selu
if (A == 7)
{
if (G == 0) y = (x >= 0) ? seluScale * x : (seluScale * seluAlpha) * (exp(x) - one);
if (G == 1) y = (yy >= 0) ? x * seluScale : x * (yy + seluScale * seluAlpha);
if (G == 2) y = (yy >= 0) ? 0 : x * (yy + seluScale * seluAlpha);
}
// softplus
if (A == 8)
{
if (G == 0) y = (x > expRange) ? x : log(exp(x) + one);
if (G == 1) y = x * (one - exp(-yy));
if (G == 2) { scalar_t c = exp(-yy); y = x * c * (one - c); }
}
// swish
if (A == 9)
{
if (G == 0)
y = (x < -expRange) ? 0 : x / (exp(-x) + one);
else
{
scalar_t c = exp(xref);
scalar_t d = c + one;
if (G == 1)
y = (xref > halfExpRange) ? x : x * c * (xref + d) / (d * d);
else
y = (xref > halfExpRange) ? 0 : x * c * (xref * (two - d) + two * d) / (d * d * d);
yref = (xref < -expRange) ? 0 : xref / (exp(-xref) + one) * gain;
}
}
// Apply gain.
y *= gain * dy;
// Clamp.
if (clamp >= 0)
{
if (G == 0)
y = (y > -clamp & y < clamp) ? y : (y >= 0) ? clamp : -clamp;
else
y = (yref > -clamp & yref < clamp) ? y : 0;
}
// Store.
((T*)p.y)[xi] = (T)y;
}
}
//------------------------------------------------------------------------
// CUDA kernel selection.
template <class T> void* choose_bias_act_kernel(const bias_act_kernel_params& p)
{
if (p.act == 1) return (void*)bias_act_kernel<T, 1>;
if (p.act == 2) return (void*)bias_act_kernel<T, 2>;
if (p.act == 3) return (void*)bias_act_kernel<T, 3>;
if (p.act == 4) return (void*)bias_act_kernel<T, 4>;
if (p.act == 5) return (void*)bias_act_kernel<T, 5>;
if (p.act == 6) return (void*)bias_act_kernel<T, 6>;
if (p.act == 7) return (void*)bias_act_kernel<T, 7>;
if (p.act == 8) return (void*)bias_act_kernel<T, 8>;
if (p.act == 9) return (void*)bias_act_kernel<T, 9>;
return NULL;
}
//------------------------------------------------------------------------
// Template specializations.
template void* choose_bias_act_kernel<double> (const bias_act_kernel_params& p);
template void* choose_bias_act_kernel<float> (const bias_act_kernel_params& p);
template void* choose_bias_act_kernel<c10::Half> (const bias_act_kernel_params& p);
//------------------------------------------------------------------------
// Copyright (c) SenseTime Research. All rights reserved.
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//------------------------------------------------------------------------
// CUDA kernel parameters.
struct bias_act_kernel_params
{
const void* x; // [sizeX]
const void* b; // [sizeB] or NULL
const void* xref; // [sizeX] or NULL
const void* yref; // [sizeX] or NULL
const void* dy; // [sizeX] or NULL
void* y; // [sizeX]
int grad;
int act;
float alpha;
float gain;
float clamp;
int sizeX;
int sizeB;
int stepB;
int loopX;
};
//------------------------------------------------------------------------
// CUDA kernel selection.
template <class T> void* choose_bias_act_kernel(const bias_act_kernel_params& p);
//------------------------------------------------------------------------
# Copyright (c) SenseTime Research. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
"""Custom PyTorch ops for efficient bias and activation."""
import os
import warnings
import numpy as np
import torch
import dnnlib
import traceback
from .. import custom_ops
from .. import misc
#----------------------------------------------------------------------------
activation_funcs = {
'linear': dnnlib.EasyDict(func=lambda x, **_: x, def_alpha=0, def_gain=1, cuda_idx=1, ref='', has_2nd_grad=False),
'relu': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.relu(x), def_alpha=0, def_gain=np.sqrt(2), cuda_idx=2, ref='y', has_2nd_grad=False),
'lrelu': dnnlib.EasyDict(func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha), def_alpha=0.2, def_gain=np.sqrt(2), cuda_idx=3, ref='y', has_2nd_grad=False),
'tanh': dnnlib.EasyDict(func=lambda x, **_: torch.tanh(x), def_alpha=0, def_gain=1, cuda_idx=4, ref='y', has_2nd_grad=True),
'sigmoid': dnnlib.EasyDict(func=lambda x, **_: torch.sigmoid(x), def_alpha=0, def_gain=1, cuda_idx=5, ref='y', has_2nd_grad=True),
'elu': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.elu(x), def_alpha=0, def_gain=1, cuda_idx=6, ref='y', has_2nd_grad=True),
'selu': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.selu(x), def_alpha=0, def_gain=1, cuda_idx=7, ref='y', has_2nd_grad=True),
'softplus': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.softplus(x), def_alpha=0, def_gain=1, cuda_idx=8, ref='y', has_2nd_grad=True),
'swish': dnnlib.EasyDict(func=lambda x, **_: torch.sigmoid(x) * x, def_alpha=0, def_gain=np.sqrt(2), cuda_idx=9, ref='x', has_2nd_grad=True),
}
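# --- Illustration (not part of the original file): each entry above bundles the reference op
# with its default alpha/gain and the cuda_idx dispatched to the C++/CUDA kernel. A hedged
# sketch of how _bias_act_ref() below consumes a spec:
def _activation_spec_example(x):
    spec = activation_funcs['lrelu']
    y = spec.func(x, alpha=spec.def_alpha)    # leaky_relu(x, 0.2)
    return y * spec.def_gain                  # scaled by sqrt(2), matching the fused kernel's gain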
#----------------------------------------------------------------------------
_inited = False
_plugin = None
_null_tensor = torch.empty([0])
def _init():
global _inited, _plugin
if not _inited:
_inited = True
sources = ['bias_act.cpp', 'bias_act.cu']
sources = [os.path.join(os.path.dirname(__file__), s) for s in sources]
try:
_plugin = custom_ops.get_plugin('bias_act_plugin', sources=sources, extra_cuda_cflags=['--use_fast_math'])
except:
warnings.warn('Failed to build CUDA kernels for bias_act. Falling back to slow reference implementation. Details:\n\n' + traceback.format_exc())
return _plugin is not None
#----------------------------------------------------------------------------
def bias_act(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None, impl='cuda'):
r"""Fused bias and activation function.
Adds bias `b` to activation tensor `x`, evaluates activation function `act`,
and scales the result by `gain`. Each of the steps is optional. In most cases,
the fused op is considerably more efficient than performing the same calculation
using standard PyTorch ops. It supports first and second order gradients,
but not third order gradients.
Args:
x: Input activation tensor. Can be of any shape.
b: Bias vector, or `None` to disable. Must be a 1D tensor of the same type
as `x`. The shape must be known, and it must match the dimension of `x`
corresponding to `dim`.
dim: The dimension in `x` corresponding to the elements of `b`.
The value of `dim` is ignored if `b` is not specified.
act: Name of the activation function to evaluate, or `"linear"` to disable.
Can be e.g. `"relu"`, `"lrelu"`, `"tanh"`, `"sigmoid"`, `"swish"`, etc.
See `activation_funcs` for a full list. `None` is not allowed.
alpha: Shape parameter for the activation function, or `None` to use the default.
gain: Scaling factor for the output tensor, or `None` to use default.
See `activation_funcs` for the default scaling of each activation function.
If unsure, consider specifying 1.
clamp: Clamp the output values to `[-clamp, +clamp]`, or `None` to disable
the clamping (default).
impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).
Returns:
Tensor of the same shape and datatype as `x`.
"""
assert isinstance(x, torch.Tensor)
assert impl in ['ref', 'cuda']
if impl == 'cuda' and x.device.type == 'cuda' and _init():
return _bias_act_cuda(dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp).apply(x, b)
return _bias_act_ref(x=x, b=b, dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp)
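# --- Usage sketch (not part of the original file): a hedged example of calling bias_act().
# With impl='ref' this runs entirely in standard PyTorch ops; impl='cuda' additionally requires
# the bias_act_plugin extension to compile on first use.
def _bias_act_usage_example():
    x = torch.randn(4, 64, 32, 32)
    b = torch.zeros(64)
    y = bias_act(x, b, dim=1, act='lrelu', gain=np.sqrt(2), clamp=256, impl='ref')
    return y   # same shape and dtype as x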
#----------------------------------------------------------------------------
@misc.profiled_function
def _bias_act_ref(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None):
"""Slow reference implementation of `bias_act()` using standard TensorFlow ops.
"""
assert isinstance(x, torch.Tensor)
assert clamp is None or clamp >= 0
spec = activation_funcs[act]
alpha = float(alpha if alpha is not None else spec.def_alpha)
gain = float(gain if gain is not None else spec.def_gain)
clamp = float(clamp if clamp is not None else -1)
# Add bias.
if b is not None:
assert isinstance(b, torch.Tensor) and b.ndim == 1
assert 0 <= dim < x.ndim
assert b.shape[0] == x.shape[dim]
x = x + b.reshape([-1 if i == dim else 1 for i in range(x.ndim)])
# Evaluate activation function.
alpha = float(alpha)
x = spec.func(x, alpha=alpha)
# Scale by gain.
gain = float(gain)
if gain != 1:
x = x * gain
# Clamp.
if clamp >= 0:
x = x.clamp(-clamp, clamp) # pylint: disable=invalid-unary-operand-type
return x
#----------------------------------------------------------------------------
_bias_act_cuda_cache = dict()
def _bias_act_cuda(dim=1, act='linear', alpha=None, gain=None, clamp=None):
"""Fast CUDA implementation of `bias_act()` using custom ops.
"""
# Parse arguments.
assert clamp is None or clamp >= 0
spec = activation_funcs[act]
alpha = float(alpha if alpha is not None else spec.def_alpha)
gain = float(gain if gain is not None else spec.def_gain)
clamp = float(clamp if clamp is not None else -1)
# Lookup from cache.
key = (dim, act, alpha, gain, clamp)
if key in _bias_act_cuda_cache:
return _bias_act_cuda_cache[key]
# Forward op.
class BiasActCuda(torch.autograd.Function):
@staticmethod
def forward(ctx, x, b): # pylint: disable=arguments-differ
ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride()[1] == 1 else torch.contiguous_format
x = x.contiguous(memory_format=ctx.memory_format)
b = b.contiguous() if b is not None else _null_tensor
y = x
if act != 'linear' or gain != 1 or clamp >= 0 or b is not _null_tensor:
y = _plugin.bias_act(x, b, _null_tensor, _null_tensor, _null_tensor, 0, dim, spec.cuda_idx, alpha, gain, clamp)
ctx.save_for_backward(
x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor,
b if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor,
y if 'y' in spec.ref else _null_tensor)
return y
@staticmethod
def backward(ctx, dy): # pylint: disable=arguments-differ
dy = dy.contiguous(memory_format=ctx.memory_format)
x, b, y = ctx.saved_tensors
dx = None
db = None
if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
dx = dy
if act != 'linear' or gain != 1 or clamp >= 0:
dx = BiasActCudaGrad.apply(dy, x, b, y)
if ctx.needs_input_grad[1]:
db = dx.sum([i for i in range(dx.ndim) if i != dim])
return dx, db
# Backward op.
class BiasActCudaGrad(torch.autograd.Function):
@staticmethod
def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ
ctx.memory_format = torch.channels_last if dy.ndim > 2 and dy.stride()[1] == 1 else torch.contiguous_format
dx = _plugin.bias_act(dy, b, x, y, _null_tensor, 1, dim, spec.cuda_idx, alpha, gain, clamp)
ctx.save_for_backward(
dy if spec.has_2nd_grad else _null_tensor,
x, b, y)
return dx
@staticmethod
def backward(ctx, d_dx): # pylint: disable=arguments-differ
d_dx = d_dx.contiguous(memory_format=ctx.memory_format)
dy, x, b, y = ctx.saved_tensors
d_dy = None
d_x = None
d_b = None
d_y = None
if ctx.needs_input_grad[0]:
d_dy = BiasActCudaGrad.apply(d_dx, x, b, y)
if spec.has_2nd_grad and (ctx.needs_input_grad[1] or ctx.needs_input_grad[2]):
d_x = _plugin.bias_act(d_dx, b, x, y, dy, 2, dim, spec.cuda_idx, alpha, gain, clamp)
if spec.has_2nd_grad and ctx.needs_input_grad[2]:
d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim])
return d_dy, d_x, d_b, d_y
# Add to cache.
_bias_act_cuda_cache[key] = BiasActCuda
return BiasActCuda
#----------------------------------------------------------------------------
# Copyright (c) SenseTime Research. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
"""Custom replacement for `torch.nn.functional.conv2d` that supports
arbitrarily high order gradients with zero performance penalty."""
import warnings
import contextlib
import torch
# pylint: disable=redefined-builtin
# pylint: disable=arguments-differ
# pylint: disable=protected-access
#----------------------------------------------------------------------------
enabled = False # Enable the custom op by setting this to true.
weight_gradients_disabled = False # Forcefully disable computation of gradients with respect to the weights.
@contextlib.contextmanager
def no_weight_gradients():
global weight_gradients_disabled
old = weight_gradients_disabled
weight_gradients_disabled = True
yield
weight_gradients_disabled = old
#----------------------------------------------------------------------------
def conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
if _should_use_custom_op(input):
return _conv2d_gradfix(transpose=False, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=0, dilation=dilation, groups=groups).apply(input, weight, bias)
return torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
if _should_use_custom_op(input):
return _conv2d_gradfix(transpose=True, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation).apply(input, weight, bias)
return torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation)
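# --- Usage sketch (not part of the original file): a hedged example of the drop-in wrappers
# above. With `enabled = True` on a supported PyTorch/CUDA setup they allow higher-order
# gradients; otherwise they silently fall back to torch.nn.functional. no_weight_gradients()
# skips the weight gradient, but only on the custom-op path; on the fallback it is a no-op.
def _conv2d_gradfix_usage_example():
    x = torch.randn(2, 3, 8, 8, requires_grad=True)
    w = torch.randn(4, 3, 3, 3, requires_grad=True)
    y = conv2d(x, w, padding=1)
    with no_weight_gradients():
        (g,) = torch.autograd.grad(y.sum(), [x], create_graph=True)
    return g   # gradient w.r.t. the input, usable for further differentiation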
#----------------------------------------------------------------------------
def _should_use_custom_op(input):
assert isinstance(input, torch.Tensor)
if (not enabled) or (not torch.backends.cudnn.enabled):
return False
if input.device.type != 'cuda':
return False
if any(torch.__version__.startswith(x) for x in ['1.7.', '1.8.', '1.9']):
return True
warnings.warn(f'conv2d_gradfix not supported on PyTorch {torch.__version__}. Falling back to torch.nn.functional.conv2d().')
return False
def _tuple_of_ints(xs, ndim):
xs = tuple(xs) if isinstance(xs, (tuple, list)) else (xs,) * ndim
assert len(xs) == ndim
assert all(isinstance(x, int) for x in xs)
return xs
#----------------------------------------------------------------------------
_conv2d_gradfix_cache = dict()
def _conv2d_gradfix(transpose, weight_shape, stride, padding, output_padding, dilation, groups):
# Parse arguments.
ndim = 2
weight_shape = tuple(weight_shape)
stride = _tuple_of_ints(stride, ndim)
padding = _tuple_of_ints(padding, ndim)
output_padding = _tuple_of_ints(output_padding, ndim)
dilation = _tuple_of_ints(dilation, ndim)
# Lookup from cache.
key = (transpose, weight_shape, stride, padding, output_padding, dilation, groups)
if key in _conv2d_gradfix_cache:
return _conv2d_gradfix_cache[key]
# Validate arguments.
assert groups >= 1
assert len(weight_shape) == ndim + 2
assert all(stride[i] >= 1 for i in range(ndim))
assert all(padding[i] >= 0 for i in range(ndim))
assert all(dilation[i] >= 0 for i in range(ndim))
if not transpose:
assert all(output_padding[i] == 0 for i in range(ndim))
else: # transpose
assert all(0 <= output_padding[i] < max(stride[i], dilation[i]) for i in range(ndim))
# Helpers.
common_kwargs = dict(stride=stride, padding=padding, dilation=dilation, groups=groups)
def calc_output_padding(input_shape, output_shape):
if transpose:
return [0, 0]
return [
input_shape[i + 2]
- (output_shape[i + 2] - 1) * stride[i]
- (1 - 2 * padding[i])
- dilation[i] * (weight_shape[i + 2] - 1)
for i in range(ndim)
]
# Forward & backward.
class Conv2d(torch.autograd.Function):
@staticmethod
def forward(ctx, input, weight, bias):
assert weight.shape == weight_shape
if not transpose:
output = torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, **common_kwargs)
else: # transpose
output = torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, output_padding=output_padding, **common_kwargs)
ctx.save_for_backward(input, weight)
return output
@staticmethod
def backward(ctx, grad_output):
input, weight = ctx.saved_tensors
grad_input = None
grad_weight = None
grad_bias = None
if ctx.needs_input_grad[0]:
p = calc_output_padding(input_shape=input.shape, output_shape=grad_output.shape)
grad_input = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs).apply(grad_output, weight, None)
assert grad_input.shape == input.shape
if ctx.needs_input_grad[1] and not weight_gradients_disabled:
grad_weight = Conv2dGradWeight.apply(grad_output, input)
assert grad_weight.shape == weight_shape
if ctx.needs_input_grad[2]:
grad_bias = grad_output.sum([0, 2, 3])
return grad_input, grad_weight, grad_bias
# Gradient with respect to the weights.
class Conv2dGradWeight(torch.autograd.Function):
@staticmethod
def forward(ctx, grad_output, input):
op = torch._C._jit_get_operation('aten::cudnn_convolution_backward_weight' if not transpose else 'aten::cudnn_convolution_transpose_backward_weight')
flags = [torch.backends.cudnn.benchmark, torch.backends.cudnn.deterministic, torch.backends.cudnn.allow_tf32]
grad_weight = op(weight_shape, grad_output, input, padding, stride, dilation, groups, *flags)
assert grad_weight.shape == weight_shape
ctx.save_for_backward(grad_output, input)
return grad_weight
@staticmethod
def backward(ctx, grad2_grad_weight):
grad_output, input = ctx.saved_tensors
grad2_grad_output = None
grad2_input = None
if ctx.needs_input_grad[0]:
grad2_grad_output = Conv2d.apply(input, grad2_grad_weight, None)
assert grad2_grad_output.shape == grad_output.shape
if ctx.needs_input_grad[1]:
p = calc_output_padding(input_shape=input.shape, output_shape=grad_output.shape)
grad2_input = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs).apply(grad_output, grad2_grad_weight, None)
assert grad2_input.shape == input.shape
return grad2_grad_output, grad2_input
_conv2d_gradfix_cache[key] = Conv2d
return Conv2d
#----------------------------------------------------------------------------
# Copyright (c) SenseTime Research. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
"""2D convolution with optional up/downsampling."""
import torch
from .. import misc
from . import conv2d_gradfix
from . import upfirdn2d
from .upfirdn2d import _parse_padding
from .upfirdn2d import _get_filter_size
#----------------------------------------------------------------------------
def _get_weight_shape(w):
with misc.suppress_tracer_warnings(): # this value will be treated as a constant
shape = [int(sz) for sz in w.shape]
misc.assert_shape(w, shape)
return shape
#----------------------------------------------------------------------------
def _conv2d_wrapper(x, w, stride=1, padding=0, groups=1, transpose=False, flip_weight=True):
"""Wrapper for the underlying `conv2d()` and `conv_transpose2d()` implementations.
"""
out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
# Flip weight if requested.
if not flip_weight: # conv2d() actually performs correlation (flip_weight=True) not convolution (flip_weight=False).
w = w.flip([2, 3])
# Workaround performance pitfall in cuDNN 8.0.5, triggered when using
# 1x1 kernel + memory_format=channels_last + less than 64 channels.
if kw == 1 and kh == 1 and stride == 1 and padding in [0, [0, 0], (0, 0)] and not transpose:
if x.stride()[1] == 1 and min(out_channels, in_channels_per_group) < 64:
if out_channels <= 4 and groups == 1:
in_shape = x.shape
x = w.squeeze(3).squeeze(2) @ x.reshape([in_shape[0], in_channels_per_group, -1])
x = x.reshape([in_shape[0], out_channels, in_shape[2], in_shape[3]])
else:
x = x.to(memory_format=torch.contiguous_format)
w = w.to(memory_format=torch.contiguous_format)
x = conv2d_gradfix.conv2d(x, w, groups=groups)
return x.to(memory_format=torch.channels_last)
# Otherwise => execute using conv2d_gradfix.
op = conv2d_gradfix.conv_transpose2d if transpose else conv2d_gradfix.conv2d
return op(x, w, stride=stride, padding=padding, groups=groups)
#----------------------------------------------------------------------------
@misc.profiled_function
def conv2d_resample(x, w, f=None, up=1, down=1, padding=0, groups=1, flip_weight=True, flip_filter=False):
r"""2D convolution with optional up/downsampling.
Padding is performed only once at the beginning, not between the operations.
Args:
x: Input tensor of shape
`[batch_size, in_channels, in_height, in_width]`.
w: Weight tensor of shape
`[out_channels, in_channels//groups, kernel_height, kernel_width]`.
f: Low-pass filter for up/downsampling. Must be prepared beforehand by
calling upfirdn2d.setup_filter(). None = identity (default).
up: Integer upsampling factor (default: 1).
down: Integer downsampling factor (default: 1).
padding: Padding with respect to the upsampled image. Can be a single number
or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
(default: 0).
groups: Split input channels into N groups (default: 1).
flip_weight: False = convolution, True = correlation (default: True).
flip_filter: False = convolution, True = correlation (default: False).
Returns:
Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
"""
# Validate arguments.
assert isinstance(x, torch.Tensor) and (x.ndim == 4)
assert isinstance(w, torch.Tensor) and (w.ndim == 4) and (w.dtype == x.dtype)
assert f is None or (isinstance(f, torch.Tensor) and f.ndim in [1, 2] and f.dtype == torch.float32)
assert isinstance(up, int) and (up >= 1)
assert isinstance(down, int) and (down >= 1)
assert isinstance(groups, int) and (groups >= 1)
out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
fw, fh = _get_filter_size(f)
px0, px1, py0, py1 = _parse_padding(padding)
# Adjust padding to account for up/downsampling.
if up > 1:
px0 += (fw + up - 1) // 2
px1 += (fw - up) // 2
py0 += (fh + up - 1) // 2
py1 += (fh - up) // 2
if down > 1:
px0 += (fw - down + 1) // 2
px1 += (fw - down) // 2
py0 += (fh - down + 1) // 2
py1 += (fh - down) // 2
# Fast path: 1x1 convolution with downsampling only => downsample first, then convolve.
if kw == 1 and kh == 1 and (down > 1 and up == 1):
x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, padding=[px0,px1,py0,py1], flip_filter=flip_filter)
x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
return x
# Fast path: 1x1 convolution with upsampling only => convolve first, then upsample.
if kw == 1 and kh == 1 and (up > 1 and down == 1):
x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
x = upfirdn2d.upfirdn2d(x=x, f=f, up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter)
return x
# Fast path: downsampling only => use strided convolution.
if down > 1 and up == 1:
x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0,px1,py0,py1], flip_filter=flip_filter)
x = _conv2d_wrapper(x=x, w=w, stride=down, groups=groups, flip_weight=flip_weight)
return x
# Fast path: upsampling with optional downsampling => use transpose strided convolution.
if up > 1:
if groups == 1:
w = w.transpose(0, 1)
else:
w = w.reshape(groups, out_channels // groups, in_channels_per_group, kh, kw)
w = w.transpose(1, 2)
w = w.reshape(groups * in_channels_per_group, out_channels // groups, kh, kw)
px0 -= kw - 1
px1 -= kw - up
py0 -= kh - 1
py1 -= kh - up
pxt = max(min(-px0, -px1), 0)
pyt = max(min(-py0, -py1), 0)
x = _conv2d_wrapper(x=x, w=w, stride=up, padding=[pyt,pxt], groups=groups, transpose=True, flip_weight=(not flip_weight))
x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0+pxt,px1+pxt,py0+pyt,py1+pyt], gain=up**2, flip_filter=flip_filter)
if down > 1:
x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
return x
# Fast path: no up/downsampling, padding supported by the underlying implementation => use plain conv2d.
if up == 1 and down == 1:
if px0 == px1 and py0 == py1 and px0 >= 0 and py0 >= 0:
return _conv2d_wrapper(x=x, w=w, padding=[py0,px0], groups=groups, flip_weight=flip_weight)
# Fallback: Generic reference implementation.
x = upfirdn2d.upfirdn2d(x=x, f=(f if up > 1 else None), up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter)
x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
if down > 1:
x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
return x
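# --- Usage sketch (not part of the original file): a hedged example of a 2x upsampled 3x3
# convolution. `f` is assumed to be prepared with upfirdn2d.setup_filter() as the docstring
# above requires; the helper name is taken from that docstring, not verified here.
def _conv2d_resample_usage_example():
    x = torch.randn(1, 16, 32, 32)
    w = torch.randn(32, 16, 3, 3, dtype=x.dtype)
    f = upfirdn2d.setup_filter([1, 3, 3, 1])
    y = conv2d_resample(x=x, w=w, f=f, up=2, padding=1)
    return y   # expected shape: [1, 32, 64, 64]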
#----------------------------------------------------------------------------
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "filtered_lrelu.h"
//------------------------------------------------------------------------
static std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu(
torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b, torch::Tensor si,
int up, int down, int px0, int px1, int py0, int py1, int sx, int sy, float gain, float slope, float clamp, bool flip_filters, bool writeSigns)
{
// Set CUDA device.
TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
// Validate arguments.
TORCH_CHECK(fu.device() == x.device() && fd.device() == x.device() && b.device() == x.device(), "all input tensors must reside on the same device");
TORCH_CHECK(fu.dtype() == torch::kFloat && fd.dtype() == torch::kFloat, "fu and fd must be float32");
TORCH_CHECK(b.dtype() == x.dtype(), "x and b must have the same dtype");
TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat, "x and b must be float16 or float32");
TORCH_CHECK(x.dim() == 4, "x must be rank 4");
TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX && x.size(3) <= INT_MAX, "x is too large");
TORCH_CHECK(x.numel() > 0, "x is empty");
TORCH_CHECK((fu.dim() == 1 || fu.dim() == 2) && (fd.dim() == 1 || fd.dim() == 2), "fu and fd must be rank 1 or 2");
TORCH_CHECK(fu.size(0) <= INT_MAX && fu.size(-1) <= INT_MAX, "fu is too large");
TORCH_CHECK(fd.size(0) <= INT_MAX && fd.size(-1) <= INT_MAX, "fd is too large");
TORCH_CHECK(fu.numel() > 0, "fu is empty");
TORCH_CHECK(fd.numel() > 0, "fd is empty");
TORCH_CHECK(b.dim() == 1 && b.size(0) == x.size(1), "b must be a vector with the same number of channels as x");
TORCH_CHECK(up >= 1 && down >= 1, "up and down must be at least 1");
// Figure out how much shared memory is available on the device.
int maxSharedBytes = 0;
AT_CUDA_CHECK(cudaDeviceGetAttribute(&maxSharedBytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, x.device().index()));
int sharedKB = maxSharedBytes >> 10;
// Populate enough launch parameters to check if a CUDA kernel exists.
filtered_lrelu_kernel_params p;
p.up = up;
p.down = down;
p.fuShape = make_int2((int)fu.size(-1), fu.dim() == 2 ? (int)fu.size(0) : 0); // shape [n, 0] indicates separable filter.
p.fdShape = make_int2((int)fd.size(-1), fd.dim() == 2 ? (int)fd.size(0) : 0);
filtered_lrelu_kernel_spec test_spec = choose_filtered_lrelu_kernel<float, int32_t, false, false>(p, sharedKB);
if (!test_spec.exec)
{
// No kernel found - return empty tensors and indicate missing kernel with return code of -1.
return std::make_tuple(torch::Tensor(), torch::Tensor(), -1);
}
// Input/output element size.
int64_t sz = (x.dtype() == torch::kHalf) ? 2 : 4;
// Input sizes.
int64_t xw = (int)x.size(3);
int64_t xh = (int)x.size(2);
int64_t fut_w = (int)fu.size(-1) - 1;
int64_t fut_h = (int)fu.size(0) - 1;
int64_t fdt_w = (int)fd.size(-1) - 1;
int64_t fdt_h = (int)fd.size(0) - 1;
// Logical size of upsampled buffer.
int64_t cw = xw * up + (px0 + px1) - fut_w;
int64_t ch = xh * up + (py0 + py1) - fut_h;
TORCH_CHECK(cw > fdt_w && ch > fdt_h, "upsampled buffer must be at least the size of downsampling filter");
TORCH_CHECK(cw <= INT_MAX && ch <= INT_MAX, "upsampled buffer is too large");
// Compute output size and allocate.
int64_t yw = (cw - fdt_w + (down - 1)) / down;
int64_t yh = (ch - fdt_h + (down - 1)) / down;
TORCH_CHECK(yw > 0 && yh > 0, "output must be at least 1x1");
TORCH_CHECK(yw <= INT_MAX && yh <= INT_MAX, "output is too large");
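// Illustrative worked example (assumed sizes, not normative): with xw = xh = 16, up = down = 2,
// px0 = px1 = py0 = py1 = 5 and 12-tap filters (fut_w = fut_h = fdt_w = fdt_h = 11), the upsampled
// buffer is cw = ch = 16*2 + 10 - 11 = 31 and the output is yw = yh = (31 - 11 + 1) / 2 = 10.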
torch::Tensor y = torch::empty({x.size(0), x.size(1), yh, yw}, x.options(), x.suggest_memory_format());
// Allocate sign tensor.
torch::Tensor so;
torch::Tensor s = si;
bool readSigns = !!s.numel();
int64_t sw_active = 0; // Active width of sign tensor.
if (writeSigns)
{
sw_active = yw * down - (down - 1) + fdt_w; // Active width in elements.
int64_t sh = yh * down - (down - 1) + fdt_h; // Height = active height.
int64_t sw = (sw_active + 15) & ~15; // Width = active width in elements, rounded up to multiple of 16.
TORCH_CHECK(sh <= INT_MAX && (sw >> 2) <= INT_MAX, "signs is too large");
s = so = torch::empty({x.size(0), x.size(1), sh, sw >> 2}, x.options().dtype(torch::kUInt8), at::MemoryFormat::Contiguous);
}
else if (readSigns)
sw_active = s.size(3) << 2;
// Validate sign tensor if in use.
if (readSigns || writeSigns)
{
TORCH_CHECK(s.is_contiguous(), "signs must be contiguous");
TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8");
TORCH_CHECK(s.device() == x.device(), "signs must reside on the same device as x");
TORCH_CHECK(s.dim() == 4, "signs must be rank 4");
TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1), "signs must have same batch & channels as x");
TORCH_CHECK(s.size(2) <= INT_MAX && s.size(3) <= INT_MAX, "signs is too large");
}
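// Illustrative worked example (continuing the assumed sizes above): with yw = yh = 10, down = 2 and
// fdt_w = fdt_h = 11, writeSigns gives sw_active = 10*2 - 1 + 11 = 30 and sh = 30, rounds the width
// up to sw = 32 elements, and stores 32 >> 2 = 8 bytes per row (four 2-bit sign codes per byte),
// i.e. the sign tensor has shape [N, C, 30, 8].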
// Populate rest of CUDA kernel parameters.
p.x = x.data_ptr();
p.y = y.data_ptr();
p.b = b.data_ptr();
p.s = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0;
p.fu = fu.data_ptr<float>();
p.fd = fd.data_ptr<float>();
p.pad0 = make_int2(px0, py0);
p.gain = gain;
p.slope = slope;
p.clamp = clamp;
p.flip = (flip_filters) ? 1 : 0;
p.xShape = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
p.yShape = make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));
p.sShape = (readSigns || writeSigns) ? make_int2((int)s.size(3), (int)s.size(2)) : make_int2(0, 0); // Width is in bytes. Contiguous.
p.sOfs = make_int2(sx, sy);
p.swLimit = (sw_active + 3) >> 2; // Rounded up to bytes.
// x, y, b strides are in bytes.
p.xStride = make_longlong4(sz * x.stride(3), sz * x.stride(2), sz * x.stride(1), sz * x.stride(0));
p.yStride = make_longlong4(sz * y.stride(3), sz * y.stride(2), sz * y.stride(1), sz * y.stride(0));
p.bStride = sz * b.stride(0);
// fu, fd strides are in elements.
p.fuStride = make_longlong3(fu.stride(-1), fu.dim() == 2 ? fu.stride(0) : 0, 0);
p.fdStride = make_longlong3(fd.stride(-1), fd.dim() == 2 ? fd.stride(0) : 0, 0);
// Determine if indices don't fit in int32. Support negative strides although Torch currently never produces those.
bool index64b = false;
if (std::abs(p.bStride * x.size(1)) > INT_MAX) index64b = true;
if (std::min(x.size(0) * p.xStride.w, 0ll) + std::min(x.size(1) * p.xStride.z, 0ll) + std::min(x.size(2) * p.xStride.y, 0ll) + std::min(x.size(3) * p.xStride.x, 0ll) < -INT_MAX) index64b = true;
if (std::max(x.size(0) * p.xStride.w, 0ll) + std::max(x.size(1) * p.xStride.z, 0ll) + std::max(x.size(2) * p.xStride.y, 0ll) + std::max(x.size(3) * p.xStride.x, 0ll) > INT_MAX) index64b = true;
if (std::min(y.size(0) * p.yStride.w, 0ll) + std::min(y.size(1) * p.yStride.z, 0ll) + std::min(y.size(2) * p.yStride.y, 0ll) + std::min(y.size(3) * p.yStride.x, 0ll) < -INT_MAX) index64b = true;
if (std::max(y.size(0) * p.yStride.w, 0ll) + std::max(y.size(1) * p.yStride.z, 0ll) + std::max(y.size(2) * p.yStride.y, 0ll) + std::max(y.size(3) * p.yStride.x, 0ll) > INT_MAX) index64b = true;
if (s.numel() > INT_MAX) index64b = true;
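// Illustrative example (assumed shape): for a contiguous float32 x of shape [8, 512, 512, 512],
// p.xStride.w = 4 * 512*512*512 = 536,870,912 bytes and x.size(0) * p.xStride.w = 4,294,967,296 bytes
// > INT_MAX, so index64b is set and the int64_t kernel variants are selected below.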
// Choose CUDA kernel.
filtered_lrelu_kernel_spec spec = { 0 };
AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "filtered_lrelu_cuda", [&]
{
if constexpr (sizeof(scalar_t) <= 4) // Exclude doubles. constexpr prevents template instantiation.
{
// Choose kernel based on index type, datatype and sign read/write modes.
if (!index64b && writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, true, false>(p, sharedKB);
else if (!index64b && !writeSigns && readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, true >(p, sharedKB);
else if (!index64b && !writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, false>(p, sharedKB);
else if ( index64b && writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, true, false>(p, sharedKB);
else if ( index64b && !writeSigns && readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, true >(p, sharedKB);
else if ( index64b && !writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, false>(p, sharedKB);
}
});
TORCH_CHECK(spec.exec, "internal error - CUDA kernel not found"); // This should not happen because we tested earlier that the kernel exists.
// Launch CUDA kernel.
void* args[] = {&p};
int bx = spec.numWarps * 32;
int gx = (p.yShape.x - 1) / spec.tileOut.x + 1;
int gy = (p.yShape.y - 1) / spec.tileOut.y + 1;
int gz = p.yShape.z * p.yShape.w;
// Repeat multiple horizontal tiles in a CTA?
if (spec.xrep)
{
p.tilesXrep = spec.xrep;
p.tilesXdim = gx;
gx = (gx + p.tilesXrep - 1) / p.tilesXrep;
std::swap(gx, gy);
}
else
{
p.tilesXrep = 0;
p.tilesXdim = 0;
}
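// Illustrative worked example (assumed shapes): for y of shape [8, 512, 128, 128]
// (yShape = {x:128, y:128, z:512, w:8}) and a specialization with tileOut = (56, 29), numWarps = 16,
// xrep = 11: bx = 512, gx = ceil(128/56) = 3, gy = ceil(128/29) = 5, gz = 512*8 = 4096. The xrep path
// then folds the 3 horizontal tiles into one block (gx = ceil(3/11) = 1) and swaps gx/gy, so the
// launch grid is dim3(5, 1, 4096) of 512-thread blocks.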
// Launch filter setup kernel.
AT_CUDA_CHECK(cudaLaunchKernel(spec.setup, 1, 1024, args, 0, at::cuda::getCurrentCUDAStream()));
// Copy kernels to constant memory.
if ( writeSigns && !readSigns) AT_CUDA_CHECK((copy_filters<true, false>(at::cuda::getCurrentCUDAStream())));
else if (!writeSigns && readSigns) AT_CUDA_CHECK((copy_filters<false, true >(at::cuda::getCurrentCUDAStream())));
else if (!writeSigns && !readSigns) AT_CUDA_CHECK((copy_filters<false, false>(at::cuda::getCurrentCUDAStream())));
// Set cache and shared memory configurations for main kernel.
AT_CUDA_CHECK(cudaFuncSetCacheConfig(spec.exec, cudaFuncCachePreferShared));
if (spec.dynamicSharedKB) // Need dynamically allocated shared memory?
AT_CUDA_CHECK(cudaFuncSetAttribute(spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize, spec.dynamicSharedKB << 10));
AT_CUDA_CHECK(cudaFuncSetSharedMemConfig(spec.exec, cudaSharedMemBankSizeFourByte));
// Launch main kernel.
const int maxSubGz = 65535; // CUDA maximum for block z dimension.
for (int zofs=0; zofs < gz; zofs += maxSubGz) // Do multiple launches if gz is too big.
{
p.blockZofs = zofs;
int subGz = std::min(maxSubGz, gz - zofs);
AT_CUDA_CHECK(cudaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args, spec.dynamicSharedKB << 10, at::cuda::getCurrentCUDAStream()));
}
// Done.
return std::make_tuple(y, so, 0);
}
//------------------------------------------------------------------------
static torch::Tensor filtered_lrelu_act(torch::Tensor x, torch::Tensor si, int sx, int sy, float gain, float slope, float clamp, bool writeSigns)
{
// Set CUDA device.
TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
// Validate arguments.
TORCH_CHECK(x.dim() == 4, "x must be rank 4");
TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX && x.size(3) <= INT_MAX, "x is too large");
TORCH_CHECK(x.numel() > 0, "x is empty");
TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat || x.dtype() == torch::kDouble, "x must be float16, float32 or float64");
// Output signs if we don't have sign input.
torch::Tensor so;
torch::Tensor s = si;
bool readSigns = !!s.numel();
if (writeSigns)
{
int64_t sw = x.size(3);
sw = (sw + 15) & ~15; // Round to a multiple of 16 for coalescing.
s = so = torch::empty({x.size(0), x.size(1), x.size(2), sw >> 2}, x.options().dtype(torch::kUInt8), at::MemoryFormat::Contiguous);
}
// Validate sign tensor if in use.
if (readSigns || writeSigns)
{
TORCH_CHECK(s.is_contiguous(), "signs must be contiguous");
TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8");
TORCH_CHECK(s.device() == x.device(), "signs must reside on the same device as x");
TORCH_CHECK(s.dim() == 4, "signs must be rank 4");
TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1), "signs must have same batch & channels as x");
TORCH_CHECK(s.size(2) <= INT_MAX && (s.size(3) << 2) <= INT_MAX, "signs tensor is too large");
}
// Initialize CUDA kernel parameters.
filtered_lrelu_act_kernel_params p;
p.x = x.data_ptr();
p.s = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0;
p.gain = gain;
p.slope = slope;
p.clamp = clamp;
p.xShape = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
p.xStride = make_longlong4(x.stride(3), x.stride(2), x.stride(1), x.stride(0));
p.sShape = (readSigns || writeSigns) ? make_int2((int)s.size(3) << 2, (int)s.size(2)) : make_int2(0, 0); // Width is in elements. Contiguous.
p.sOfs = make_int2(sx, sy);
// Choose CUDA kernel.
void* func = 0;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "filtered_lrelu_act_cuda", [&]
{
if (writeSigns)
func = choose_filtered_lrelu_act_kernel<scalar_t, true, false>();
else if (readSigns)
func = choose_filtered_lrelu_act_kernel<scalar_t, false, true>();
else
func = choose_filtered_lrelu_act_kernel<scalar_t, false, false>();
});
TORCH_CHECK(func, "internal error - CUDA kernel not found");
// Launch CUDA kernel.
void* args[] = {&p};
int bx = 128; // 4 warps per block.
// Logical size of launch = writeSigns ? p.s : p.x
uint32_t gx = writeSigns ? p.sShape.x : p.xShape.x;
uint32_t gy = writeSigns ? p.sShape.y : p.xShape.y;
uint32_t gz = p.xShape.z * p.xShape.w; // Same as in p.sShape if signs are in use.
gx = (gx - 1) / bx + 1;
// Make sure grid y and z dimensions are within CUDA launch limits. Kernel loops internally to do the rest.
const uint32_t gmax = 65535;
gy = std::min(gy, gmax);
gz = std::min(gz, gmax);
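// Illustrative example (assumed shape): for x of shape [8, 512, 512, 512] with writeSigns, the sign
// tensor is [8, 512, 512, 128] uint8, so p.sShape = (512, 512); then gx = ceil(512/128) = 4,
// gy = min(512, 65535) = 512 and gz = min(512*8, 65535) = 4096, i.e. the launch is dim3(4, 512, 4096).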
// Launch.
AT_CUDA_CHECK(cudaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0, at::cuda::getCurrentCUDAStream()));
return so;
}
//------------------------------------------------------------------------
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("filtered_lrelu", &filtered_lrelu); // The whole thing.
m.def("filtered_lrelu_act_", &filtered_lrelu_act); // Activation and sign tensor handling only. Modifies data tensor in-place.
}
//------------------------------------------------------------------------
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <c10/util/Half.h>
#include "filtered_lrelu.h"
#include <cstdint>
//------------------------------------------------------------------------
// Helpers.
enum // Filter modes.
{
MODE_SUSD = 0, // Separable upsampling, separable downsampling.
MODE_FUSD = 1, // Full upsampling, separable downsampling.
MODE_SUFD = 2, // Separable upsampling, full downsampling.
MODE_FUFD = 3, // Full upsampling, full downsampling.
};
template <class T> struct InternalType;
template <> struct InternalType<double>
{
typedef double scalar_t; typedef double2 vec2_t; typedef double4 vec4_t;
__device__ __forceinline__ static vec2_t zero_vec2(void) { return make_double2(0, 0); }
__device__ __forceinline__ static vec4_t zero_vec4(void) { return make_double4(0, 0, 0, 0); }
__device__ __forceinline__ static double clamp(double x, double c) { return fmin(fmax(x, -c), c); }
};
template <> struct InternalType<float>
{
typedef float scalar_t; typedef float2 vec2_t; typedef float4 vec4_t;
__device__ __forceinline__ static vec2_t zero_vec2(void) { return make_float2(0, 0); }
__device__ __forceinline__ static vec4_t zero_vec4(void) { return make_float4(0, 0, 0, 0); }
__device__ __forceinline__ static float clamp(float x, float c) { return fminf(fmaxf(x, -c), c); }
};
template <> struct InternalType<c10::Half>
{
typedef float scalar_t; typedef float2 vec2_t; typedef float4 vec4_t;
__device__ __forceinline__ static vec2_t zero_vec2(void) { return make_float2(0, 0); }
__device__ __forceinline__ static vec4_t zero_vec4(void) { return make_float4(0, 0, 0, 0); }
__device__ __forceinline__ static float clamp(float x, float c) { return fminf(fmaxf(x, -c), c); }
};
#define MIN(A, B) ((A) < (B) ? (A) : (B))
#define MAX(A, B) ((A) > (B) ? (A) : (B))
#define CEIL_DIV(A, B) (((B)==1) ? (A) : \
((B)==2) ? ((int)((A)+1) >> 1) : \
((B)==4) ? ((int)((A)+3) >> 2) : \
(((A) + ((A) > 0 ? (B) - 1 : 0)) / (B)))
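// Illustrative compile-time sanity checks (assumed example values; they have no effect on the kernels).
// CEIL_DIV computes ceil(A/B): B in {2, 4} uses shifts, B == 1 is the identity, other B falls back to
// adjusted integer division.
static_assert(CEIL_DIV(5, 2) == 3, "CEIL_DIV(5, 2) rounds up to 3");
static_assert(CEIL_DIV(7, 4) == 2, "CEIL_DIV(7, 4) rounds up to 2");
static_assert(CEIL_DIV(9, 3) == 3, "CEIL_DIV(9, 3) is exact");
static_assert(CEIL_DIV(8, 1) == 8, "division by 1 is the identity");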
// Fast integer division/modulo by the template constant N. The reciprocal approximation below is valid for N <= 256 and i < N*256; powers of two (of any size) and larger N fall back to ordinary division.
template <int N> __device__ __forceinline__ void fast_div_mod(int& x, int& y, unsigned int i)
{
if ((N & (N-1)) && N <= 256)
y = (i * ((1<<24)/N + 1)) >> 24; // Assumes N <= 256, i < N*256.
else
y = i/N;
x = i - y*N;
}
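// Illustrative example (assumed values): with N = 56 (not a power of two, <= 256) and i = 130, the
// reciprocal path gives y = (130 * (16777216/56 + 1)) >> 24 = 2 and x = 130 - 2*56 = 18,
// matching 130 = 2*56 + 18.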
// Type cast stride before reading it.
template <class T> __device__ __forceinline__ T get_stride(const int64_t& x)
{
return *reinterpret_cast<const T*>(&x);
}
//------------------------------------------------------------------------
// Filters, setup kernel, copying function.
#define MAX_FILTER_SIZE 32
// Combined up/down filter buffers so that transfer can be done with one copy.
__device__ float g_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE]; // Filters in global memory, written by setup kernel.
__device__ __constant__ float c_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE]; // Filters in constant memory, read by main kernel.
// Accessors to combined buffers to index up/down filters individually.
#define c_fu (c_fbuf)
#define c_fd (c_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)
#define g_fu (g_fbuf)
#define g_fd (g_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)
// Set up filters into global memory buffer.
static __global__ void setup_filters_kernel(filtered_lrelu_kernel_params p)
{
for (int idx = threadIdx.x; idx < MAX_FILTER_SIZE * MAX_FILTER_SIZE; idx += blockDim.x)
{
int x, y;
fast_div_mod<MAX_FILTER_SIZE>(x, y, idx);
int fu_x = p.flip ? x : (p.fuShape.x - 1 - x);
int fu_y = p.flip ? y : (p.fuShape.y - 1 - y);
if (p.fuShape.y > 0)
g_fu[idx] = (x >= p.fuShape.x || y >= p.fuShape.y) ? 0.0f : p.fu[fu_x * p.fuStride.x + fu_y * p.fuStride.y];
else
g_fu[idx] = (x >= p.fuShape.x || y > 0) ? 0.0f : p.fu[fu_x * p.fuStride.x];
int fd_x = p.flip ? x : (p.fdShape.x - 1 - x);
int fd_y = p.flip ? y : (p.fdShape.y - 1 - y);
if (p.fdShape.y > 0)
g_fd[idx] = (x >= p.fdShape.x || y >= p.fdShape.y) ? 0.0f : p.fd[fd_x * p.fdStride.x + fd_y * p.fdStride.y];
else
g_fd[idx] = (x >= p.fdShape.x || y > 0) ? 0.0f : p.fd[fd_x * p.fdStride.x];
}
}
// Host function to copy filters written by setup kernel into constant buffer for main kernel.
template <bool, bool> static cudaError_t copy_filters(cudaStream_t stream)
{
void* src = 0;
cudaError_t err = cudaGetSymbolAddress(&src, g_fbuf);
if (err) return err;
return cudaMemcpyToSymbolAsync(c_fbuf, src, 2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream);
}
//------------------------------------------------------------------------
// Coordinate spaces:
// - Relative to input tensor: inX, inY, tileInX, tileInY
// - Relative to input tile: relInX, relInY, tileInW, tileInH
// - Relative to upsampled tile: relUpX, relUpY, tileUpW, tileUpH
// - Relative to output tile: relOutX, relOutY, tileOutW, tileOutH
// - Relative to output tensor: outX, outY, tileOutX, tileOutY
//
// Relationships between coordinate spaces:
// - inX = tileInX + relInX
// - inY = tileInY + relInY
// - relUpX = relInX * up + phaseInX
// - relUpY = relInY * up + phaseInY
// - relUpX = relOutX * down
// - relUpY = relOutY * down
// - outX = tileOutX + relOutX
// - outY = tileOutY + relOutY
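// Illustrative worked example (assumed values): with up = down = 2, tileOutW = 56 and p.pad0.x = 5,
// the horizontal tile at tileX = 1 has tileOutX = 56, tmpX = 56*2 - 5 = 107, tileInX = ceil(107/2) = 54
// and phaseInX = 54*2 - 107 = 1; so relUpX = relInX*2 + 1 for the contributing input columns, and
// output column relOutX reads upsampled column relUpX = relOutX*2.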
extern __shared__ char s_buf_raw[]; // When sharedKB <= 48, allocate shared memory statically inside the kernel, otherwise use the externally allocated shared memory buffer.
template <class T, class index_t, int sharedKB, bool signWrite, bool signRead, int filterMode, int up, int fuSize, int down, int fdSize, int tileOutW, int tileOutH, int threadsPerBlock, bool enableXrep, bool enableWriteSkip>
static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p)
{
// Check that we don't try to support non-existing filter modes.
static_assert(up == 1 || up == 2 || up == 4, "only up=1, up=2, up=4 scales supported");
static_assert(down == 1 || down == 2 || down == 4, "only down=1, down=2, down=4 scales supported");
static_assert(fuSize >= up, "upsampling filter size must be at least upsampling factor");
static_assert(fdSize >= down, "downsampling filter size must be at least downsampling factor");
static_assert(fuSize % up == 0, "upsampling filter size must be divisible by the upsampling factor");
static_assert(fdSize % down == 0, "downsampling filter size must be divisible by the downsampling factor");
static_assert(fuSize <= MAX_FILTER_SIZE && fdSize <= MAX_FILTER_SIZE, "filter size greater than MAX_FILTER_SIZE");
static_assert(up != 1 || (fuSize == 1 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)), "up=1 supported only for 1x1 full filters");
static_assert(down != 1 || (fdSize == 1 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)), "down=1 supported only for 1x1 full filters");
static_assert(!(up == 4 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)), "full filters not supported for up=4");
static_assert(!(down == 4 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)), "full filters not supported for down=4");
// Static definitions.
typedef typename InternalType<T>::scalar_t scalar_t;
typedef typename InternalType<T>::vec2_t vec2_t;
typedef typename InternalType<T>::vec4_t vec4_t;
const int tileUpW = (tileOutW * down + (fdSize - 1) - (down - 1) + 3) & ~3; // Upsampled tile width, rounded up to multiple of 4.
const int tileUpH = tileOutH * down + (fdSize - 1) - (down - 1); // Upsampled tile height.
const int tileInW = CEIL_DIV(tileUpW + (fuSize - 1), up); // Input tile width.
const int tileInH = CEIL_DIV(tileUpH + (fuSize - 1), up); // Input tile height.
const int tileUpH_up = CEIL_DIV(tileUpH, up) * up; // Upsampled tile height rounded up to a multiple of up.
const int tileInH_up = CEIL_DIV(tileUpH_up + (fuSize - 1), up); // For allocations only, to avoid shared memory read overruns with up=2 and up=4.
// Merge 1x1 downsampling into last upsampling step for upf1 and ups2.
const bool downInline = (down == 1) && ((up == 1 && filterMode == MODE_FUFD) || (up == 2 && filterMode == MODE_SUFD));
// Sizes of logical buffers.
const int szIn = tileInH_up * tileInW;
const int szUpX = tileInH_up * tileUpW;
const int szUpXY = downInline ? 0 : (tileUpH * tileUpW);
const int szDownX = tileUpH * tileOutW;
// Sizes for shared memory arrays.
const int s_buf0_size_base =
(filterMode == MODE_SUSD) ? MAX(szIn, szUpXY) :
(filterMode == MODE_FUSD) ? MAX(szIn, szDownX) :
(filterMode == MODE_SUFD) ? MAX(szIn, szUpXY) :
(filterMode == MODE_FUFD) ? szIn :
-1;
const int s_buf1_size_base =
(filterMode == MODE_SUSD) ? MAX(szUpX, szDownX) :
(filterMode == MODE_FUSD) ? szUpXY :
(filterMode == MODE_SUFD) ? szUpX :
(filterMode == MODE_FUFD) ? szUpXY :
-1;
// Ensure U128 alignment.
const int s_buf0_size = (s_buf0_size_base + 3) & ~3;
const int s_buf1_size = (s_buf1_size_base + 3) & ~3;
// Check at compile time that we don't use too much shared memory.
static_assert((s_buf0_size + s_buf1_size) * sizeof(scalar_t) <= (sharedKB << 10), "shared memory overflow");
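// Illustrative worked example (one of the listed specializations, up = down = 2, 8-tap separable,
// tileOutW = 56, tileOutH = 29): tileUpW = (112 + 7 - 1 + 3) & ~3 = 120, tileUpH = 64, tileInW = 64,
// tileInH = 36, so szIn = 2304, szUpX = 4320, szUpXY = 7680, szDownX = 3584; the two buffers then hold
// max(2304, 7680) + max(4320, 3584) = 12000 floats = 48000 bytes, just inside the 48 kB static limit.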
// Declare shared memory arrays.
scalar_t* s_buf0;
scalar_t* s_buf1;
if (sharedKB <= 48)
{
// Allocate shared memory arrays here.
__shared__ scalar_t s_buf0_st[(sharedKB > 48) ? (1<<24) : (s_buf0_size + s_buf1_size)]; // Prevent launching if this isn't optimized away when unused.
s_buf0 = s_buf0_st;
s_buf1 = s_buf0 + s_buf0_size;
}
else
{
// Use the dynamically allocated shared memory array.
s_buf0 = (scalar_t*)s_buf_raw;
s_buf1 = s_buf0 + s_buf0_size;
}
// Pointers to the buffers.
scalar_t* s_tileIn; // Input tile: [relInX * tileInH + relInY]
scalar_t* s_tileUpX; // After horizontal upsampling: [relInY * tileUpW + relUpX]
scalar_t* s_tileUpXY; // After upsampling: [relUpY * tileUpW + relUpX]
scalar_t* s_tileDownX; // After horizontal downsampling: [relUpY * tileOutW + relOutX]
if (filterMode == MODE_SUSD)
{
s_tileIn = s_buf0;
s_tileUpX = s_buf1;
s_tileUpXY = s_buf0;
s_tileDownX = s_buf1;
}
else if (filterMode == MODE_FUSD)
{
s_tileIn = s_buf0;
s_tileUpXY = s_buf1;
s_tileDownX = s_buf0;
}
else if (filterMode == MODE_SUFD)
{
s_tileIn = s_buf0;
s_tileUpX = s_buf1;
s_tileUpXY = s_buf0;
}
else if (filterMode == MODE_FUFD)
{
s_tileIn = s_buf0;
s_tileUpXY = s_buf1;
}
// Allow large grids in z direction via per-launch offset.
int channelIdx = blockIdx.z + p.blockZofs;
int batchIdx = channelIdx / p.yShape.z;
channelIdx -= batchIdx * p.yShape.z;
// Offset to output feature map. In bytes.
index_t mapOfsOut = channelIdx * get_stride<index_t>(p.yStride.z) + batchIdx * get_stride<index_t>(p.yStride.w);
// Sign shift amount.
uint32_t signXo = ((threadIdx.x + p.sOfs.x) << 1) & 6;
// Inner tile loop.
#pragma unroll 1
for (int tileIdx = 0; !enableXrep || (tileIdx < MIN(p.tilesXrep, p.tilesXdim - p.tilesXrep * blockIdx.y)); tileIdx++)
{
// Locate output tile.
int tileX = enableXrep ? blockIdx.y * p.tilesXrep + tileIdx : blockIdx.x;
int tileOutX = tileX * tileOutW;
int tileOutY = (enableXrep ? blockIdx.x : blockIdx.y) * tileOutH;
// Locate input tile.
int tmpX = tileOutX * down - p.pad0.x;
int tmpY = tileOutY * down - p.pad0.y;
int tileInX = CEIL_DIV(tmpX, up);
int tileInY = CEIL_DIV(tmpY, up);
const int phaseInX = tileInX * up - tmpX;
const int phaseInY = tileInY * up - tmpY;
// Extra sync if input and output buffers are the same and we are not on first tile.
if (enableXrep && tileIdx > 0 && (filterMode == MODE_FUSD || (filterMode == MODE_SUFD && !downInline) || (filterMode == MODE_FUFD && downInline)))
__syncthreads();
// Load input tile & apply bias. Unrolled.
scalar_t b = (scalar_t)*(const T*)((const char*)p.b + (channelIdx * get_stride<index_t>(p.bStride)));
index_t mapOfsIn = channelIdx * get_stride<index_t>(p.xStride.z) + batchIdx * get_stride<index_t>(p.xStride.w);
int idx = threadIdx.x;
const int loopCountIN = CEIL_DIV(tileInW * tileInH, threadsPerBlock);
#pragma unroll
for (int loop = 0; loop < loopCountIN; loop++)
{
int relInX, relInY;
fast_div_mod<tileInW>(relInX, relInY, idx);
int inX = tileInX + relInX;
int inY = tileInY + relInY;
scalar_t v = 0;
if ((uint32_t)inX < p.xShape.x && (uint32_t)inY < p.xShape.y)
v = (scalar_t)*((const T*)((const char*)p.x + (inX * get_stride<index_t>(p.xStride.x) + inY * get_stride<index_t>(p.xStride.y) + mapOfsIn))) + b;
bool skip = (loop == loopCountIN-1) && (idx >= tileInW * tileInH);
if (!skip)
s_tileIn[idx] = v;
idx += threadsPerBlock;
}
if (filterMode == MODE_SUSD || filterMode == MODE_SUFD) // Separable upsampling filter.
{
// Horizontal upsampling.
__syncthreads();
if (up == 4)
{
for (int idx = threadIdx.x*up; idx < tileUpW * tileInH; idx += blockDim.x*up)
{
int relUpX0, relInY;
fast_div_mod<tileUpW>(relUpX0, relInY, idx);
int relInX0 = relUpX0 / up;
int src0 = relInX0 + tileInW * relInY;
int dst = relInY * tileUpW + relUpX0;
vec4_t v = InternalType<T>::zero_vec4();
scalar_t a = s_tileIn[src0];
if (phaseInX == 0)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 0];
a = s_tileIn[src0 + step + 1];
v.y += a * (scalar_t)c_fu[step * up + 3];
v.z += a * (scalar_t)c_fu[step * up + 2];
v.w += a * (scalar_t)c_fu[step * up + 1];
}
}
else if (phaseInX == 1)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 1];
v.y += a * (scalar_t)c_fu[step * up + 0];
a = s_tileIn[src0 + step + 1];
v.z += a * (scalar_t)c_fu[step * up + 3];
v.w += a * (scalar_t)c_fu[step * up + 2];
}
}
else if (phaseInX == 2)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 2];
v.y += a * (scalar_t)c_fu[step * up + 1];
v.z += a * (scalar_t)c_fu[step * up + 0];
a = s_tileIn[src0 + step + 1];
v.w += a * (scalar_t)c_fu[step * up + 3];
}
}
else // (phaseInX == 3)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 3];
v.y += a * (scalar_t)c_fu[step * up + 2];
v.z += a * (scalar_t)c_fu[step * up + 1];
v.w += a * (scalar_t)c_fu[step * up + 0];
a = s_tileIn[src0 + step + 1];
}
}
s_tileUpX[dst+0] = v.x;
s_tileUpX[dst+1] = v.y;
s_tileUpX[dst+2] = v.z;
s_tileUpX[dst+3] = v.w;
}
}
else if (up == 2)
{
bool p0 = (phaseInX == 0);
for (int idx = threadIdx.x*up; idx < tileUpW * tileInH; idx += blockDim.x*up)
{
int relUpX0, relInY;
fast_div_mod<tileUpW>(relUpX0, relInY, idx);
int relInX0 = relUpX0 / up;
int src0 = relInX0 + tileInW * relInY;
int dst = relInY * tileUpW + relUpX0;
vec2_t v = InternalType<T>::zero_vec2();
scalar_t a = s_tileIn[src0];
if (p0) // (phaseInX == 0)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 0];
a = s_tileIn[src0 + step + 1];
v.y += a * (scalar_t)c_fu[step * up + 1];
}
}
else // (phaseInX == 1)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 1];
v.y += a * (scalar_t)c_fu[step * up + 0];
a = s_tileIn[src0 + step + 1];
}
}
s_tileUpX[dst+0] = v.x;
s_tileUpX[dst+1] = v.y;
}
}
// Vertical upsampling & nonlinearity.
__syncthreads();
int groupMask = 15 << ((threadIdx.x & 31) & ~3);
int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH : 0; // Skip already written signs.
int sShapeMaxY = MIN(p.sShape.y, tileOutY * down + tileUpH); // Avoid out-of-tile sign writes.
if (up == 4)
{
minY -= 3; // Adjust according to block height.
for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up; idx += blockDim.x)
{
int relUpX, relInY0;
fast_div_mod<tileUpW>(relUpX, relInY0, idx);
int relUpY0 = relInY0 * up;
int src0 = relInY0 * tileUpW + relUpX;
int dst = relUpY0 * tileUpW + relUpX;
vec4_t v = InternalType<T>::zero_vec4();
scalar_t a = s_tileUpX[src0];
if (phaseInY == 0)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 0];
a = s_tileUpX[src0 + (step + 1) * tileUpW];
v.y += a * (scalar_t)c_fu[step * up + 3];
v.z += a * (scalar_t)c_fu[step * up + 2];
v.w += a * (scalar_t)c_fu[step * up + 1];
}
}
else if (phaseInY == 1)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 1];
v.y += a * (scalar_t)c_fu[step * up + 0];
a = s_tileUpX[src0 + (step + 1) * tileUpW];
v.z += a * (scalar_t)c_fu[step * up + 3];
v.w += a * (scalar_t)c_fu[step * up + 2];
}
}
else if (phaseInY == 2)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 2];
v.y += a * (scalar_t)c_fu[step * up + 1];
v.z += a * (scalar_t)c_fu[step * up + 0];
a = s_tileUpX[src0 + (step + 1) * tileUpW];
v.w += a * (scalar_t)c_fu[step * up + 3];
}
}
else // (phaseInY == 3)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 3];
v.y += a * (scalar_t)c_fu[step * up + 2];
v.z += a * (scalar_t)c_fu[step * up + 1];
v.w += a * (scalar_t)c_fu[step * up + 0];
a = s_tileUpX[src0 + (step + 1) * tileUpW];
}
}
int x = tileOutX * down + relUpX;
int y = tileOutY * down + relUpY0;
int signX = x + p.sOfs.x;
int signY = y + p.sOfs.y;
int signZ = blockIdx.z + p.blockZofs;
int signXb = signX >> 2;
index_t si0 = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
index_t si1 = si0 + p.sShape.x;
index_t si2 = si0 + p.sShape.x * 2;
index_t si3 = si0 + p.sShape.x * 3;
v.x *= (scalar_t)((float)up * (float)up * p.gain);
v.y *= (scalar_t)((float)up * (float)up * p.gain);
v.z *= (scalar_t)((float)up * (float)up * p.gain);
v.w *= (scalar_t)((float)up * (float)up * p.gain);
if (signWrite)
{
if (!enableWriteSkip)
{
// Determine and write signs.
int sx = __float_as_uint(v.x) >> 31 << 0;
int sy = __float_as_uint(v.y) >> 31 << 8;
int sz = __float_as_uint(v.z) >> 31 << 16;
int sw = __float_as_uint(v.w) >> 31 << 24;
if (sx) v.x *= p.slope;
if (sy) v.y *= p.slope;
if (sz) v.z *= p.slope;
if (sw) v.w *= p.slope;
if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
if (fabsf(v.z) > p.clamp) { sz = 2 << 16; v.z = InternalType<T>::clamp(v.z, p.clamp); }
if (fabsf(v.w) > p.clamp) { sw = 2 << 24; v.w = InternalType<T>::clamp(v.w, p.clamp); }
if ((uint32_t)signXb < p.swLimit && signY >= minY)
{
// Combine signs.
uint32_t s = sx + sy + sw + sz;
s <<= (signX & 3) << 1;
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
// Write signs.
if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); }
if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); }
if ((uint32_t)(signY + 2) < sShapeMaxY) { p.s[si2] = (unsigned char)(s >> 16); }
if ((uint32_t)(signY + 3) < sShapeMaxY) { p.s[si3] = (unsigned char)(s >> 24); }
}
}
else
{
// Determine and write signs.
if ((uint32_t)signXb < p.swLimit && signY >= minY)
{
int sx = __float_as_uint(v.x) >> 31 << 0;
int sy = __float_as_uint(v.y) >> 31 << 8;
int sz = __float_as_uint(v.z) >> 31 << 16;
int sw = __float_as_uint(v.w) >> 31 << 24;
if (sx) v.x *= p.slope;
if (sy) v.y *= p.slope;
if (sz) v.z *= p.slope;
if (sw) v.w *= p.slope;
if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
if (fabsf(v.z) > p.clamp) { sz = 2 << 16; v.z = InternalType<T>::clamp(v.z, p.clamp); }
if (fabsf(v.w) > p.clamp) { sw = 2 << 24; v.w = InternalType<T>::clamp(v.w, p.clamp); }
// Combine signs.
uint32_t s = sx + sy + sw + sz;
s <<= (signX & 3) << 1;
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
// Write signs.
if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); }
if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); }
if ((uint32_t)(signY + 2) < sShapeMaxY) { p.s[si2] = (unsigned char)(s >> 16); }
if ((uint32_t)(signY + 3) < sShapeMaxY) { p.s[si3] = (unsigned char)(s >> 24); }
}
else
{
// Just compute the values.
if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
}
}
}
else if (signRead) // Read signs and apply.
{
if ((uint32_t)signXb < p.swLimit)
{
int ss = (signX & 3) << 1;
if ((uint32_t)(signY + 0) < p.sShape.y) { int s = p.s[si0] >> ss; if (s & 1) v.x *= p.slope; if (s & 2) v.x = 0.f; }
if ((uint32_t)(signY + 1) < p.sShape.y) { int s = p.s[si1] >> ss; if (s & 1) v.y *= p.slope; if (s & 2) v.y = 0.f; }
if ((uint32_t)(signY + 2) < p.sShape.y) { int s = p.s[si2] >> ss; if (s & 1) v.z *= p.slope; if (s & 2) v.z = 0.f; }
if ((uint32_t)(signY + 3) < p.sShape.y) { int s = p.s[si3] >> ss; if (s & 1) v.w *= p.slope; if (s & 2) v.w = 0.f; }
}
}
else // Forward pass with no sign write.
{
if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
}
s_tileUpXY[dst + 0 * tileUpW] = v.x;
if (relUpY0 + 1 < tileUpH) s_tileUpXY[dst + 1 * tileUpW] = v.y;
if (relUpY0 + 2 < tileUpH) s_tileUpXY[dst + 2 * tileUpW] = v.z;
if (relUpY0 + 3 < tileUpH) s_tileUpXY[dst + 3 * tileUpW] = v.w;
}
}
else if (up == 2)
{
minY -= 1; // Adjust according to block height.
for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up; idx += blockDim.x)
{
int relUpX, relInY0;
fast_div_mod<tileUpW>(relUpX, relInY0, idx);
int relUpY0 = relInY0 * up;
int src0 = relInY0 * tileUpW + relUpX;
int dst = relUpY0 * tileUpW + relUpX;
vec2_t v = InternalType<T>::zero_vec2();
scalar_t a = s_tileUpX[src0];
if (phaseInY == 0)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 0];
a = s_tileUpX[src0 + (step + 1) * tileUpW];
v.y += a * (scalar_t)c_fu[step * up + 1];
}
}
else // (phaseInY == 1)
{
#pragma unroll
for (int step = 0; step < fuSize / up; step++)
{
v.x += a * (scalar_t)c_fu[step * up + 1];
v.y += a * (scalar_t)c_fu[step * up + 0];
a = s_tileUpX[src0 + (step + 1) * tileUpW];
}
}
int x = tileOutX * down + relUpX;
int y = tileOutY * down + relUpY0;
int signX = x + p.sOfs.x;
int signY = y + p.sOfs.y;
int signZ = blockIdx.z + p.blockZofs;
int signXb = signX >> 2;
index_t si0 = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
index_t si1 = si0 + p.sShape.x;
v.x *= (scalar_t)((float)up * (float)up * p.gain);
v.y *= (scalar_t)((float)up * (float)up * p.gain);
if (signWrite)
{
if (!enableWriteSkip)
{
// Determine and write signs.
int sx = __float_as_uint(v.x) >> 31 << 0;
int sy = __float_as_uint(v.y) >> 31 << 8;
if (sx) v.x *= p.slope;
if (sy) v.y *= p.slope;
if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
if ((uint32_t)signXb < p.swLimit && signY >= minY)
{
// Combine signs.
int s = sx + sy;
s <<= signXo;
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
// Write signs.
if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); }
if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); }
}
}
else
{
// Determine and write signs.
if ((uint32_t)signXb < p.swLimit && signY >= minY)
{
int sx = __float_as_uint(v.x) >> 31 << 0;
int sy = __float_as_uint(v.y) >> 31 << 8;
if (sx) v.x *= p.slope;
if (sy) v.y *= p.slope;
if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
// Combine signs.
int s = sx + sy;
s <<= signXo;
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
// Write signs.
if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); }
if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); }
}
else
{
// Just compute the values.
if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
}
}
}
else if (signRead) // Read signs and apply.
{
if ((uint32_t)signXb < p.swLimit)
{
if ((uint32_t)(signY + 0) < p.sShape.y) { int s = p.s[si0] >> signXo; if (s & 1) v.x *= p.slope; if (s & 2) v.x = 0.f; }
if ((uint32_t)(signY + 1) < p.sShape.y) { int s = p.s[si1] >> signXo; if (s & 1) v.y *= p.slope; if (s & 2) v.y = 0.f; }
}
}
else // Forward pass with no sign write.
{
if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
}
if (!downInline)
{
// Write into temporary buffer.
s_tileUpXY[dst] = v.x;
if (relUpY0 < tileUpH - 1)
s_tileUpXY[dst + tileUpW] = v.y;
}
else
{
// Write directly into output buffer.
if ((uint32_t)x < p.yShape.x)
{
int ymax = MIN(p.yShape.y, tileUpH + tileOutY * down);
index_t ofs = x * get_stride<index_t>(p.yStride.x) + y * get_stride<index_t>(p.yStride.y) + mapOfsOut;
if ((uint32_t)y + 0 < p.yShape.y) *((T*)((char*)p.y + ofs)) = (T)(v.x * (scalar_t)c_fd[0]);
if ((uint32_t)y + 1 < ymax) *((T*)((char*)p.y + ofs + get_stride<index_t>(p.yStride.y))) = (T)(v.y * (scalar_t)c_fd[0]);
}
}
}
}
}
else if (filterMode == MODE_FUSD || filterMode == MODE_FUFD)
{
// Full upsampling filter.
if (up == 2)
{
// 2 x 2-wide.
__syncthreads();
int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH + p.sOfs.y : 0; // Skip already written signs.
for (int idx = threadIdx.x * 4; idx < tileUpW * tileUpH; idx += blockDim.x * 4)
{
int relUpX0, relUpY0;
fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
int relInX0 = CEIL_DIV(relUpX0 - phaseInX, up);
int relInY0 = CEIL_DIV(relUpY0 - phaseInY, up);
int src0 = relInX0 + tileInW * relInY0;
int tap0y = (relInY0 * up + phaseInY - relUpY0);
#define X_LOOP(TAPY, PX) \
for (int sx = 0; sx < fuSize / up; sx++) \
{ \
v.x += a * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
v.z += b * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; if ((PX) == 0) { a = b; b = s_tileIn[src0 + 2 + sx + sy * tileInW]; } \
v.y += a * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
v.w += b * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; if ((PX) == 1) { a = b; b = s_tileIn[src0 + 2 + sx + sy * tileInW]; } \
}
vec4_t v = InternalType<T>::zero_vec4();
if (tap0y == 0 && phaseInX == 0)
#pragma unroll
for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
X_LOOP(0, 0) }
if (tap0y == 0 && phaseInX == 1)
#pragma unroll
for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
X_LOOP(0, 1) }
if (tap0y == 1 && phaseInX == 0)
#pragma unroll
for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
X_LOOP(1, 0) }
if (tap0y == 1 && phaseInX == 1)
#pragma unroll
for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
#pragma unroll
X_LOOP(1, 1) }
#undef X_LOOP
int x = tileOutX * down + relUpX0;
int y = tileOutY * down + relUpY0;
int signX = x + p.sOfs.x;
int signY = y + p.sOfs.y;
int signZ = blockIdx.z + p.blockZofs;
int signXb = signX >> 2;
index_t si = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
v.x *= (scalar_t)((float)up * (float)up * p.gain);
v.y *= (scalar_t)((float)up * (float)up * p.gain);
v.z *= (scalar_t)((float)up * (float)up * p.gain);
v.w *= (scalar_t)((float)up * (float)up * p.gain);
if (signWrite)
{
if (!enableWriteSkip)
{
// Determine and write signs.
int sx = __float_as_uint(v.x) >> 31;
int sy = __float_as_uint(v.y) >> 31;
int sz = __float_as_uint(v.z) >> 31;
int sw = __float_as_uint(v.w) >> 31;
if (sx) v.x *= p.slope; if (fabsf(v.x) > p.clamp) { sx = 2; v.x = InternalType<T>::clamp(v.x, p.clamp); }
if (sy) v.y *= p.slope; if (fabsf(v.y) > p.clamp) { sy = 2; v.y = InternalType<T>::clamp(v.y, p.clamp); }
if (sz) v.z *= p.slope; if (fabsf(v.z) > p.clamp) { sz = 2; v.z = InternalType<T>::clamp(v.z, p.clamp); }
if (sw) v.w *= p.slope; if (fabsf(v.w) > p.clamp) { sw = 2; v.w = InternalType<T>::clamp(v.w, p.clamp); }
if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
{
p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
}
}
else
{
// Determine and write signs.
if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
{
int sx = __float_as_uint(v.x) >> 31;
int sy = __float_as_uint(v.y) >> 31;
int sz = __float_as_uint(v.z) >> 31;
int sw = __float_as_uint(v.w) >> 31;
if (sx) v.x *= p.slope; if (fabsf(v.x) > p.clamp) { sx = 2; v.x = InternalType<T>::clamp(v.x, p.clamp); }
if (sy) v.y *= p.slope; if (fabsf(v.y) > p.clamp) { sy = 2; v.y = InternalType<T>::clamp(v.y, p.clamp); }
if (sz) v.z *= p.slope; if (fabsf(v.z) > p.clamp) { sz = 2; v.z = InternalType<T>::clamp(v.z, p.clamp); }
if (sw) v.w *= p.slope; if (fabsf(v.w) > p.clamp) { sw = 2; v.w = InternalType<T>::clamp(v.w, p.clamp); }
p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
}
else
{
// Just compute the values.
if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
}
}
}
else if (signRead) // Read sign and apply.
{
if ((uint32_t)signY < p.sShape.y)
{
int s = 0;
if ((uint32_t)signXb < p.swLimit) s = p.s[si];
if ((uint32_t)signXb + 1 < p.swLimit) s |= p.s[si + 1] << 8;
s >>= (signX & 3) << 1;
if (s & 0x01) v.x *= p.slope; if (s & 0x02) v.x = 0.f;
if (s & 0x04) v.y *= p.slope; if (s & 0x08) v.y = 0.f;
if (s & 0x10) v.z *= p.slope; if (s & 0x20) v.z = 0.f;
if (s & 0x40) v.w *= p.slope; if (s & 0x80) v.w = 0.f;
}
}
else // Forward pass with no sign write.
{
if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
}
s_tileUpXY[idx + 0] = v.x;
s_tileUpXY[idx + 1] = v.y;
s_tileUpXY[idx + 2] = v.z;
s_tileUpXY[idx + 3] = v.w;
}
}
else if (up == 1)
{
__syncthreads();
uint32_t groupMask = 15 << ((threadIdx.x & 31) & ~3);
int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH : 0; // Skip already written signs.
for (int idx = threadIdx.x; idx < tileUpW * tileUpH; idx += blockDim.x)
{
int relUpX0, relUpY0;
fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
scalar_t v = s_tileIn[idx] * (scalar_t)c_fu[0]; // 1x1 filter.
int x = tileOutX * down + relUpX0;
int y = tileOutY * down + relUpY0;
int signX = x + p.sOfs.x;
int signY = y + p.sOfs.y;
int signZ = blockIdx.z + p.blockZofs;
int signXb = signX >> 2;
index_t si = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
v *= (scalar_t)((float)up * (float)up * p.gain);
if (signWrite)
{
if (!enableWriteSkip)
{
// Determine and write sign.
uint32_t s = 0;
uint32_t signXbit = (1u << signXo);
if (v < 0.f)
{
s = signXbit;
v *= p.slope;
}
if (fabsf(v) > p.clamp)
{
s = signXbit * 2;
v = InternalType<T>::clamp(v, p.clamp);
}
if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
{
s += __shfl_xor_sync(groupMask, s, 1); // Coalesce.
s += __shfl_xor_sync(groupMask, s, 2); // Coalesce.
p.s[si] = s; // Write.
}
}
else
{
// Determine and write sign.
if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
{
uint32_t s = 0;
uint32_t signXbit = (1u << signXo);
if (v < 0.f)
{
s = signXbit;
v *= p.slope;
}
if (fabsf(v) > p.clamp)
{
s = signXbit * 2;
v = InternalType<T>::clamp(v, p.clamp);
}
s += __shfl_xor_sync(groupMask, s, 1); // Coalesce.
s += __shfl_xor_sync(groupMask, s, 2); // Coalesce.
p.s[si] = s; // Write.
}
else
{
// Just compute the value.
if (v < 0.f) v *= p.slope;
v = InternalType<T>::clamp(v, p.clamp);
}
}
}
else if (signRead)
{
// Read sign and apply if within sign tensor bounds.
if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y)
{
int s = p.s[si];
s >>= signXo;
if (s & 1) v *= p.slope;
if (s & 2) v = 0.f;
}
}
else // Forward pass with no sign write.
{
if (v < 0.f) v *= p.slope;
v = InternalType<T>::clamp(v, p.clamp);
}
if (!downInline) // Write into temporary buffer.
s_tileUpXY[idx] = v;
else if ((uint32_t)x < p.yShape.x && (uint32_t)y < p.yShape.y) // Write directly into output buffer
*((T*)((char*)p.y + (x * get_stride<index_t>(p.yStride.x) + y * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)(v * (scalar_t)c_fd[0]);
}
}
}
// Downsampling.
if (filterMode == MODE_SUSD || filterMode == MODE_FUSD)
{
// Horizontal downsampling.
__syncthreads();
if (down == 4 && tileOutW % 4 == 0)
{
// Calculate 4 pixels at a time.
for (int idx = threadIdx.x * 4; idx < tileOutW * tileUpH; idx += blockDim.x * 4)
{
int relOutX0, relUpY;
fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
int relUpX0 = relOutX0 * down;
int src0 = relUpY * tileUpW + relUpX0;
vec4_t v = InternalType<T>::zero_vec4();
#pragma unroll
for (int step = 0; step < fdSize; step++)
{
v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];
v.y += s_tileUpXY[src0 + 4 + step] * (scalar_t)c_fd[step];
v.z += s_tileUpXY[src0 + 8 + step] * (scalar_t)c_fd[step];
v.w += s_tileUpXY[src0 + 12 + step] * (scalar_t)c_fd[step];
}
s_tileDownX[idx+0] = v.x;
s_tileDownX[idx+1] = v.y;
s_tileDownX[idx+2] = v.z;
s_tileDownX[idx+3] = v.w;
}
}
else if ((down == 2 || down == 4) && (tileOutW % 2 == 0))
{
// Calculate 2 pixels at a time.
for (int idx = threadIdx.x * 2; idx < tileOutW * tileUpH; idx += blockDim.x * 2)
{
int relOutX0, relUpY;
fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
int relUpX0 = relOutX0 * down;
int src0 = relUpY * tileUpW + relUpX0;
vec2_t v = InternalType<T>::zero_vec2();
#pragma unroll
for (int step = 0; step < fdSize; step++)
{
v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];
v.y += s_tileUpXY[src0 + down + step] * (scalar_t)c_fd[step];
}
s_tileDownX[idx+0] = v.x;
s_tileDownX[idx+1] = v.y;
}
}
else
{
// Calculate 1 pixel at a time.
for (int idx = threadIdx.x; idx < tileOutW * tileUpH; idx += blockDim.x)
{
int relOutX0, relUpY;
fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
int relUpX0 = relOutX0 * down;
int src = relUpY * tileUpW + relUpX0;
scalar_t v = 0.f;
#pragma unroll
for (int step = 0; step < fdSize; step++)
v += s_tileUpXY[src + step] * (scalar_t)c_fd[step];
s_tileDownX[idx] = v;
}
}
// Vertical downsampling & store output tile.
__syncthreads();
for (int idx = threadIdx.x; idx < tileOutW * tileOutH; idx += blockDim.x)
{
int relOutX, relOutY0;
fast_div_mod<tileOutW>(relOutX, relOutY0, idx);
int relUpY0 = relOutY0 * down;
int src0 = relUpY0 * tileOutW + relOutX;
scalar_t v = 0;
#pragma unroll
for (int step = 0; step < fdSize; step++)
v += s_tileDownX[src0 + step * tileOutW] * (scalar_t)c_fd[step];
int outX = tileOutX + relOutX;
int outY = tileOutY + relOutY0;
if (outX < p.yShape.x && outY < p.yShape.y)
*((T*)((char*)p.y + (outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)v;
}
}
else if (filterMode == MODE_SUFD || filterMode == MODE_FUFD)
{
// Full downsampling filter.
if (down == 2)
{
// 2-wide.
__syncthreads();
for (int idx = threadIdx.x * 2; idx < tileOutW * tileOutH; idx += blockDim.x * 2)
{
int relOutX0, relOutY0;
fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
int relUpX0 = relOutX0 * down;
int relUpY0 = relOutY0 * down;
int src0 = relUpY0 * tileUpW + relUpX0;
vec2_t v = InternalType<T>::zero_vec2();
#pragma unroll
for (int sy = 0; sy < fdSize; sy++)
#pragma unroll
for (int sx = 0; sx < fdSize; sx++)
{
v.x += s_tileUpXY[src0 + 0 + sx + sy * tileUpW] * (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
v.y += s_tileUpXY[src0 + 2 + sx + sy * tileUpW] * (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
}
int outX = tileOutX + relOutX0;
int outY = tileOutY + relOutY0;
if ((uint32_t)outY < p.yShape.y)
{
index_t ofs = outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut;
if (outX + 0 < p.yShape.x) *((T*)((char*)p.y + ofs)) = (T)v.x;
if (outX + 1 < p.yShape.x) *((T*)((char*)p.y + ofs + get_stride<index_t>(p.yStride.x))) = (T)v.y;
}
}
}
else if (down == 1 && !downInline)
{
// Thread per pixel.
__syncthreads();
for (int idx = threadIdx.x; idx < tileOutW * tileOutH; idx += blockDim.x)
{
int relOutX0, relOutY0;
fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
scalar_t v = s_tileUpXY[idx] * (scalar_t)c_fd[0]; // 1x1 filter.
int outX = tileOutX + relOutX0;
int outY = tileOutY + relOutY0;
if ((uint32_t)outX < p.yShape.x && (uint32_t)outY < p.yShape.y)
*((T*)((char*)p.y + (outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)v;
}
}
}
if (!enableXrep)
break;
}
}
//------------------------------------------------------------------------
// Compute activation function and signs for upsampled data tensor, modifying data tensor in-place. Used for accelerating the generic variant.
// Sign tensor is known to be contiguous, and p.x and p.s have the same z, w dimensions. 64-bit indexing is always used.
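// Sign encoding (as used by the read/write paths below): each element gets a 2-bit code, packed four
// per byte along x. Code 1 records that the value was negative (the leaky slope was applied); code 2
// records that it exceeded +-p.clamp and was clamped (the negative flag is then dropped). On read-back,
// bit 0 multiplies the value by p.slope and bit 1 zeroes it, matching the gradient of a slope-scaled,
// clamped output.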
template <class T, bool signWrite, bool signRead>
static __global__ void filtered_lrelu_act_kernel(filtered_lrelu_act_kernel_params p)
{
typedef typename InternalType<T>::scalar_t scalar_t;
// Indexing.
int32_t x = threadIdx.x + blockIdx.x * blockDim.x;
int32_t ymax = signWrite ? p.sShape.y : p.xShape.y;
int32_t qmax = p.xShape.z * p.xShape.w; // Combined minibatch*channel maximum index.
// Loop to accommodate oversized tensors.
for (int32_t q = blockIdx.z; q < qmax; q += gridDim.z)
for (int32_t y = blockIdx.y; y < ymax; y += gridDim.y)
{
// Extract z and w (channel, minibatch index).
int32_t w = q / p.xShape.z;
int32_t z = q - w * p.xShape.z;
// Choose behavior based on sign read/write mode.
if (signWrite)
{
// Process value if in p.x.
uint32_t s = 0;
if (x < p.xShape.x && y < p.xShape.y)
{
int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w;
T* pv = ((T*)p.x) + ix;
scalar_t v = (scalar_t)(*pv);
// Gain, LReLU, clamp.
v *= p.gain;
if (v < 0.f)
{
v *= p.slope;
s = 1; // Sign.
}
if (fabsf(v) > p.clamp)
{
v = InternalType<T>::clamp(v, p.clamp);
s = 2; // Clamp.
}
*pv = (T)v; // Write value.
}
// Coalesce into threads 0 and 16 of warp.
uint32_t m = (threadIdx.x & 16) ? 0xffff0000u : 0x0000ffffu;
s <<= ((threadIdx.x & 15) << 1); // Shift into place.
s |= __shfl_xor_sync(m, s, 1); // Distribute.
s |= __shfl_xor_sync(m, s, 2);
s |= __shfl_xor_sync(m, s, 4);
s |= __shfl_xor_sync(m, s, 8);
// Write signs if leader and in p.s.
if (!(threadIdx.x & 15) && x < p.sShape.x) // y is always in.
{
uint64_t is = x + p.sShape.x * (y + (int64_t)p.sShape.y * q); // Contiguous.
((uint32_t*)p.s)[is >> 4] = s;
}
}
else if (signRead)
{
// Process value if in p.x.
if (x < p.xShape.x) // y is always in.
{
int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w;
T* pv = ((T*)p.x) + ix;
scalar_t v = (scalar_t)(*pv);
v *= p.gain;
// Apply sign buffer offset.
uint32_t sx = x + p.sOfs.x;
uint32_t sy = y + p.sOfs.y;
// Read and apply signs if we land inside valid region of sign buffer.
if (sx < p.sShape.x && sy < p.sShape.y)
{
uint64_t is = (sx >> 2) + (p.sShape.x >> 2) * (sy + (uint64_t)p.sShape.y * q); // Contiguous.
unsigned char s = p.s[is];
s >>= (sx & 3) << 1; // Shift into place.
if (s & 1) // Sign?
v *= p.slope;
if (s & 2) // Clamp?
v = 0.f;
}
*pv = (T)v; // Write value.
}
}
else
{
// Forward pass with no sign write. Process value if in p.x.
if (x < p.xShape.x) // y is always in.
{
int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w;
T* pv = ((T*)p.x) + ix;
scalar_t v = (scalar_t)(*pv);
v *= p.gain;
if (v < 0.f)
v *= p.slope;
if (fabsf(v) > p.clamp)
v = InternalType<T>::clamp(v, p.clamp);
*pv = (T)v; // Write value.
}
}
}
}
template <class T, bool signWrite, bool signRead> void* choose_filtered_lrelu_act_kernel(void)
{
return (void*)filtered_lrelu_act_kernel<T, signWrite, signRead>;
}
//------------------------------------------------------------------------
// CUDA kernel selection.
template <class T, class index_t, bool signWrite, bool signRead> filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB)
{
filtered_lrelu_kernel_spec s = { 0 };
// Return the first matching kernel.
#define CASE(SH, U, FU, D, FD, MODE, TW, TH, W, XR, WS) \
if (sharedKB >= SH) \
if ((p.fuShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_SUFD)) || (p.fuShape.y > 0 && (MODE == MODE_FUSD || MODE == MODE_FUFD))) \
if ((p.fdShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_FUSD)) || (p.fdShape.y > 0 && (MODE == MODE_SUFD || MODE == MODE_FUFD))) \
if (p.up == U && p.fuShape.x <= FU && p.fuShape.y <= FU && p.down == D && p.fdShape.x <= FD && p.fdShape.y <= FD) \
{ \
static_assert((D*TW % 4) == 0, "down * tileWidth must be divisible by 4"); \
static_assert(FU % U == 0, "upscaling filter size must be multiple of upscaling factor"); \
static_assert(FD % D == 0, "downscaling filter size must be multiple of downscaling factor"); \
s.setup = (void*)setup_filters_kernel; \
s.exec = (void*)filtered_lrelu_kernel<T, index_t, SH, signWrite, signRead, MODE, U, FU, D, FD, TW, TH, W*32, !!XR, !!WS>; \
s.tileOut = make_int2(TW, TH); \
s.numWarps = W; \
s.xrep = XR; \
s.dynamicSharedKB = (SH == 48) ? 0 : SH; \
return s; \
}
// Launch parameters for various kernel specializations.
// Small filters must be listed before large filters, otherwise the kernel for the larger filter will always match first.
// Kernels that use more shared memory must be listed before those that use less, for the same reason.
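// For illustration (reading of the CASE macro above, not an additional kernel): the '4t-ups2-downs2'
// entry below expands to
//   s.exec = (void*)filtered_lrelu_kernel<T, index_t, 48, signWrite, signRead, MODE_SUSD,
//                                         /*up,fu*/2, 8, /*down,fd*/2, 8, /*tile*/56, 29, /*threads*/16*32, /*xrep*/true, /*wskip*/false>;
// with s.tileOut = (56, 29), s.numWarps = 16, s.xrep = 11 and s.dynamicSharedKB = 0.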
CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/1,1, /*mode*/MODE_FUFD, /*tw,th,warps,xrep,wskip*/64, 178, 32, 0, 0) // 1t-upf1-downf1
CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/1,1, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/152, 95, 16, 0, 0) // 4t-ups2-downf1
CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/2,8, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/56, 22, 16, 0, 0) // 4t-upf1-downs2
CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/2,8, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/56, 29, 16, 11, 0) // 4t-ups2-downs2
CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/2,8, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/60, 28, 16, 0, 0) // 4t-upf2-downs2
CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/2,8, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/56, 28, 16, 0, 0) // 4t-ups2-downf2
CASE(/*sharedKB*/48, /*up,fu*/4,16, /*down,fd*/2,8, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/56, 31, 16, 11, 0) // 4t-ups4-downs2
CASE(/*sharedKB*/48, /*up,fu*/4,16, /*down,fd*/2,8, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/56, 36, 16, 0, 0) // 4t-ups4-downf2
CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/4,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16, 22, 16, 12, 0) // 4t-ups2-downs4
CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/4,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/29, 15, 16, 0, 0) // 4t-upf2-downs4
CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/1,1, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/96, 150, 28, 0, 0) // 6t-ups2-downf1
CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/2,12, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/32, 35, 24, 0, 0) // 6t-upf1-downs2
CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 46, 16, 10, 0) // 6t-ups2-downs2
CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/58, 28, 24, 8, 0) // 6t-upf2-downs2
CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/52, 28, 16, 0, 0) // 6t-ups2-downf2
CASE(/*sharedKB*/48, /*up,fu*/4,24, /*down,fd*/2,12, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 51, 16, 5, 0) // 6t-ups4-downs2
CASE(/*sharedKB*/48, /*up,fu*/4,24, /*down,fd*/2,12, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32, 56, 16, 6, 0) // 6t-ups4-downf2
CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16, 18, 16, 12, 0) // 6t-ups2-downs4
CASE(/*sharedKB*/96, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/27, 31, 32, 6, 0) // 6t-upf2-downs4 96kB
CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/27, 13, 24, 0, 0) // 6t-upf2-downs4
CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/1,1, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/148, 89, 24, 0, 0) // 8t-ups2-downf1
CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/2,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/32, 31, 16, 5, 0) // 8t-upf1-downs2
CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 41, 16, 9, 0) // 8t-ups2-downs2
CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/56, 26, 24, 0, 0) // 8t-upf2-downs2
CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32, 40, 16, 0, 0) // 8t-ups2-downf2
CASE(/*sharedKB*/48, /*up,fu*/4,32, /*down,fd*/2,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 46, 24, 5, 0) // 8t-ups4-downs2
CASE(/*sharedKB*/48, /*up,fu*/4,32, /*down,fd*/2,16, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32, 50, 16, 0, 0) // 8t-ups4-downf2
CASE(/*sharedKB*/96, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/24, 24, 32, 12, 1) // 8t-ups2-downs4 96kB
CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16, 13, 16, 10, 1) // 8t-ups2-downs4
CASE(/*sharedKB*/96, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/25, 28, 28, 4, 0) // 8t-upf2-downs4 96kB
CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/25, 10, 24, 0, 0) // 8t-upf2-downs4
#undef CASE
return s; // No kernel found.
}
//------------------------------------------------------------------------
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <cuda_runtime.h>
//------------------------------------------------------------------------
// CUDA kernel parameters.
struct filtered_lrelu_kernel_params
{
// These parameters decide which kernel to use.
int up; // upsampling ratio (1, 2, 4)
int down; // downsampling ratio (1, 2, 4)
int2 fuShape; // [size, 1] | [size, size]
int2 fdShape; // [size, 1] | [size, size]
int _dummy; // Alignment.
// Rest of the parameters.
const void* x; // Input tensor.
void* y; // Output tensor.
const void* b; // Bias tensor.
unsigned char* s; // Sign tensor in/out. NULL if unused.
const float* fu; // Upsampling filter.
const float* fd; // Downsampling filter.
int2 pad0; // Left/top padding.
float gain; // Additional gain factor.
float slope; // Leaky ReLU slope on negative side.
float clamp; // Clamp after nonlinearity.
int flip; // Filter kernel flip for gradient computation.
int tilesXdim; // Original number of horizontal output tiles.
int tilesXrep; // Number of horizontal tiles per CTA.
int blockZofs; // Block z offset to support large minibatch, channel dimensions.
int4 xShape; // [width, height, channel, batch]
int4 yShape; // [width, height, channel, batch]
int2 sShape; // [width, height] - width is in bytes. Contiguous. Zeros if unused.
int2 sOfs; // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.
int swLimit; // Active width of sign tensor in bytes.
longlong4 xStride; // Strides of all tensors except signs, same component order as shapes.
longlong4 yStride; //
int64_t bStride; //
longlong3 fuStride; //
longlong3 fdStride; //
};
struct filtered_lrelu_act_kernel_params
{
void* x; // Input/output, modified in-place.
unsigned char* s; // Sign tensor in/out. NULL if unused.
float gain; // Additional gain factor.
float slope; // Leaky ReLU slope on negative side.
float clamp; // Clamp after nonlinearity.
int4 xShape; // [width, height, channel, batch]
longlong4 xStride; // Input/output tensor strides, same order as in shape.
int2 sShape; // [width, height] - width is in elements. Contiguous. Zeros if unused.
int2 sOfs; // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.
};
//------------------------------------------------------------------------
// CUDA kernel specialization.
struct filtered_lrelu_kernel_spec
{
void* setup; // Function for filter kernel setup.
void* exec; // Function for main operation.
int2 tileOut; // Width/height of launch tile.
int numWarps; // Number of warps per thread block, determines launch block size.
int xrep; // For processing multiple horizontal tiles per thread block.
int dynamicSharedKB; // How much dynamic shared memory the exec kernel wants.
};
//------------------------------------------------------------------------
// CUDA kernel selection.
template <class T, class index_t, bool signWrite, bool signRead> filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB);
template <class T, bool signWrite, bool signRead> void* choose_filtered_lrelu_act_kernel(void);
template <bool signWrite, bool signRead> cudaError_t copy_filters(cudaStream_t stream);
//------------------------------------------------------------------------
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import os
import numpy as np
import torch
import warnings
from .. import custom_ops
from .. import misc
from . import upfirdn2d
from . import bias_act
#----------------------------------------------------------------------------
_plugin = None
def _init():
global _plugin
if _plugin is None:
# sources=['filtered_lrelu.h', 'filtered_lrelu.cu', 'filtered_lrelu.cpp', 'filtered_lrelu_wr.cu', 'filtered_lrelu_rd.cu', 'filtered_lrelu_ns.cu']
# sources = [os.path.join(os.path.dirname(__file__), s) for s in sources]
# try:
# _plugin = custom_ops.get_plugin('filtered_lrelu_plugin', sources=sources, extra_cuda_cflags=['--use_fast_math', '--allow-unsupported-compiler'])
# except:
# warnings.warn('Failed to build CUDA kernels for filtered_lrelu_plugin. Falling back to slow reference implementation. Details:\n\n' + traceback.format_exc())
_plugin = custom_ops.get_plugin_v3(
module_name='filtered_lrelu_plugin',
sources=['filtered_lrelu.cpp', 'filtered_lrelu_wr.cu', 'filtered_lrelu_rd.cu', 'filtered_lrelu_ns.cu'],
headers=['filtered_lrelu.h', 'filtered_lrelu.cu'],
source_dir=os.path.dirname(__file__),
extra_cuda_cflags=['--use_fast_math', '--allow-unsupported-compiler'],
)
return True
def _get_filter_size(f):
if f is None:
return 1, 1
assert isinstance(f, torch.Tensor)
assert 1 <= f.ndim <= 2
return f.shape[-1], f.shape[0] # width, height
def _parse_padding(padding):
if isinstance(padding, int):
padding = [padding, padding]
assert isinstance(padding, (list, tuple))
assert all(isinstance(x, (int, np.integer)) for x in padding)
padding = [int(x) for x in padding]
if len(padding) == 2:
px, py = padding
padding = [px, px, py, py]
px0, px1, py0, py1 = padding
return px0, px1, py0, py1
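# For illustration, the accepted padding forms map as follows:
#   _parse_padding(1)            -> (1, 1, 1, 1)
#   _parse_padding([2, 3])       -> (2, 2, 3, 3)
#   _parse_padding([0, 1, 2, 3]) -> (0, 1, 2, 3)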
#----------------------------------------------------------------------------
def filtered_lrelu(x, fu=None, fd=None, b=None, up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False, impl='cuda'):
r"""Filtered leaky ReLU for a batch of 2D images.
Performs the following sequence of operations for each channel:
1. Add channel-specific bias if provided (`b`).
2. Upsample the image by inserting N-1 zeros after each pixel (`up`).
3. Pad the image with the specified number of zeros on each side (`padding`).
Negative padding corresponds to cropping the image.
4. Convolve the image with the specified upsampling FIR filter (`fu`), shrinking it
so that the footprint of all output pixels lies within the input image.
5. Multiply each value by the provided gain factor (`gain`).
6. Apply leaky ReLU activation function to each value.
7. Clamp each value between -clamp and +clamp, if `clamp` parameter is provided.
8. Convolve the image with the specified downsampling FIR filter (`fd`), shrinking
it so that the footprint of all output pixels lies within the input image.
9. Downsample the image by keeping every Nth pixel (`down`).
The fused op is considerably more efficient than performing the same calculation
using standard PyTorch ops. It supports gradients of arbitrary order.
Args:
x: Float32/float16/float64 input tensor of the shape
`[batch_size, num_channels, in_height, in_width]`.
fu: Float32 upsampling FIR filter of the shape
`[filter_height, filter_width]` (non-separable),
`[filter_taps]` (separable), or
`None` (identity).
fd: Float32 downsampling FIR filter of the shape
`[filter_height, filter_width]` (non-separable),
`[filter_taps]` (separable), or
`None` (identity).
b: Bias vector, or `None` to disable. Must be a 1D tensor of the same type
as `x`. The length of the vector must match the channel dimension of `x`.
up: Integer upsampling factor (default: 1).
down: Integer downsampling factor (default: 1).
padding: Padding with respect to the upsampled image. Can be a single number
or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
(default: 0).
gain: Overall scaling factor for signal magnitude (default: sqrt(2)).
slope: Slope on the negative side of leaky ReLU (default: 0.2).
clamp: Maximum magnitude for leaky ReLU output (default: None).
flip_filter: False = convolution, True = correlation (default: False).
impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
Returns:
Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
"""
assert isinstance(x, torch.Tensor)
assert impl in ['ref', 'cuda']
    if impl == 'cuda' and x.device.type == 'cuda' and _init():
return _filtered_lrelu_cuda(up=up, down=down, padding=padding, gain=gain, slope=slope, clamp=clamp, flip_filter=flip_filter).apply(x, fu, fd, b, None, 0, 0)
return _filtered_lrelu_ref(x, fu=fu, fd=fd, b=b, up=up, down=down, padding=padding, gain=gain, slope=slope, clamp=clamp, flip_filter=flip_filter)
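# Illustrative usage sketch (not executed): a minimal call with made-up 4-tap filters, routed
# through the reference path so the CUDA plugin is not required. The tensor shapes and filter
# taps below are assumptions chosen only for this example.
#
#   x = torch.randn([1, 3, 16, 16])
#   f = torch.tensor([1.0, 3.0, 3.0, 1.0]) / 8.0   # illustrative separable low-pass filter
#   y = filtered_lrelu(x, fu=f, fd=f, up=2, down=2, padding=3, impl='ref')
#   # out_w = (16*2 + 3+3 - (4-1) - (4-1) + (2-1)) // 2 = 16, so y.shape == [1, 3, 16, 16]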
#----------------------------------------------------------------------------
@misc.profiled_function
def _filtered_lrelu_ref(x, fu=None, fd=None, b=None, up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False):
"""Slow and memory-inefficient reference implementation of `filtered_lrelu()` using
existing `upfirdn2d()` and `bias_act()` ops.
"""
assert isinstance(x, torch.Tensor) and x.ndim == 4
fu_w, fu_h = _get_filter_size(fu)
fd_w, fd_h = _get_filter_size(fd)
if b is not None:
assert isinstance(b, torch.Tensor) and b.dtype == x.dtype
misc.assert_shape(b, [x.shape[1]])
assert isinstance(up, int) and up >= 1
assert isinstance(down, int) and down >= 1
px0, px1, py0, py1 = _parse_padding(padding)
assert gain == float(gain) and gain > 0
assert slope == float(slope) and slope >= 0
assert clamp is None or (clamp == float(clamp) and clamp >= 0)
# Calculate output size.
batch_size, channels, in_h, in_w = x.shape
in_dtype = x.dtype
out_w = (in_w * up + (px0 + px1) - (fu_w - 1) - (fd_w - 1) + (down - 1)) // down
out_h = (in_h * up + (py0 + py1) - (fu_h - 1) - (fd_h - 1) + (down - 1)) // down
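# Worked example with illustrative numbers: an 8-pixel-wide input with up=2, a 4-tap fu,
# no fd, down=1, and padding=1 per side gives out_w = (8*2 + 2 - 3 - 0 + 0) // 1 = 15.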
# Compute using existing ops.
x = bias_act.bias_act(x=x, b=b) # Apply bias.
x = upfirdn2d.upfirdn2d(x=x, f=fu, up=up, padding=[px0, px1, py0, py1], gain=up**2, flip_filter=flip_filter) # Upsample.
x = bias_act.bias_act(x=x, act='lrelu', alpha=slope, gain=gain, clamp=clamp) # Bias, leaky ReLU, clamp.
x = upfirdn2d.upfirdn2d(x=x, f=fd, down=down, flip_filter=flip_filter) # Downsample.
# Check output shape & dtype.
misc.assert_shape(x, [batch_size, channels, out_h, out_w])
assert x.dtype == in_dtype
return x
#----------------------------------------------------------------------------
_filtered_lrelu_cuda_cache = dict()
def _filtered_lrelu_cuda(up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False):
"""Fast CUDA implementation of `filtered_lrelu()` using custom ops.
"""
assert isinstance(up, int) and up >= 1
assert isinstance(down, int) and down >= 1
px0, px1, py0, py1 = _parse_padding(padding)
assert gain == float(gain) and gain > 0
gain = float(gain)
assert slope == float(slope) and slope >= 0
slope = float(slope)
assert clamp is None or (clamp == float(clamp) and clamp >= 0)
clamp = float(clamp if clamp is not None else 'inf')
# Lookup from cache.
key = (up, down, px0, px1, py0, py1, gain, slope, clamp, flip_filter)
if key in _filtered_lrelu_cuda_cache:
return _filtered_lrelu_cuda_cache[key]
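# Each distinct (up, down, padding, gain, slope, clamp, flip_filter) combination gets its own
# autograd.Function subclass; it is constructed once below and reused from the cache on later
# calls with the same settings.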
# Forward op.
class FilteredLReluCuda(torch.autograd.Function):
@staticmethod
def forward(ctx, x, fu, fd, b, si, sx, sy): # pylint: disable=arguments-differ
assert isinstance(x, torch.Tensor) and x.ndim == 4
# Replace empty up/downsample kernels with full 1x1 kernels (faster than separable).
if fu is None:
fu = torch.ones([1, 1], dtype=torch.float32, device=x.device)
if fd is None:
fd = torch.ones([1, 1], dtype=torch.float32, device=x.device)
assert 1 <= fu.ndim <= 2
assert 1 <= fd.ndim <= 2
# Replace separable 1x1 kernels with full 1x1 kernels when scale factor is 1.
if up == 1 and fu.ndim == 1 and fu.shape[0] == 1:
fu = fu.square()[None]
if down == 1 and fd.ndim == 1 and fd.shape[0] == 1:
fd = fd.square()[None]
# Missing sign input tensor.
if si is None:
si = torch.empty([0])
# Missing bias tensor.
if b is None:
b = torch.zeros([x.shape[1]], dtype=x.dtype, device=x.device)
# Construct internal sign tensor only if gradients are needed.
write_signs = (si.numel() == 0) and (x.requires_grad or b.requires_grad)
# Warn if input storage strides are not in decreasing order due to e.g. channels-last layout.
strides = [x.stride(i) for i in range(x.ndim) if x.size(i) > 1]
if any(a < b for a, b in zip(strides[:-1], strides[1:])):
warnings.warn("low-performance memory layout detected in filtered_lrelu input", RuntimeWarning)
# Call C++/Cuda plugin if datatype is supported.
if x.dtype in [torch.float16, torch.float32]:
if torch.cuda.current_stream(x.device) != torch.cuda.default_stream(x.device):
warnings.warn("filtered_lrelu called with non-default cuda stream but concurrent execution is not supported", RuntimeWarning)
y, so, return_code = _plugin.filtered_lrelu(x, fu, fd, b, si, up, down, px0, px1, py0, py1, sx, sy, gain, slope, clamp, flip_filter, write_signs)
else:
return_code = -1
# No Cuda kernel found? Fall back to generic implementation. Still more memory efficient than the reference implementation because
# only the bit-packed sign tensor is retained for gradient computation.
if return_code < 0:
warnings.warn("filtered_lrelu called with parameters that have no optimized CUDA kernel, using generic fallback", RuntimeWarning)
y = x.add(b.unsqueeze(-1).unsqueeze(-1)) # Add bias.
y = upfirdn2d.upfirdn2d(x=y, f=fu, up=up, padding=[px0, px1, py0, py1], gain=up**2, flip_filter=flip_filter) # Upsample.
so = _plugin.filtered_lrelu_act_(y, si, sx, sy, gain, slope, clamp, write_signs) # Activation function and sign handling. Modifies y in-place.
y = upfirdn2d.upfirdn2d(x=y, f=fd, down=down, flip_filter=flip_filter) # Downsample.
# Prepare for gradient computation.
ctx.save_for_backward(fu, fd, (si if si.numel() else so))
ctx.x_shape = x.shape
ctx.y_shape = y.shape
ctx.s_ofs = sx, sy
return y
@staticmethod
def backward(ctx, dy): # pylint: disable=arguments-differ
fu, fd, si = ctx.saved_tensors
_, _, xh, xw = ctx.x_shape
_, _, yh, yw = ctx.y_shape
sx, sy = ctx.s_ofs
dx = None # 0
dfu = None; assert not ctx.needs_input_grad[1]
dfd = None; assert not ctx.needs_input_grad[2]
db = None # 3
dsi = None; assert not ctx.needs_input_grad[4]
dsx = None; assert not ctx.needs_input_grad[5]
dsy = None; assert not ctx.needs_input_grad[6]
if ctx.needs_input_grad[0] or ctx.needs_input_grad[3]:
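# The input gradient reuses the same fused op with up/down swapped, the filters exchanged,
# flip_filter inverted, and padding/gain adjusted below; the saved sign tensor replays the
# forward leaky ReLU / clamp decisions.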
pp = [
(fu.shape[-1] - 1) + (fd.shape[-1] - 1) - px0,
xw * up - yw * down + px0 - (up - 1),
(fu.shape[0] - 1) + (fd.shape[0] - 1) - py0,
xh * up - yh * down + py0 - (up - 1),
]
gg = gain * (up ** 2) / (down ** 2)
ff = (not flip_filter)
sx = sx - (fu.shape[-1] - 1) + px0
sy = sy - (fu.shape[0] - 1) + py0
dx = _filtered_lrelu_cuda(up=down, down=up, padding=pp, gain=gg, slope=slope, clamp=None, flip_filter=ff).apply(dy, fd, fu, None, si, sx, sy)
if ctx.needs_input_grad[3]:
db = dx.sum([0, 2, 3])
return dx, dfu, dfd, db, dsi, dsx, dsy
# Add to cache.
_filtered_lrelu_cuda_cache[key] = FilteredLReluCuda
return FilteredLReluCuda
#----------------------------------------------------------------------------
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "filtered_lrelu.cu"
// Template/kernel specializations for no signs mode (no gradients required).
// Full op, 32-bit indexing.
template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int32_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float, int32_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
// Full op, 64-bit indexing.
template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int64_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float, int64_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
// Activation/signs only for generic variant. 64-bit indexing.
template void* choose_filtered_lrelu_act_kernel<c10::Half, false, false>(void);
template void* choose_filtered_lrelu_act_kernel<float, false, false>(void);
template void* choose_filtered_lrelu_act_kernel<double, false, false>(void);
// Copy filters to constant memory.
template cudaError_t copy_filters<false, false>(cudaStream_t stream);
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "filtered_lrelu.cu"
// Template/kernel specializations for sign read mode.
// Full op, 32-bit indexing.
template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int32_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float, int32_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
// Full op, 64-bit indexing.
template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int64_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float, int64_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
// Activation/signs only for generic variant. 64-bit indexing.
template void* choose_filtered_lrelu_act_kernel<c10::Half, false, true>(void);
template void* choose_filtered_lrelu_act_kernel<float, false, true>(void);
template void* choose_filtered_lrelu_act_kernel<double, false, true>(void);
// Copy filters to constant memory.
template cudaError_t copy_filters<false, true>(cudaStream_t stream);