Unverified Commit 3c47bfdf authored by vfdev, committed by GitHub

Added CUDA support for interpolation AA bilinear/bicubic (#3842)

* Added CUDA support for interpolation AA bilinear/bicubic

* Fixed code formatting error

* Fixed cuda tests on cpu-only builds

* Fixed cuda internal torch check for bicubic mode
parent 2c52d9f9
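
For context, the new kernels are reached from Python through the existing antialias flag of F.resize; a minimal usage sketch (the shapes are illustrative, mirroring the tests below):

import torch
import torchvision.transforms.functional as F
from torchvision.transforms import InterpolationMode

# Bilinear/bicubic resize with antialiasing now also runs on CUDA tensors
img = torch.rand(3, 96, 72, device="cuda")
out = F.resize(img, size=[48, 36], interpolation=InterpolationMode.BILINEAR, antialias=True)
print(out.shape)  # torch.Size([3, 48, 36])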
@@ -14,7 +14,7 @@ import torchvision.transforms.functional as F
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode
-from common_utils import TransformsTester, cpu_and_gpu
+from common_utils import TransformsTester, cpu_and_gpu, needs_cuda
from typing import Dict, List, Sequence, Tuple
@@ -868,12 +868,14 @@ def test_perspective_interpolation_warning(tester):
    tester.assertTrue(res1.equal(res2))


-@pytest.mark.parametrize('device', ["cpu", ])
+@pytest.mark.parametrize('device', cpu_and_gpu())
@pytest.mark.parametrize('dt', [None, torch.float32, torch.float64, torch.float16])
@pytest.mark.parametrize('size', [[96, 72], [96, 420], [420, 72]])
@pytest.mark.parametrize('interpolation', [BILINEAR, BICUBIC])
def test_resize_antialias(device, dt, size, interpolation, tester):
    torch.manual_seed(12)

    if dt == torch.float16 and device == "cpu":
        # skip float16 on CPU case
        return
@@ -924,6 +926,19 @@ def test_resize_antialias(device, dt, size, interpolation, tester):
    tester.assertTrue(resized_tensor.equal(resize_result), msg=f"{size}, {interpolation}, {dt}")
@needs_cuda
@pytest.mark.parametrize('interpolation', [BILINEAR, BICUBIC])
def test_assert_resize_antialias(interpolation, tester):
    # Checks the implementation on very large scales, where the TORCH_CHECK
    # inside interpolate_aa_kernels.cu should fire: resizing 1000 -> 5 is a
    # scale factor of 200, above the 127 (bilinear) / 63 (bicubic) limits
    torch.manual_seed(12)
    tensor, pil_img = tester._create_data(1000, 1000, device="cuda")

    with pytest.raises(RuntimeError, match=r"Max supported scale factor is"):
        F.resize(tensor, size=(5, 5), interpolation=interpolation, antialias=True)
def check_functional_vs_PIL_vs_scripted(fn, fn_pil, fn_t, config, device, dtype, tol=2.0 + 1e-10, agg_method="max"):
    tester = Tester()
interpolate_aa_kernels.cu (new file)
#include <torch/library.h>
// Copied and adapted from
// Adapted from interp.cpp from Caffe util by Pauline Luc
// Originally developed by George Papandreou
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/NativeFunctions.h>
#include <ATen/TensorUtils.h>
#include <ATen/Utils.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/native/cuda/KernelUtils.cuh>
#include <ATen/native/cuda/UpSample.cuh>
// Below is experimental, temporary code that will later be merged into PyTorch
namespace at {
namespace native {
namespace internal_upsample {
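// Returns the linear offset of element (y, x) inside plane `nc` of a
// contiguous tensor viewed as (N * C, H, W)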
__device__ __forceinline__ size_t
idx(const size_t nc,
const size_t height,
const size_t width,
const size_t y,
const size_t x) {
return (nc * height + y) * width + x;
}
// taken from
// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/
// src/libImaging/Resample.c#L20-L29
template <typename accscalar_t>
__device__ __forceinline__ static accscalar_t bilinear_filter(accscalar_t x) {
if (x < 0.0) {
x = -x;
}
if (x < 1.0) {
return static_cast<accscalar_t>(1.0) - x;
}
return static_cast<accscalar_t>(0.0);
}
// taken from
// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/
// src/libImaging/Resample.c#L46-L62
template <typename accscalar_t>
__device__ __forceinline__ static accscalar_t bicubic_filter(accscalar_t x) {
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
const accscalar_t a = -0.5;
if (x < 0.0) {
x = -x;
}
if (x < 1.0) {
return ((a + 2.0) * x - (a + 3.0)) * x * x + static_cast<accscalar_t>(1.0);
}
if (x < 2.0) {
return (((x - 5) * x + 8) * x - 4) * a;
}
return static_cast<accscalar_t>(0.0);
}
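// Computes the contribution window and the normalized filter weights for
// output index i: the window starts at input index xmin and spans xmax
// pixels around center = scale * (i + 0.5). When downscaling (scale >= 1)
// the filter is stretched by the scale factor, i.e. sampled at
// (j + xmin - center + 0.5) / scale; the weights are normalized to sum to 1
// and wt_ptr is zero-padded up to interp_size entries.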
template <typename scalar_t, typename accscalar_t, typename filter_fn_t>
__device__ __forceinline__ static void _compute_weights(
const int64_t i,
const int64_t input_size,
const accscalar_t scale,
const accscalar_t support,
scalar_t* wt_ptr,
int64_t interp_size,
filter_fn_t filter_fn,
int64_t& xmin,
int64_t& xmax) {
accscalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0;
accscalar_t center = scale * (i + 0.5);
xmin = max(
static_cast<int64_t>(center - support + 0.5), static_cast<int64_t>(0));
xmax = min(static_cast<int64_t>(center + support + 0.5), input_size) - xmin;
accscalar_t total_w = 0.0;
int64_t j = 0;
for (j = 0; j < xmax; j++) {
accscalar_t w = filter_fn((j + xmin - center + 0.5) * invscale);
wt_ptr[j] = static_cast<scalar_t>(w);
total_w += w;
}
for (j = 0; j < xmax; j++) {
if (total_w != 0.0) {
wt_ptr[j] /= total_w;
}
}
for (; j < interp_size; j++) {
wt_ptr[j] = static_cast<scalar_t>(0.0);
}
}
template <typename scalar_t, typename accscalar_t>
__device__ __forceinline__ static accscalar_t interpolate_aa_single_dim(
scalar_t* src,
scalar_t* weights,
int64_t size) {
// Accumulate in accscalar_t to avoid precision loss for half/float inputs
accscalar_t t = static_cast<accscalar_t>(*src);
accscalar_t wts = static_cast<accscalar_t>(weights[0]);
accscalar_t output = t * wts;
int64_t j = 1;
for (; j < size; j++) {
wts = static_cast<accscalar_t>(weights[j]);
t = static_cast<accscalar_t>(*(src + j));
output += t * wts;
}
return output;
}
template <typename scalar_t, typename accscalar_t, int interp_size>
C10_LAUNCH_BOUNDS_1(1024)
__global__ void upsample_gen2d_out_frame(
const int n,
const accscalar_t rheight,
const accscalar_t rwidth,
const bool align_corners,
const PackedTensorAccessor64<scalar_t, 4> idata,
PackedTensorAccessor64<scalar_t, 4> odata) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
const int batchsize = idata.size(0);
const int channels = idata.size(1);
const int height1 = idata.size(2);
const int width1 = idata.size(3);
const int height2 = odata.size(2);
const int width2 = odata.size(3);
if (index < n) {
const int w2 = index % width2; // 0:width2-1
const int h2 = index / width2; // 0:height2-1
// special case: just copy
if (height1 == height2 && width1 == width2) {
const int h1 = h2;
const int w1 = w2;
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const scalar_t val = idata[n][c][h1][w1];
odata[n][c][h2][w2] = val;
}
}
return;
}
const accscalar_t support_h = static_cast<accscalar_t>(
(rheight >= 1.0) ? (interp_size * 0.5) * rheight : interp_size * 0.5);
const accscalar_t support_w = static_cast<accscalar_t>(
(rwidth >= 1.0) ? (interp_size * 0.5) * rwidth : interp_size * 0.5);
const int interp_height = (int)ceilf(support_h) * 2 + 1;
const int interp_width = (int)ceilf(support_w) * 2 + 1;
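// interp_height/interp_width are the number of filter taps per output
// pixel along each axis: 2 * ceil(support) + 1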
// Setup local buffers
// TODO: maybe we can specify dynamic shared memory size before calling the
// cuda code, however we should then ensure that device has enough shared
// memory
scalar_t wx[256];
scalar_t wy[256];
scalar_t buffer1[256];
scalar_t buffer2[256];
// Compute weights
int64_t xmin, xsize, ymin, ysize;
// Filter functions operate on the accumulation type for precision
typedef accscalar_t (*filter_fn_t)(accscalar_t);
if (interp_size == 2) {
_compute_weights<scalar_t, accscalar_t, filter_fn_t>(
w2,
width1,
rwidth,
support_w,
wx,
interp_width,
bilinear_filter,
xmin,
xsize);
_compute_weights<scalar_t, accscalar_t, filter_fn_t>(
h2,
height1,
rheight,
support_h,
wy,
interp_height,
bilinear_filter,
ymin,
ysize);
} else if (interp_size == 4) {
_compute_weights<scalar_t, accscalar_t, filter_fn_t>(
w2,
width1,
rwidth,
support_w,
wx,
interp_width,
bicubic_filter,
xmin,
xsize);
_compute_weights<scalar_t, accscalar_t, filter_fn_t>(
h2,
height1,
rheight,
support_h,
wy,
interp_height,
bicubic_filter,
ymin,
ysize);
}
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
// interpolate on x-axis for ymin to ymin + ysize
for (int64_t y = 0; y < ysize; y++) {
// copy data into the local buffer and use
// interpolate_aa_single_dim method
for (int x = 0; x < xsize; x++) {
buffer1[x] = idata[n][c][ymin + y][xmin + x];
}
buffer2[y] = static_cast<scalar_t>(
interpolate_aa_single_dim<scalar_t, accscalar_t>(
buffer1, wx, xsize));
}
odata[n][c][h2][w2] = static_cast<scalar_t>(
interpolate_aa_single_dim<scalar_t, accscalar_t>(
buffer2, wy, ysize));
}
}
}
}
template <int interp_size>
static void upsample_gen2d_out_cuda_template(
const Tensor& output,
const Tensor& input,
IntArrayRef output_size,
bool align_corners,
c10::optional<double> scales_h,
c10::optional<double> scales_w) {
TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2};
checkAllSameGPU("upsample_gen2d_out_cuda", {input_arg, output_arg});
int output_height = output_size[0];
int output_width = output_size[1];
int nbatch = input.size(0);
int channels = input.size(1);
int input_height = input.size(2);
int input_width = input.size(3);
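// One thread per output spatial location; the kernel itself loops over
// batch and channel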
const int num_kernels = output_height * output_width;
const int num_threads = std::min(
at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "upsample_bilinear2d_out_frame", [&] {
using accscalar_t = at::acc_type<scalar_t, true>;
auto idata = input.packed_accessor64<scalar_t, 4>();
auto odata = output.packed_accessor64<scalar_t, 4>();
const accscalar_t rheight = area_pixel_compute_scale<accscalar_t>(
input_height, output_height, align_corners, scales_h);
const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>(
input_width, output_width, align_corners, scales_w);
// We are using a static buffer of 256 * sizeof(scalar_t) per thread to
// store the weights. The number of weights per output pixel is roughly
// interp_size * scale + 1 (i.e. 2 * scale + 1 for bilinear), so the
// scale factor must be capped to keep the weights within the buffer.
TORCH_CHECK(
rheight < (255 / interp_size),
"Max supported scale factor is 127 (bilinear), 63 (bicubic)");
TORCH_CHECK(
rwidth < (255 / interp_size),
"Max supported scale factor is 127 (bilinear), 63 (bicubic)");
upsample_gen2d_out_frame<scalar_t, accscalar_t, interp_size>
<<<cuda::ATenCeilDiv(num_kernels, num_threads),
num_threads,
0,
stream>>>(
num_kernels, rheight, rwidth, align_corners, idata, odata);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
}
} // namespace internal_upsample
} // namespace native
} // namespace at
namespace vision {
namespace ops {
namespace {
// Copied from "UpSample.h" as we cannot use UpSample.h together with UpSample.cuh
static std::array<int64_t, 4> upsample_2d_common_check(
at::IntArrayRef input_size,
at::IntArrayRef output_size) {
TORCH_CHECK(
output_size.size() == 2,
"Expected output_size to have 2 elements, but got size ",
output_size.size());
TORCH_CHECK(
input_size.size() == 4,
"Expected input_size to have 4 elements, but got size ",
input_size.size());
int64_t output_height = output_size[0];
int64_t output_width = output_size[1];
int64_t nbatch = input_size[0];
int64_t channels = input_size[1];
int64_t input_height = input_size[2];
int64_t input_width = input_size[3];
TORCH_CHECK(
input_height > 0 && input_width > 0 && output_height > 0 &&
output_width > 0,
"Input and output sizes should be greater than 0,"
" but got input (H: ",
input_height,
", W: ",
input_width,
") output (H: ",
output_height,
", W: ",
output_width,
")");
return {nbatch, channels, output_height, output_width};
}
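// interp_size selects the filter: 2 for bilinear, 4 for bicubic (the
// footprint of the filter at unit scale)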
template <int interp_size>
at::Tensor interpolate_gen2d_aa_forward_kernel(
const at::Tensor& input,
at::IntArrayRef output_size,
bool align_corners) {
c10::optional<c10::ArrayRef<double>> scale_factors = {};
// Copied from UpSampleBilinear2d.cpp
auto output = at::empty({0}, input.options());
auto osize = at::native::upsample::compute_output_size(
input.sizes(), output_size, scale_factors);
auto scale_h = at::native::upsample_cuda::get_scale_value(scale_factors, 0);
auto scale_w = at::native::upsample_cuda::get_scale_value(scale_factors, 1);
auto full_output_size = upsample_2d_common_check(input.sizes(), osize);
// Allow for empty batch size but not other dimensions
TORCH_CHECK(
input.numel() != 0 ||
c10::multiply_integers(
input.sizes().begin() + 1, input.sizes().end()),
"Non-empty 4D data tensor expected but got a tensor with sizes ",
input.sizes());
output.resize_(full_output_size, input.suggest_memory_format());
at::native::internal_upsample::upsample_gen2d_out_cuda_template<interp_size>(
output,
input,
{full_output_size[2], full_output_size[3]},
align_corners,
scale_h,
scale_w);
return output;
}
at::Tensor interpolate_linear_aa_forward_kernel(
const at::Tensor& input,
at::IntArrayRef output_size,
bool align_corners) {
return interpolate_gen2d_aa_forward_kernel<2>(
input, output_size, align_corners);
}
at::Tensor interpolate_bicubic_aa_forward_kernel(
const at::Tensor& input,
at::IntArrayRef output_size,
bool align_corners) {
return interpolate_gen2d_aa_forward_kernel<4>(
input, output_size, align_corners);
}
} // namespace
TORCH_LIBRARY_IMPL(torchvision, CUDA, m) {
m.impl(
TORCH_SELECTIVE_NAME("torchvision::_interpolate_linear_aa"),
TORCH_FN(interpolate_linear_aa_forward_kernel));
m.impl(
TORCH_SELECTIVE_NAME("torchvision::_interpolate_bicubic_aa"),
TORCH_FN(interpolate_bicubic_aa_forward_kernel));
}
} // namespace ops
} // namespace vision
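
For reference, once torchvision is imported the operators registered above can also be called directly through the dispatcher; a hedged sketch, assuming the schemas declared elsewhere in torchvision match the kernel signatures above (input tensor, output size, align_corners):

import torch
import torchvision  # noqa: F401  (loads the compiled torchvision ops)

x = torch.rand(1, 3, 256, 256, device="cuda")
# Dispatches to interpolate_linear_aa_forward_kernel for CUDA tensors
y = torch.ops.torchvision._interpolate_linear_aa(x, [128, 128], False)
print(y.shape)  # torch.Size([1, 3, 128, 128])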