// - Relative to input tensor: inX, inY, tileInX, tileInY
// - Relative to input tile: relInX, relInY, tileInW, tileInH
// - Relative to upsampled tile: relUpX, relUpY, tileUpW, tileUpH
// - Relative to output tile: relOutX, relOutY, tileOutW, tileOutH
// - Relative to output tensor: outX, outY, tileOutX, tileOutY
//
// Relationships between coordinate spaces:
// - inX = tileInX + relInX
// - inY = tileInY + relInY
// - relUpX = relInX * up + phaseInX
// - relUpY = relInY * up + phaseInY
// - relUpX = relOutX * down
// - relUpY = relOutY * down
// - outX = tileOutX + relOutX
// - outY = tileOutY + relOutY
extern__shared__chars_buf_raw[];// When sharedKB <= 48, allocate shared memory statically inside the kernel, otherwise use the externally allocated shared memory buffer.
// - Relative to input tensor: inX, inY, tileInX, tileInY
// - Relative to input tile: relInX, relInY, tileInW, tileInH
// - Relative to upsampled tile: relUpX, relUpY, tileUpW, tileUpH
// - Relative to output tile: relOutX, relOutY, tileOutW, tileOutH
// - Relative to output tensor: outX, outY, tileOutX, tileOutY
//
// Relationships between coordinate spaces:
// - inX = tileInX + relInX
// - inY = tileInY + relInY
// - relUpX = relInX * up + phaseInX
// - relUpY = relInY * up + phaseInY
// - relUpX = relOutX * down
// - relUpY = relOutY * down
// - outX = tileOutX + relOutX
// - outY = tileOutY + relOutY
extern __shared__ char s_buf_raw[]; // When sharedKB <= 48, allocate shared memory statically inside the kernel, otherwise use the externally allocated shared memory buffer.
// CUDA kernel template header. Parameters select: element type T, index type,
// static shared-memory budget in KB, sign write/read modes, filter mode,
// compile-time up/down factors and filter-tap counts (fuSize/fdSize), output
// tile geometry, threads per block, and the x-repetition / write-skip
// optimizations.
// NOTE(review): the kernel body this header introduces is not visible in this
// chunk — the TORCH_CHECK statements that follow cannot be its body. They
// appear to be a misplaced duplicate of the argument validation inside
// upfirdn2d() below (x, f, y, upx/upy, downx/downy are not in scope at file
// scope here). Confirm against the original file and remove/relocate.
template <class T, class index_t, int sharedKB, bool signWrite, bool signRead, int filterMode, int up, int fuSize, int down, int fdSize, int tileOutW, int tileOutH, int threadsPerBlock, bool enableXrep, bool enableWriteSkip>
// Input validation: x must be a CUDA tensor, f a float32 filter on the same
// device; sizes/strides are bounded so 32-bit indexing (index_t = int) is safe.
TORCH_CHECK(x.is_cuda(),"x must reside on CUDA device");
TORCH_CHECK(f.device()==x.device(),"f must reside on the same device as x");
TORCH_CHECK(f.dtype()==torch::kFloat,"f must be float32");
TORCH_CHECK(x.numel()<=INT_MAX,"x is too large");
TORCH_CHECK(f.numel()<=INT_MAX,"f is too large");
TORCH_CHECK(x.numel()>0,"x has zero size");
TORCH_CHECK(f.numel()>0,"f has zero size");
// x is NCHW (rank 4); f is a 2-D spatial filter.
TORCH_CHECK(x.dim()==4,"x must be rank 4");
TORCH_CHECK(f.dim()==2,"f must be rank 2");
// Footprint check covers strided (non-contiguous) layouts: the largest linear
// offset reachable through x's strides must fit in a signed 32-bit int.
TORCH_CHECK((x.size(0)-1)*x.stride(0)+(x.size(1)-1)*x.stride(1)+(x.size(2)-1)*x.stride(2)+(x.size(3)-1)*x.stride(3)<=INT_MAX,"x memory footprint is too large");
TORCH_CHECK(f.size(0)>=1&&f.size(1)>=1,"f must be at least 1x1");
TORCH_CHECK(upx>=1&&upy>=1,"upsampling factor must be at least 1");
TORCH_CHECK(downx>=1&&downy>=1,"downsampling factor must be at least 1");
// Output-tensor checks (y presumably allocated between the input checks and
// here in the original file — not visible in this chunk).
TORCH_CHECK(y.numel()<=INT_MAX,"output is too large");
TORCH_CHECK((y.size(0)-1)*y.stride(0)+(y.size(1)-1)*y.stride(1)+(y.size(2)-1)*y.stride(2)+(y.size(3)-1)*y.stride(3)<=INT_MAX,"output memory footprint is too large");
static torch::Tensor upfirdn2d(torch::Tensor x, torch::Tensor f, int upx, int upy, int downx, int downy, int padx0, int padx1, int pady0, int pady1, bool flip, float gain)
{
// Validate arguments.
TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
TORCH_CHECK(f.device() == x.device(), "f must reside on the same device as x");
TORCH_CHECK(f.dtype() == torch::kFloat, "f must be float32");
TORCH_CHECK(x.numel() <= INT_MAX, "x is too large");
TORCH_CHECK(f.numel() <= INT_MAX, "f is too large");
TORCH_CHECK(x.numel() > 0, "x has zero size");
TORCH_CHECK(f.numel() > 0, "f has zero size");
TORCH_CHECK(x.dim() == 4, "x must be rank 4");
TORCH_CHECK(f.dim() == 2, "f must be rank 2");
TORCH_CHECK((x.size(0)-1)*x.stride(0) + (x.size(1)-1)*x.stride(1) + (x.size(2)-1)*x.stride(2) + (x.size(3)-1)*x.stride(3) <= INT_MAX, "x memory footprint is too large");
TORCH_CHECK(f.size(0) >= 1 && f.size(1) >= 1, "f must be at least 1x1");
TORCH_CHECK(upx >= 1 && upy >= 1, "upsampling factor must be at least 1");
TORCH_CHECK(downx >= 1 && downy >= 1, "downsampling factor must be at least 1");