// Validate dtype / device / shape of all inputs relative to x.
// xref, yref, and dy are optional (numel()==0 means "not supplied"); when
// present they must match x exactly. b is an optional rank-1 bias whose
// length must equal the size of x along `dim`.
TORCH_CHECK(x.is_cuda(),"x must reside on CUDA device");
TORCH_CHECK(b.numel()==0||(b.dtype()==x.dtype()&&b.device()==x.device()),"b must have the same dtype and device as x");
TORCH_CHECK(xref.numel()==0||(xref.sizes()==x.sizes()&&xref.dtype()==x.dtype()&&xref.device()==x.device()),"xref must have the same shape, dtype, and device as x");
TORCH_CHECK(yref.numel()==0||(yref.sizes()==x.sizes()&&yref.dtype()==x.dtype()&&yref.device()==x.device()),"yref must have the same shape, dtype, and device as x");
TORCH_CHECK(dy.numel()==0||(dy.sizes()==x.sizes()&&dy.dtype()==x.dtype()&&dy.device()==x.device()),"dy must have the same shape, dtype, and device as x");
// Element indices are computed in 32-bit arithmetic downstream, so the total
// element count must fit in an int.
TORCH_CHECK(x.numel()<=INT_MAX,"x is too large");
TORCH_CHECK(b.dim()==1,"b must have rank 1");
TORCH_CHECK(b.numel()==0||(dim>=0&&dim<x.dim()),"dim is out of bounds");
TORCH_CHECK(b.numel()==0||b.numel()==x.size(dim),"b has wrong number of elements");
TORCH_CHECK(grad>=0,"grad must be non-negative");
// Validate layout. The kernels index xref/yref/dy with the strides of x, so
// all of them must share x's (non-overlapping, dense) memory layout.
TORCH_CHECK(x.is_non_overlapping_and_dense(),"x must be non-overlapping and dense");
TORCH_CHECK(b.is_contiguous(),"b must be contiguous");
TORCH_CHECK(xref.numel()==0||has_same_layout(xref,x),"xref must have the same layout as x");
TORCH_CHECK(yref.numel()==0||has_same_layout(yref,x),"yref must have the same layout as x");
TORCH_CHECK(dy.numel()==0||has_same_layout(dy,x),"dy must have the same layout as x");
// - Relative to input tensor: inX, inY, tileInX, tileInY
// - Relative to input tile: relInX, relInY, tileInW, tileInH
// - Relative to upsampled tile: relUpX, relUpY, tileUpW, tileUpH
// - Relative to output tile: relOutX, relOutY, tileOutW, tileOutH
// - Relative to output tensor: outX, outY, tileOutX, tileOutY
//
// Relationships between coordinate spaces:
// - inX = tileInX + relInX
// - inY = tileInY + relInY
// - relUpX = relInX * up + phaseInX
// - relUpY = relInY * up + phaseInY
// - relUpX = relOutX * down
// - relUpY = relOutY * down
// - outX = tileOutX + relOutX
// - outY = tileOutY + relOutY
// Dynamic shared memory buffer. When sharedKB <= 48, shared memory is instead
// allocated statically inside the kernel; otherwise this externally sized
// buffer (3rd kernel-launch argument) is used.
extern __shared__ char s_buf_raw[];
# warnings.warn('Failed to build CUDA kernels for filtered_lrelu_plugin. Falling back to slow reference implementation. Details:\n\n' + traceback.format_exc())