Unverified Commit 0ebbb0ab authored by Vasilis Vryniotis, committed by GitHub

Encapsulate and Standardise C++ Ops (#3097)

* Encapsulate and standardize deform_conv2d (#3074)

* Rename files.

* Standardizing method names.

* Adding anonymous namespaces.

* Applying C++ naming rules and aligning variable names across headers and cpp files.

* Syncing names across implementations.

* Rename deform_conv2d.h to deform_conv2d.cpp

* Use header files:
- Create header files for kernel implementation and remove definitions from vision_*.h files.
- Eliminate unnecessary headers and ensure all cpp include their headers.

* Change the naming convention for kernel implementations.

* Remove the _param postfix from the variables and standardize names.

* Exposing public forward/backward methods to the C++ API and moving methods around to minimize git blame changes.

* Encapsulate and standardize nms (#3081)

* Syncing, where possible, the names of functions across devices.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create a separate header file with "public" functions for each cpp file.

* Removing unnecessary repeated includes.

* Update CMakeLists.txt to include all headers.

* Encapsulate and standardize ps_roi_align (#3082)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.
Syncing, where possible, the names of functions across devices.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create a separate header file with "public" functions for each cpp file.

* Removing unnecessary repeated includes.

* Encapsulate and standardize ps_roi_pool (#3084)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create a separate header file with "public" functions for each cpp file.

* Removing unnecessary repeated includes.

* Encapsulate and standardize roi_align (#3085)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create a separate header file with "public" functions for each cpp file.

* Removing unnecessary repeated includes.

* Encapsulate and standardize roi_pool  (#3088)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Adding all internal functions in anonymous namespaces.

* Syncing variable names between the cpp files and their header files.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create a separate header file with "public" functions for each cpp file.

* Removing unnecessary repeated includes.

* Encapsulate and standardize new_empty_tensor_op (#3089)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Create a separate header file with "public" functions for each cpp file.

* Adding all internal functions in anonymous namespaces.

* Convert to const ref all possible parameters.

* Removing unnecessary repeated includes.

* Encapsulate and standardize C++ Ops - Clean up (#3094)

* Removing unnecessary repeated includes.

* Remove unnecessary vision_cpu.h, vision_cuda.h, autocast.h.

* Fixing naming convention and correcting method names on macros.

* Turn on the clang formatter for cu files and fix broken styles.

* Replace "#ifndef ... #define ... #endif" with "#pragma once" on header files.

* Adding operator methods in vision::ops namespace. (#3096)

* Adding operator methods in vision::ops namespace.

* Replace general.h with macros.h

* Adding vision.h to the necessary cpp files.
parent 8520f0be
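
Taken together, the changes above settle on one layout per operator: a small <op>_kernel.h that declares only the public forward/backward entry points inside the vision::ops namespace (exported via VISION_API from macros.h), and a matching <op>_kernel.cpp that keeps every helper in an anonymous namespace. The following is a minimal sketch of that pattern under an invented operator name (my_op, my_op_forward_cpu and my_op_forward_kernel_impl are illustrative only, not part of this PR):

// my_op_kernel.h -- hypothetical example of the per-operator header layout
#pragma once

#include <ATen/ATen.h>
#include "../macros.h"

namespace vision {
namespace ops {

// Only the "public" entry point is declared in the header and exported.
VISION_API at::Tensor my_op_forward_cpu(const at::Tensor& input);

} // namespace ops
} // namespace vision

// my_op_kernel.cpp -- hypothetical example of the matching implementation file
#include "my_op_kernel.h"

namespace vision {
namespace ops {

namespace {

// Helpers live in an anonymous namespace: they get internal linkage, so
// identically named helpers in other kernel files cannot collide at link time.
template <typename scalar_t>
void my_op_forward_kernel_impl(const scalar_t* in, scalar_t* out, int64_t n) {
  for (int64_t i = 0; i != n; ++i)
    out[i] = in[i];
}

} // namespace

at::Tensor my_op_forward_cpu(const at::Tensor& input) {
  at::Tensor input_c = input.contiguous();
  at::Tensor out = at::empty_like(input_c);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input_c.scalar_type(), "my_op_forward_cpu", [&] {
        my_op_forward_kernel_impl<scalar_t>(
            input_c.data_ptr<scalar_t>(),
            out.data_ptr<scalar_t>(),
            input_c.numel());
      });
  return out;
}

} // namespace ops
} // namespace vision
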
@@ -32,9 +32,11 @@ file(GLOB HEADERS torchvision/csrc/*.h)
 # Image extension
 file(GLOB IMAGE_HEADERS torchvision/csrc/cpu/image/*.h)
 file(GLOB IMAGE_SOURCES torchvision/csrc/cpu/image/*.cpp)
-file(GLOB OPERATOR_SOURCES torchvision/csrc/cpu/*.h torchvision/csrc/cpu/*.cpp ${IMAGE_HEADERS} ${IMAGE_SOURCES} ${HEADERS} torchvision/csrc/*.cpp)
+file(GLOB OPERATOR_HEADERS torchvision/csrc/cpu/*.h)
+file(GLOB OPERATOR_SOURCES ${OPERATOR_HEADERS} torchvision/csrc/cpu/*.cpp ${IMAGE_HEADERS} ${IMAGE_SOURCES} ${HEADERS} torchvision/csrc/*.cpp)
 if(WITH_CUDA)
-    file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} torchvision/csrc/cuda/*.h torchvision/csrc/cuda/*.cu)
+    file(GLOB OPERATOR_HEADERS ${OPERATOR_HEADERS} torchvision/csrc/cuda/*.h)
+    file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} ${OPERATOR_HEADERS} torchvision/csrc/cuda/*.cu)
 endif()
 file(GLOB MODELS_HEADERS torchvision/csrc/models/*.h)
 file(GLOB MODELS_SOURCES torchvision/csrc/models/*.h torchvision/csrc/models/*.cpp)
@@ -95,11 +97,11 @@ install(EXPORT TorchVisionTargets
 install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME})
 install(FILES
-    torchvision/csrc/cpu/vision_cpu.h
+    ${OPERATOR_HEADERS}
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cpu)
 if(WITH_CUDA)
   install(FILES
-      torchvision/csrc/cuda/vision_cuda.h
+      ${OPERATOR_HEADERS}
       DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cuda)
 endif()
 install(FILES ${MODELS_HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/models)
@@ -152,8 +152,8 @@ def get_extensions():
         )
         source_cuda = glob.glob(os.path.join(extensions_dir, 'hip', '*.hip'))
         # Copy over additional files
-        shutil.copy("torchvision/csrc/cuda/cuda_helpers.h", "torchvision/csrc/hip/cuda_helpers.h")
-        shutil.copy("torchvision/csrc/cuda/vision_cuda.h", "torchvision/csrc/hip/vision_cuda.h")
+        for file in glob.glob(r"torchvision/csrc/cuda/*.h"):
+            shutil.copy(file, "torchvision/csrc/hip")
     else:
         source_cuda = glob.glob(os.path.join(extensions_dir, 'cuda', '*.cu'))
......
 #include <ATen/ATen.h>
 #include <torch/script.h>
 #include <torch/torch.h>
-#include <torchvision/ROIAlign.h>
-#include <torchvision/cpu/vision_cpu.h>
+#include <torchvision/roi_align.h>
 #include <torchvision/nms.h>
 #ifdef _WIN32
 // Windows only
 // This is necessary until operators are automatically registered on include
-static auto _nms = &nms_cpu;
+static auto _nms = &vision::ops::nms_cpu;
 #endif
 int main() {
......
#pragma once
#if defined(WITH_CUDA) || defined(WITH_HIP)
#include <ATen/autocast_mode.h>
#endif
...@@ -66,18 +66,17 @@ ...@@ -66,18 +66,17 @@
// modified from // modified from
// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp
#include <ATen/ATen.h> #include "deform_conv2d_kernel.h"
#include <ATen/TensorUtils.h>
#include <TH/TH.h>
#include <cmath> namespace vision {
#include <iostream> namespace ops {
#include <tuple>
namespace {
const int kMaxParallelImgs = 32; const int kMaxParallelImgs = 32;
template <typename scalar_t> template <typename scalar_t>
static scalar_t bilinear_interpolate( scalar_t bilinear_interpolate(
const scalar_t* in, const scalar_t* in,
int height, int height,
int width, int width,
...@@ -116,7 +115,7 @@ static scalar_t bilinear_interpolate( ...@@ -116,7 +115,7 @@ static scalar_t bilinear_interpolate(
} }
template <typename scalar_t> template <typename scalar_t>
static void deformable_im2col_kernel( void deformable_im2col_kernel(
int n, int n,
const scalar_t* input, const scalar_t* input,
const scalar_t* offset, const scalar_t* offset,
...@@ -129,8 +128,8 @@ static void deformable_im2col_kernel( ...@@ -129,8 +128,8 @@ static void deformable_im2col_kernel(
int pad_w, int pad_w,
int stride_h, int stride_h,
int stride_w, int stride_w,
int dil_h, int dilation_h,
int dil_w, int dilation_w,
int batch_sz, int batch_sz,
int n_in_channels, int n_in_channels,
int n_offset_grps, int n_offset_grps,
...@@ -180,8 +179,10 @@ static void deformable_im2col_kernel( ...@@ -180,8 +179,10 @@ static void deformable_im2col_kernel(
offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x]; offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x];
const scalar_t offset_w = offset_ptr const scalar_t offset_w = offset_ptr
[(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x]; [(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x];
const scalar_t y = (out_y * stride_h - pad_h) + i * dil_h + offset_h; const scalar_t y =
const scalar_t x = (out_x * stride_w - pad_w) + j * dil_w + offset_w; (out_y * stride_h - pad_h) + i * dilation_h + offset_h;
const scalar_t x =
(out_x * stride_w - pad_w) + j * dilation_w + offset_w;
*columns_ptr = *columns_ptr =
mask_value * bilinear_interpolate(input_ptr, height, width, y, x); mask_value * bilinear_interpolate(input_ptr, height, width, y, x);
columns_ptr += batch_sz * out_h * out_w; columns_ptr += batch_sz * out_h * out_w;
...@@ -190,7 +191,7 @@ static void deformable_im2col_kernel( ...@@ -190,7 +191,7 @@ static void deformable_im2col_kernel(
} }
} }
static void deformable_im2col( void deformable_im2col(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& data_offset, const at::Tensor& data_offset,
const at::Tensor& data_mask, const at::Tensor& data_mask,
...@@ -203,8 +204,8 @@ static void deformable_im2col( ...@@ -203,8 +204,8 @@ static void deformable_im2col(
int pad_w, int pad_w,
int stride_h, int stride_h,
int stride_w, int stride_w,
int dil_h, int dilation_h,
int dil_w, int dilation_w,
int out_h, int out_h,
int out_w, int out_w,
int parallel_imgs, int parallel_imgs,
...@@ -228,8 +229,8 @@ static void deformable_im2col( ...@@ -228,8 +229,8 @@ static void deformable_im2col(
pad_w, pad_w,
stride_h, stride_h,
stride_w, stride_w,
dil_h, dilation_h,
dil_w, dilation_w,
parallel_imgs, parallel_imgs,
n_in_channels, n_in_channels,
deformable_group, deformable_group,
...@@ -240,7 +241,7 @@ static void deformable_im2col( ...@@ -240,7 +241,7 @@ static void deformable_im2col(
})); }));
} }
static int get_greatest_divisor_below_bound(int n, int bound) { int get_greatest_divisor_below_bound(int n, int bound) {
for (int k = bound; k > 1; --k) { for (int k = bound; k > 1; --k) {
if (n % k == 0) { if (n % k == 0) {
return k; return k;
...@@ -249,216 +250,8 @@ static int get_greatest_divisor_below_bound(int n, int bound) { ...@@ -249,216 +250,8 @@ static int get_greatest_divisor_below_bound(int n, int bound) {
return 1; return 1;
} }
at::Tensor DeformConv2d_forward_cpu(
const at::Tensor& input_param,
const at::Tensor& weight_param,
const at::Tensor& offset_param,
const at::Tensor& mask_param,
const at::Tensor& bias_param,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dil_h,
int64_t dil_w,
int64_t n_weight_grps,
int64_t n_offset_grps,
bool use_mask) {
at::Tensor input = input_param.contiguous();
at::Tensor offset = offset_param.contiguous();
at::Tensor weight = weight_param.contiguous();
at::Tensor mask = mask_param.contiguous();
at::Tensor bias = bias_param.contiguous();
TORCH_CHECK(input.ndimension() == 4);
TORCH_CHECK(offset.ndimension() == 4);
TORCH_CHECK(!use_mask || mask.ndimension() == 4);
TORCH_CHECK(weight.ndimension() == 4);
TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor");
int batch_sz = input.size(0);
int n_in_channels = input.size(1);
int in_h = input.size(2);
int in_w = input.size(3);
int n_parallel_imgs =
get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs);
// Unpack shapes and args
int out_channels = weight.size(0);
int weight_h = weight.size(2);
int weight_w = weight.size(3);
int ker_h = dil_h * (weight_h - 1) + 1;
int ker_w = dil_w * (weight_w - 1) + 1;
int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1;
int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1;
TORCH_CHECK(
weight_h > 0 && weight_w > 0,
"weight_h: ",
weight_h,
" weight_w: ",
weight_w);
TORCH_CHECK(
stride_h > 0 && stride_w > 0,
"stride_h: ",
stride_h,
" stride_w: ",
stride_w);
TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w);
TORCH_CHECK(dil_h > 0 && dil_w > 0, "dil_h: ", dil_h, " dil_w: ", dil_w);
TORCH_CHECK(weight.size(1) * n_weight_grps == input.size(1));
TORCH_CHECK(weight.size(0) % n_weight_grps == 0);
TORCH_CHECK(
(offset.size(1) == n_offset_grps * 2 * weight_h * weight_w),
"offset.shape[1] is not valid: got: ",
offset.size(1),
" expected: ",
n_offset_grps * 2 * weight_h * weight_w);
TORCH_CHECK(
(!use_mask || mask.size(1) == n_offset_grps * weight_h * weight_w),
"mask.shape[1] is not valid: got: ",
mask.size(1),
" expected: ",
n_offset_grps * weight_h * weight_w);
TORCH_CHECK(input.size(1) % n_offset_grps == 0);
TORCH_CHECK(
(offset.size(0) == input.size(0)), "invalid batch size of offset");
TORCH_CHECK(
(offset.size(2) == out_h && offset.size(3) == out_w),
"offset output dims: (",
offset.size(2),
", ",
offset.size(3),
") - ",
"computed output dims: (",
out_h,
", ",
out_w,
")");
TORCH_CHECK((mask.size(0) == input.size(0)), "invalid batch size of mask");
TORCH_CHECK(
(!use_mask || (mask.size(2) == out_h && mask.size(3) == out_w)),
"offset output dims: (",
mask.size(2),
", ",
mask.size(3),
") - ",
"computed output dims: (",
out_h,
", ",
out_w,
")");
TORCH_CHECK(
out_h > 0 && out_w > 0,
"Calculated output size too small - out_h: ",
out_h,
" out_w: ",
out_w);
auto out = at::zeros({batch_sz, out_channels, out_h, out_w}, input.options());
if (batch_sz == 0) {
return out;
}
// Separate batches into blocks
out = out.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
out_channels,
out_h,
out_w});
input = input.view(
{batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w});
offset = offset.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
n_offset_grps * 2 * weight_h * weight_w,
out_h,
out_w});
if (use_mask) {
mask = mask.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
n_offset_grps * weight_h * weight_w,
out_h,
out_w});
}
at::Tensor out_buf = at::zeros(
{batch_sz / n_parallel_imgs,
out_channels,
n_parallel_imgs * out_h,
out_w},
out.options());
// Separate channels into convolution groups
out_buf = out_buf.view({out_buf.size(0),
n_weight_grps,
out_buf.size(1) / n_weight_grps,
out_buf.size(2),
out_buf.size(3)});
weight = weight.view({n_weight_grps,
weight.size(0) / n_weight_grps,
weight.size(1),
weight.size(2),
weight.size(3)});
// Sample points and perform convolution
auto columns = at::zeros(
{n_in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w},
input.options());
for (int b = 0; b < batch_sz / n_parallel_imgs; b++) {
deformable_im2col(
input[b],
offset[b],
mask[b],
n_in_channels,
in_h,
in_w,
weight_h,
weight_w,
pad_h,
pad_w,
stride_h,
stride_w,
dil_h,
dil_w,
out_h,
out_w,
n_parallel_imgs,
n_offset_grps,
use_mask,
columns);
columns = columns.view(
{n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
for (int g = 0; g < n_weight_grps; g++) {
out_buf[b][g] = out_buf[b][g]
.flatten(1)
.addmm_(weight[g].flatten(1), columns[g])
.view_as(out_buf[b][g]);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
}
out_buf = out_buf.view({batch_sz / n_parallel_imgs,
out_channels,
n_parallel_imgs,
out_h,
out_w});
out_buf.transpose_(1, 2);
out.copy_(out_buf);
out = out.view({batch_sz, out_channels, out_h, out_w});
return out + bias.view({1, out_channels, 1, 1});
}
template <typename scalar_t> template <typename scalar_t>
static void deformable_col2im_kernel( void deformable_col2im_kernel(
int n, int n,
const scalar_t* col, const scalar_t* col,
const scalar_t* offset, const scalar_t* offset,
...@@ -533,7 +326,7 @@ static void deformable_col2im_kernel( ...@@ -533,7 +326,7 @@ static void deformable_col2im_kernel(
} }
} }
static void compute_grad_input( void compute_grad_input(
const at::Tensor& columns, const at::Tensor& columns,
const at::Tensor& offset, const at::Tensor& offset,
const at::Tensor& mask, const at::Tensor& mask,
...@@ -560,7 +353,7 @@ static void compute_grad_input( ...@@ -560,7 +353,7 @@ static void compute_grad_input(
channels * weight_h * weight_w * out_h * out_w * parallel_imgs; channels * weight_h * weight_w * out_h * out_w * parallel_imgs;
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
columns.scalar_type(), "deformable_col2im", ([&] { columns.scalar_type(), "compute_grad_input", ([&] {
deformable_col2im_kernel( deformable_col2im_kernel(
num_kernels, num_kernels,
columns.data_ptr<scalar_t>(), columns.data_ptr<scalar_t>(),
...@@ -587,7 +380,7 @@ static void compute_grad_input( ...@@ -587,7 +380,7 @@ static void compute_grad_input(
} }
template <typename scalar_t> template <typename scalar_t>
static scalar_t get_coordinate_weight( scalar_t get_coordinate_weight(
const scalar_t* im_data, const scalar_t* im_data,
int height, int height,
int width, int width,
...@@ -620,7 +413,7 @@ static scalar_t get_coordinate_weight( ...@@ -620,7 +413,7 @@ static scalar_t get_coordinate_weight(
} }
template <typename scalar_t> template <typename scalar_t>
static void deformable_col2im_coord_kernel( void deformable_col2im_coord_kernel(
int n, int n,
const scalar_t* col, const scalar_t* col,
const scalar_t* im, const scalar_t* im,
...@@ -732,7 +525,7 @@ static void deformable_col2im_coord_kernel( ...@@ -732,7 +525,7 @@ static void deformable_col2im_coord_kernel(
} }
} }
static void compute_grad_offset_and_mask( void compute_grad_offset_and_mask(
const at::Tensor& columns, const at::Tensor& columns,
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& offset, const at::Tensor& offset,
...@@ -761,7 +554,7 @@ static void compute_grad_offset_and_mask( ...@@ -761,7 +554,7 @@ static void compute_grad_offset_and_mask(
out_h * out_w * 2 * weight_h * weight_w * n_offset_grps * parallel_imgs; out_h * out_w * 2 * weight_h * weight_w * n_offset_grps * parallel_imgs;
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
columns.scalar_type(), "deformable_col2im_coord", ([&] { columns.scalar_type(), "compute_grad_offset_and_mask", ([&] {
deformable_col2im_coord_kernel( deformable_col2im_coord_kernel(
num_kernels, num_kernels,
columns.data_ptr<scalar_t>(), columns.data_ptr<scalar_t>(),
...@@ -790,8 +583,7 @@ static void compute_grad_offset_and_mask( ...@@ -790,8 +583,7 @@ static void compute_grad_offset_and_mask(
})); }));
} }
static std::tuple<at::Tensor, at::Tensor, at::Tensor> std::tuple<at::Tensor, at::Tensor, at::Tensor> backward_gradient_inputs(
deform_conv2d_backward_input_cpu(
at::Tensor input, at::Tensor input,
at::Tensor weight, at::Tensor weight,
at::Tensor offset, at::Tensor offset,
...@@ -801,8 +593,8 @@ deform_conv2d_backward_input_cpu( ...@@ -801,8 +593,8 @@ deform_conv2d_backward_input_cpu(
int stride_w, int stride_w,
int pad_h, int pad_h,
int pad_w, int pad_w,
int dil_h, int dilation_h,
int dil_w, int dilation_w,
int n_weight_grps, int n_weight_grps,
int n_offset_grps, int n_offset_grps,
int n_parallel_imgs, int n_parallel_imgs,
...@@ -818,8 +610,10 @@ deform_conv2d_backward_input_cpu( ...@@ -818,8 +610,10 @@ deform_conv2d_backward_input_cpu(
int weight_h = weight.size(2); int weight_h = weight.size(2);
int weight_w = weight.size(3); int weight_w = weight.size(3);
long out_h = (in_h + 2 * pad_h - (dil_h * (weight_h - 1) + 1)) / stride_h + 1; long out_h =
long out_w = (in_w + 2 * pad_w - (dil_w * (weight_w - 1) + 1)) / stride_w + 1; (in_h + 2 * pad_h - (dilation_h * (weight_h - 1) + 1)) / stride_h + 1;
long out_w =
(in_w + 2 * pad_w - (dilation_w * (weight_w - 1) + 1)) / stride_w + 1;
auto grad_input = at::zeros_like(input); auto grad_input = at::zeros_like(input);
auto grad_offset = at::zeros_like(offset); auto grad_offset = at::zeros_like(offset);
...@@ -903,8 +697,8 @@ deform_conv2d_backward_input_cpu( ...@@ -903,8 +697,8 @@ deform_conv2d_backward_input_cpu(
pad_w, pad_w,
stride_h, stride_h,
stride_w, stride_w,
dil_h, dilation_h,
dil_w, dilation_w,
n_parallel_imgs, n_parallel_imgs,
n_offset_grps, n_offset_grps,
use_mask, use_mask,
...@@ -924,8 +718,8 @@ deform_conv2d_backward_input_cpu( ...@@ -924,8 +718,8 @@ deform_conv2d_backward_input_cpu(
pad_w, pad_w,
stride_h, stride_h,
stride_w, stride_w,
dil_h, dilation_h,
dil_w, dilation_w,
n_parallel_imgs, n_parallel_imgs,
n_offset_grps, n_offset_grps,
use_mask, use_mask,
...@@ -944,7 +738,7 @@ deform_conv2d_backward_input_cpu( ...@@ -944,7 +738,7 @@ deform_conv2d_backward_input_cpu(
return std::make_tuple(grad_input, grad_offset, grad_mask); return std::make_tuple(grad_input, grad_offset, grad_mask);
} }
static at::Tensor deform_conv2d_backward_parameters_cpu( at::Tensor backward_gradient_parameters(
at::Tensor input, at::Tensor input,
const at::Tensor& weight, const at::Tensor& weight,
at::Tensor offset, at::Tensor offset,
...@@ -954,8 +748,8 @@ static at::Tensor deform_conv2d_backward_parameters_cpu( ...@@ -954,8 +748,8 @@ static at::Tensor deform_conv2d_backward_parameters_cpu(
int stride_w, int stride_w,
int pad_h, int pad_h,
int pad_w, int pad_w,
int dil_h, int dilation_h,
int dil_w, int dilation_w,
int n_weight_grps, int n_weight_grps,
int n_offset_grps, int n_offset_grps,
int n_parallel_imgs, int n_parallel_imgs,
...@@ -1032,8 +826,8 @@ static at::Tensor deform_conv2d_backward_parameters_cpu( ...@@ -1032,8 +826,8 @@ static at::Tensor deform_conv2d_backward_parameters_cpu(
pad_w, pad_w,
stride_h, stride_h,
stride_w, stride_w,
dil_h, dilation_h,
dil_w, dilation_w,
out_h, out_h,
out_w, out_w,
n_parallel_imgs, n_parallel_imgs,
...@@ -1058,46 +852,263 @@ static at::Tensor deform_conv2d_backward_parameters_cpu( ...@@ -1058,46 +852,263 @@ static at::Tensor deform_conv2d_backward_parameters_cpu(
return grad_weight; return grad_weight;
} }
} // namespace
at::Tensor deform_conv2d_forward_cpu(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t n_weight_grps,
int64_t n_offset_grps,
bool use_mask) {
at::Tensor input_c = input.contiguous();
at::Tensor offset_c = offset.contiguous();
at::Tensor weight_c = weight.contiguous();
at::Tensor mask_c = mask.contiguous();
at::Tensor bias_c = bias.contiguous();
TORCH_CHECK(input_c.ndimension() == 4);
TORCH_CHECK(offset_c.ndimension() == 4);
TORCH_CHECK(!use_mask || mask_c.ndimension() == 4);
TORCH_CHECK(weight_c.ndimension() == 4);
TORCH_CHECK(input_c.device().is_cpu(), "input must be a CPU tensor");
int batch_sz = input_c.size(0);
int n_in_channels = input_c.size(1);
int in_h = input_c.size(2);
int in_w = input_c.size(3);
int n_parallel_imgs =
get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs);
// Unpack shapes and args
int out_channels = weight_c.size(0);
int weight_h = weight_c.size(2);
int weight_w = weight_c.size(3);
int ker_h = dilation_h * (weight_h - 1) + 1;
int ker_w = dilation_w * (weight_w - 1) + 1;
int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1;
int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1;
TORCH_CHECK(
weight_h > 0 && weight_w > 0,
"weight_h: ",
weight_h,
" weight_w: ",
weight_w);
TORCH_CHECK(
stride_h > 0 && stride_w > 0,
"stride_h: ",
stride_h,
" stride_w: ",
stride_w);
TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w);
TORCH_CHECK(
dilation_h > 0 && dilation_w > 0,
"dilation_h: ",
dilation_h,
" dilation_w: ",
dilation_w);
TORCH_CHECK(weight_c.size(1) * n_weight_grps == input_c.size(1));
TORCH_CHECK(weight_c.size(0) % n_weight_grps == 0);
TORCH_CHECK(
(offset_c.size(1) == n_offset_grps * 2 * weight_h * weight_w),
"offset.shape[1] is not valid: got: ",
offset_c.size(1),
" expected: ",
n_offset_grps * 2 * weight_h * weight_w);
TORCH_CHECK(
(!use_mask || mask_c.size(1) == n_offset_grps * weight_h * weight_w),
"mask.shape[1] is not valid: got: ",
mask_c.size(1),
" expected: ",
n_offset_grps * weight_h * weight_w);
TORCH_CHECK(input_c.size(1) % n_offset_grps == 0);
TORCH_CHECK(
(offset_c.size(0) == input_c.size(0)), "invalid batch size of offset");
TORCH_CHECK(
(offset_c.size(2) == out_h && offset_c.size(3) == out_w),
"offset output dims: (",
offset_c.size(2),
", ",
offset_c.size(3),
") - ",
"computed output dims: (",
out_h,
", ",
out_w,
")");
TORCH_CHECK(
(mask_c.size(0) == input_c.size(0)), "invalid batch size of mask");
TORCH_CHECK(
(!use_mask || (mask_c.size(2) == out_h && mask_c.size(3) == out_w)),
"offset output dims: (",
mask_c.size(2),
", ",
mask_c.size(3),
") - ",
"computed output dims: (",
out_h,
", ",
out_w,
")");
TORCH_CHECK(
out_h > 0 && out_w > 0,
"Calculated output size too small - out_h: ",
out_h,
" out_w: ",
out_w);
auto out =
at::zeros({batch_sz, out_channels, out_h, out_w}, input_c.options());
if (batch_sz == 0) {
return out;
}
// Separate batches into blocks
out = out.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
out_channels,
out_h,
out_w});
input_c = input_c.view(
{batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w});
offset_c = offset_c.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
n_offset_grps * 2 * weight_h * weight_w,
out_h,
out_w});
if (use_mask) {
mask_c = mask_c.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
n_offset_grps * weight_h * weight_w,
out_h,
out_w});
}
at::Tensor out_buf = at::zeros(
{batch_sz / n_parallel_imgs,
out_channels,
n_parallel_imgs * out_h,
out_w},
out.options());
// Separate channels into convolution groups
out_buf = out_buf.view({out_buf.size(0),
n_weight_grps,
out_buf.size(1) / n_weight_grps,
out_buf.size(2),
out_buf.size(3)});
weight_c = weight_c.view({n_weight_grps,
weight_c.size(0) / n_weight_grps,
weight_c.size(1),
weight_c.size(2),
weight_c.size(3)});
// Sample points and perform convolution
auto columns = at::zeros(
{n_in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w},
input_c.options());
for (int b = 0; b < batch_sz / n_parallel_imgs; b++) {
deformable_im2col(
input_c[b],
offset_c[b],
mask_c[b],
n_in_channels,
in_h,
in_w,
weight_h,
weight_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
out_h,
out_w,
n_parallel_imgs,
n_offset_grps,
use_mask,
columns);
columns = columns.view(
{n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
for (int g = 0; g < n_weight_grps; g++) {
out_buf[b][g] = out_buf[b][g]
.flatten(1)
.addmm_(weight_c[g].flatten(1), columns[g])
.view_as(out_buf[b][g]);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
}
out_buf = out_buf.view({batch_sz / n_parallel_imgs,
out_channels,
n_parallel_imgs,
out_h,
out_w});
out_buf.transpose_(1, 2);
out.copy_(out_buf);
out = out.view({batch_sz, out_channels, out_h, out_w});
return out + bias_c.view({1, out_channels, 1, 1});
}
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
DeformConv2d_backward_cpu( deform_conv2d_backward_cpu(
const at::Tensor& grad_out_param, const at::Tensor& grad_out,
const at::Tensor& input_param, const at::Tensor& input,
const at::Tensor& weight_param, const at::Tensor& weight,
const at::Tensor& offset_param, const at::Tensor& offset,
const at::Tensor& mask_param, const at::Tensor& mask,
const at::Tensor& bias_param, const at::Tensor& bias,
int64_t stride_h, int64_t stride_h,
int64_t stride_w, int64_t stride_w,
int64_t pad_h, int64_t pad_h,
int64_t pad_w, int64_t pad_w,
int64_t dil_h, int64_t dilation_h,
int64_t dil_w, int64_t dilation_w,
int64_t n_weight_grps, int64_t n_weight_grps,
int64_t n_offset_grps, int64_t n_offset_grps,
bool use_mask) { bool use_mask) {
at::Tensor grad_out = grad_out_param.contiguous(); at::Tensor grad_out_c = grad_out.contiguous();
at::Tensor input = input_param.contiguous(); at::Tensor input_c = input.contiguous();
at::Tensor weight = weight_param.contiguous(); at::Tensor weight_c = weight.contiguous();
at::Tensor offset = offset_param.contiguous(); at::Tensor offset_c = offset.contiguous();
at::Tensor mask = mask_param.contiguous(); at::Tensor mask_c = mask.contiguous();
at::Tensor bias = bias_param.contiguous(); at::Tensor bias_c = bias.contiguous();
const int batch_sz = input.size(0); const int batch_sz = input_c.size(0);
const int n_parallel_imgs = const int n_parallel_imgs =
get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs);
auto grad_input_and_offset_and_mask = deform_conv2d_backward_input_cpu( auto grad_input_and_offset_and_mask = backward_gradient_inputs(
input, input_c,
weight, weight_c,
offset, offset_c,
mask, mask_c,
grad_out, grad_out_c,
stride_h, stride_h,
stride_w, stride_w,
pad_h, pad_h,
pad_w, pad_w,
dil_h, dilation_h,
dil_w, dilation_w,
n_weight_grps, n_weight_grps,
n_offset_grps, n_offset_grps,
n_parallel_imgs, n_parallel_imgs,
...@@ -1107,25 +1118,28 @@ DeformConv2d_backward_cpu( ...@@ -1107,25 +1118,28 @@ DeformConv2d_backward_cpu(
auto grad_offset = std::get<1>(grad_input_and_offset_and_mask); auto grad_offset = std::get<1>(grad_input_and_offset_and_mask);
auto grad_mask = std::get<2>(grad_input_and_offset_and_mask); auto grad_mask = std::get<2>(grad_input_and_offset_and_mask);
auto grad_weight = deform_conv2d_backward_parameters_cpu( auto grad_weight = backward_gradient_parameters(
input, input_c,
weight, weight_c,
offset, offset_c,
mask, mask_c,
grad_out, grad_out_c,
stride_h, stride_h,
stride_w, stride_w,
pad_h, pad_h,
pad_w, pad_w,
dil_h, dilation_h,
dil_w, dilation_w,
n_weight_grps, n_weight_grps,
n_offset_grps, n_offset_grps,
n_parallel_imgs, n_parallel_imgs,
use_mask); use_mask);
auto grad_bias = at::ones_like(bias) * grad_out.sum({0, 2, 3}); auto grad_bias = at::ones_like(bias_c) * grad_out_c.sum({0, 2, 3});
return std::make_tuple( return std::make_tuple(
grad_input, grad_weight, grad_offset, grad_mask, grad_bias); grad_input, grad_weight, grad_offset, grad_mask, grad_bias);
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API at::Tensor deform_conv2d_forward_cpu(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t n_weight_grps,
int64_t n_offset_grps,
bool use_mask);
VISION_API std::
tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
deform_conv2d_backward_cpu(
const at::Tensor& grad_out,
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t n_weight_grps,
int64_t n_offset_grps,
bool use_mask);
} // namespace ops
} // namespace vision
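
For orientation, a hedged sketch of calling the CPU forward entry point declared above from standalone C++ (the include path follows the install layout from the CMake changes, and all shapes and values are invented; the offset/mask channel counts follow the TORCH_CHECKs in deform_conv2d_kernel.cpp):

#include <torch/torch.h>
#include <torchvision/cpu/deform_conv2d_kernel.h>

int main() {
  // Batch 2, 3 input channels, 8x8 images; 4 output channels, 3x3 kernel,
  // stride 1, padding 1, dilation 1 -> 8x8 output; one weight/offset group.
  at::Tensor input  = at::rand({2, 3, 8, 8});
  at::Tensor weight = at::rand({4, 3, 3, 3});
  at::Tensor offset = at::rand({2, 2 * 3 * 3, 8, 8}); // n_offset_grps * 2 * weight_h * weight_w
  at::Tensor mask   = at::rand({2, 3 * 3, 8, 8});     // n_offset_grps * weight_h * weight_w
  at::Tensor bias   = at::zeros({4});

  at::Tensor out = vision::ops::deform_conv2d_forward_cpu(
      input, weight, offset, mask, bias,
      /*stride_h=*/1, /*stride_w=*/1,
      /*pad_h=*/1, /*pad_w=*/1,
      /*dilation_h=*/1, /*dilation_w=*/1,
      /*n_weight_grps=*/1, /*n_offset_grps=*/1,
      /*use_mask=*/true);
  // out has shape {2, 4, 8, 8}; bias is broadcast as {1, out_channels, 1, 1}.
  return 0;
}

The backward entry point mirrors this call and returns the five gradients (input, weight, offset, mask, bias) as a single tuple.
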
#include "vision_cpu.h" #include "nms_kernel.h"
namespace vision {
namespace ops {
namespace {
template <typename scalar_t> template <typename scalar_t>
at::Tensor nms_cpu_kernel( at::Tensor nms_kernel_impl(
const at::Tensor& dets, const at::Tensor& dets,
const at::Tensor& scores, const at::Tensor& scores,
double iou_threshold) { double iou_threshold) {
...@@ -69,6 +74,8 @@ at::Tensor nms_cpu_kernel( ...@@ -69,6 +74,8 @@ at::Tensor nms_cpu_kernel(
return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
} }
} // namespace
at::Tensor nms_cpu( at::Tensor nms_cpu(
const at::Tensor& dets, const at::Tensor& dets,
const at::Tensor& scores, const at::Tensor& scores,
...@@ -94,8 +101,11 @@ at::Tensor nms_cpu( ...@@ -94,8 +101,11 @@ at::Tensor nms_cpu(
auto result = at::empty({0}, dets.options()); auto result = at::empty({0}, dets.options());
AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_cpu", [&] {
result = nms_cpu_kernel<scalar_t>(dets, scores, iou_threshold); result = nms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
}); });
return result; return result;
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API at::Tensor nms_cpu(
const at::Tensor& dets,
const at::Tensor& scores,
double iou_threshold);
} // namespace ops
} // namespace vision
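
A short usage sketch for the entry point above; by torchvision convention dets is an [N, 4] tensor of (x1, y1, x2, y2) boxes and scores is [N]. The literal values here are invented for the example:

#include <torch/torch.h>
#include <torchvision/cpu/nms_kernel.h>

int main() {
  // Two strongly overlapping boxes plus one far-away box.
  at::Tensor dets = torch::tensor({ 0.f,  0.f, 10.f, 10.f,
                                    1.f,  1.f, 11.f, 11.f,
                                   50.f, 50.f, 60.f, 60.f}).view({3, 4});
  at::Tensor scores = torch::tensor({0.9f, 0.8f, 0.7f});

  // Returns the indices of the kept boxes in descending score order;
  // with an IoU threshold of 0.5 the expected result is {0, 2}.
  at::Tensor keep = vision::ops::nms_cpu(dets, scores, /*iou_threshold=*/0.5);
  return 0;
}
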
#include <ATen/ATen.h> #include "ps_roi_align_kernel.h"
#include <ATen/TensorUtils.h>
#include <TH/TH.h> namespace vision {
namespace ops {
namespace {
template <typename T> template <typename T>
T bilinear_interpolate( T bilinear_interpolate(
...@@ -57,7 +60,7 @@ T bilinear_interpolate( ...@@ -57,7 +60,7 @@ T bilinear_interpolate(
} }
template <typename T> template <typename T>
void PSROIAlignForwardCPU( void ps_roi_align_forward_kernel_impl(
int nthreads, int nthreads,
const T* input, const T* input,
const T spatial_scale, const T spatial_scale,
...@@ -202,7 +205,7 @@ inline void add(T* address, const T& val) { ...@@ -202,7 +205,7 @@ inline void add(T* address, const T& val) {
} }
template <typename T> template <typename T>
void PSROIAlignBackwardCPU( void ps_roi_align_backward_kernel_impl(
int nthreads, int nthreads,
const T* grad_output, const T* grad_output,
const int* channel_mapping, const int* channel_mapping,
...@@ -298,7 +301,9 @@ void PSROIAlignBackwardCPU( ...@@ -298,7 +301,9 @@ void PSROIAlignBackwardCPU(
} }
} }
std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cpu( } // namespace
std::tuple<at::Tensor, at::Tensor> ps_roi_align_forward_cpu(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& rois, const at::Tensor& rois,
double spatial_scale, double spatial_scale,
...@@ -313,7 +318,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cpu( ...@@ -313,7 +318,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cpu(
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "PSROIAlign_forward_cpu"; at::CheckedFrom c = "ps_roi_align_forward_cpu";
at::checkAllSameType(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t});
int num_rois = rois.size(0); int num_rois = rois.size(0);
...@@ -338,8 +343,8 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cpu( ...@@ -338,8 +343,8 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cpu(
auto input_ = input.contiguous(), rois_ = rois.contiguous(); auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "PSROIAlign_forward", [&] { input.scalar_type(), "ps_roi_align_forward_cpu", [&] {
PSROIAlignForwardCPU<scalar_t>( ps_roi_align_forward_kernel_impl<scalar_t>(
output_size, output_size,
input_.data_ptr<scalar_t>(), input_.data_ptr<scalar_t>(),
spatial_scale, spatial_scale,
...@@ -357,7 +362,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cpu( ...@@ -357,7 +362,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cpu(
return std::make_tuple(output, channel_mapping); return std::make_tuple(output, channel_mapping);
} }
at::Tensor PSROIAlign_backward_cpu( at::Tensor ps_roi_align_backward_cpu(
const at::Tensor& grad, const at::Tensor& grad,
const at::Tensor& rois, const at::Tensor& rois,
const at::Tensor& channel_mapping, const at::Tensor& channel_mapping,
...@@ -379,7 +384,7 @@ at::Tensor PSROIAlign_backward_cpu( ...@@ -379,7 +384,7 @@ at::Tensor PSROIAlign_backward_cpu(
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}, at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2},
channel_mapping_t{channel_mapping, "channel_mapping", 3}; channel_mapping_t{channel_mapping, "channel_mapping", 3};
at::CheckedFrom c = "PSROIAlign_backward_cpu"; at::CheckedFrom c = "ps_roi_align_backward_cpu";
at::checkAllSameType(c, {grad_t, rois_t}); at::checkAllSameType(c, {grad_t, rois_t});
auto num_rois = rois.size(0); auto num_rois = rois.size(0);
...@@ -395,8 +400,8 @@ at::Tensor PSROIAlign_backward_cpu( ...@@ -395,8 +400,8 @@ at::Tensor PSROIAlign_backward_cpu(
auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "PSROIAlign_backward", [&] { grad.scalar_type(), "ps_roi_align_backward_cpu", [&] {
PSROIAlignBackwardCPU<scalar_t>( ps_roi_align_backward_kernel_impl<scalar_t>(
grad.numel(), grad.numel(),
grad_.data_ptr<scalar_t>(), grad_.data_ptr<scalar_t>(),
channel_mapping.data_ptr<int>(), channel_mapping.data_ptr<int>(),
...@@ -414,3 +419,6 @@ at::Tensor PSROIAlign_backward_cpu( ...@@ -414,3 +419,6 @@ at::Tensor PSROIAlign_backward_cpu(
}); });
return grad_input; return grad_input;
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API std::tuple<at::Tensor, at::Tensor> ps_roi_align_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio);
VISION_API at::Tensor ps_roi_align_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& channel_mapping,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width);
} // namespace ops
} // namespace vision
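
A hedged round-trip sketch for the pair of entry points above: the forward call returns both the pooled output and the channel_mapping tensor, and the backward call consumes that mapping. Shapes are invented; for the position-sensitive ops the input channel count must be divisible by pooled_height * pooled_width (here 8 = 2 * 2 * 2):

#include <torch/torch.h>
#include <torchvision/cpu/ps_roi_align_kernel.h>

#include <tuple>

int main() {
  // 1 image, 8 channels, 16x16 feature map; one RoI over its top-left corner.
  at::Tensor input = at::rand({1, 8, 16, 16});
  at::Tensor rois = torch::tensor({0.f, 0.f, 0.f, 8.f, 8.f}).view({1, 5}); // (batch_idx, x1, y1, x2, y2)

  at::Tensor output, channel_mapping;
  std::tie(output, channel_mapping) = vision::ops::ps_roi_align_forward_cpu(
      input, rois, /*spatial_scale=*/1.0,
      /*pooled_height=*/2, /*pooled_width=*/2, /*sampling_ratio=*/2);
  // output: {1, 2, 2, 2} (8 channels folded into a 2x2 position-sensitive grid);
  // channel_mapping records which input channel produced each output element.

  at::Tensor grad_input = vision::ops::ps_roi_align_backward_cpu(
      at::ones_like(output), rois, channel_mapping,
      /*spatial_scale=*/1.0, /*pooled_height=*/2, /*pooled_width=*/2,
      /*sampling_ratio=*/2, /*batch_size=*/1, /*channels=*/8,
      /*height=*/16, /*width=*/16);
  // grad_input: {1, 8, 16, 16}
  return 0;
}
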
#include <ATen/ATen.h> #include "ps_roi_pool_kernel.h"
#include <ATen/TensorUtils.h>
#include <TH/TH.h> namespace vision {
#include <algorithm> namespace ops {
namespace {
template <class T> template <class T>
inline void add(T* address, const T& val) { inline void add(T* address, const T& val) {
...@@ -9,7 +11,7 @@ inline void add(T* address, const T& val) { ...@@ -9,7 +11,7 @@ inline void add(T* address, const T& val) {
} }
template <typename T> template <typename T>
void PSROIPoolForward( void ps_roi_pool_forward_kernel_impl(
const T* input, const T* input,
const T spatial_scale, const T spatial_scale,
int channels, int channels,
...@@ -79,7 +81,7 @@ void PSROIPoolForward( ...@@ -79,7 +81,7 @@ void PSROIPoolForward(
} }
template <typename T> template <typename T>
void PSROIPoolBackward( void ps_roi_pool_backward_kernel_impl(
const T* grad_output, const T* grad_output,
const int* channel_mapping, const int* channel_mapping,
int num_rois, int num_rois,
...@@ -143,7 +145,9 @@ void PSROIPoolBackward( ...@@ -143,7 +145,9 @@ void PSROIPoolBackward(
} }
} }
std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cpu( } // namespace
std::tuple<at::Tensor, at::Tensor> ps_roi_pool_forward_cpu(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& rois, const at::Tensor& rois,
double spatial_scale, double spatial_scale,
...@@ -157,7 +161,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cpu( ...@@ -157,7 +161,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cpu(
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "PSROIPool_forward_cpu"; at::CheckedFrom c = "ps_roi_pool_forward_cpu";
at::checkAllSameType(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t});
int num_rois = rois.size(0); int num_rois = rois.size(0);
...@@ -182,8 +186,8 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cpu( ...@@ -182,8 +186,8 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cpu(
auto input_ = input.contiguous(), rois_ = rois.contiguous(); auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "PSROIPool_forward", [&] { input.scalar_type(), "ps_roi_pool_forward_cpu", [&] {
PSROIPoolForward<scalar_t>( ps_roi_pool_forward_kernel_impl<scalar_t>(
input_.data_ptr<scalar_t>(), input_.data_ptr<scalar_t>(),
spatial_scale, spatial_scale,
channels, channels,
...@@ -200,7 +204,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cpu( ...@@ -200,7 +204,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cpu(
return std::make_tuple(output, channel_mapping); return std::make_tuple(output, channel_mapping);
} }
at::Tensor PSROIPool_backward_cpu( at::Tensor ps_roi_pool_backward_cpu(
const at::Tensor& grad, const at::Tensor& grad,
const at::Tensor& rois, const at::Tensor& rois,
const at::Tensor& channel_mapping, const at::Tensor& channel_mapping,
...@@ -221,7 +225,7 @@ at::Tensor PSROIPool_backward_cpu( ...@@ -221,7 +225,7 @@ at::Tensor PSROIPool_backward_cpu(
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}, at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2},
channel_mapping_t{channel_mapping, "channel_mapping", 3}; channel_mapping_t{channel_mapping, "channel_mapping", 3};
at::CheckedFrom c = "PSROIPool_backward_cpu"; at::CheckedFrom c = "ps_roi_pool_backward_cpu";
at::checkAllSameType(c, {grad_t, rois_t}); at::checkAllSameType(c, {grad_t, rois_t});
auto num_rois = rois.size(0); auto num_rois = rois.size(0);
...@@ -237,8 +241,8 @@ at::Tensor PSROIPool_backward_cpu( ...@@ -237,8 +241,8 @@ at::Tensor PSROIPool_backward_cpu(
auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "PSROIPool_backward", [&] { grad.scalar_type(), "ps_roi_pool_backward_cpu", [&] {
PSROIPoolBackward<scalar_t>( ps_roi_pool_backward_kernel_impl<scalar_t>(
grad_.data_ptr<scalar_t>(), grad_.data_ptr<scalar_t>(),
channel_mapping.data_ptr<int>(), channel_mapping.data_ptr<int>(),
num_rois, num_rois,
...@@ -254,3 +258,6 @@ at::Tensor PSROIPool_backward_cpu( ...@@ -254,3 +258,6 @@ at::Tensor PSROIPool_backward_cpu(
}); });
return grad_input; return grad_input;
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API std::tuple<at::Tensor, at::Tensor> ps_roi_pool_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width);
VISION_API at::Tensor ps_roi_pool_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& channel_mapping,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width);
} // namespace ops
} // namespace vision
#include <ATen/TensorUtils.h> #include "roi_align_kernel.h"
#include "vision_cpu.h"
namespace vision {
namespace ops {
namespace {
// implementation taken from Caffe2 // implementation taken from Caffe2
template <typename T> template <typename T>
...@@ -111,7 +115,7 @@ void pre_calc_for_bilinear_interpolate( ...@@ -111,7 +115,7 @@ void pre_calc_for_bilinear_interpolate(
} }
template <typename T> template <typename T>
void ROIAlignForward( void roi_align_forward_kernel_impl(
int nthreads, int nthreads,
const T* input, const T* input,
const T& spatial_scale, const T& spatial_scale,
...@@ -277,7 +281,7 @@ inline void add(T* address, const T& val) { ...@@ -277,7 +281,7 @@ inline void add(T* address, const T& val) {
} }
template <typename T> template <typename T>
void ROIAlignBackward( void roi_align_backward_kernel_impl(
int nthreads, int nthreads,
const T* grad_output, const T* grad_output,
const T& spatial_scale, const T& spatial_scale,
...@@ -382,9 +386,11 @@ void ROIAlignBackward( ...@@ -382,9 +386,11 @@ void ROIAlignBackward(
} // ix } // ix
} // iy } // iy
} // for } // for
} // ROIAlignBackward }
at::Tensor ROIAlign_forward_cpu( } // namespace
at::Tensor roi_align_forward_cpu(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& rois, const at::Tensor& rois,
double spatial_scale, double spatial_scale,
...@@ -398,7 +404,7 @@ at::Tensor ROIAlign_forward_cpu( ...@@ -398,7 +404,7 @@ at::Tensor ROIAlign_forward_cpu(
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_forward_cpu"; at::CheckedFrom c = "roi_align_forward_cpu";
at::checkAllSameType(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t});
auto num_rois = rois.size(0); auto num_rois = rois.size(0);
...@@ -416,8 +422,8 @@ at::Tensor ROIAlign_forward_cpu( ...@@ -416,8 +422,8 @@ at::Tensor ROIAlign_forward_cpu(
auto input_ = input.contiguous(), rois_ = rois.contiguous(); auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlign_forward", [&] { input.scalar_type(), "roi_align_forward_cpu", [&] {
ROIAlignForward<scalar_t>( roi_align_forward_kernel_impl<scalar_t>(
output_size, output_size,
input_.data_ptr<scalar_t>(), input_.data_ptr<scalar_t>(),
spatial_scale, spatial_scale,
...@@ -434,7 +440,7 @@ at::Tensor ROIAlign_forward_cpu( ...@@ -434,7 +440,7 @@ at::Tensor ROIAlign_forward_cpu(
return output; return output;
} }
at::Tensor ROIAlign_backward_cpu( at::Tensor roi_align_backward_cpu(
const at::Tensor& grad, const at::Tensor& grad,
const at::Tensor& rois, const at::Tensor& rois,
double spatial_scale, double spatial_scale,
...@@ -451,7 +457,7 @@ at::Tensor ROIAlign_backward_cpu( ...@@ -451,7 +457,7 @@ at::Tensor ROIAlign_backward_cpu(
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_backward_cpu"; at::CheckedFrom c = "roi_align_backward_cpu";
at::checkAllSameType(c, {grad_t, rois_t}); at::checkAllSameType(c, {grad_t, rois_t});
at::Tensor grad_input = at::Tensor grad_input =
...@@ -470,8 +476,8 @@ at::Tensor ROIAlign_backward_cpu( ...@@ -470,8 +476,8 @@ at::Tensor ROIAlign_backward_cpu(
auto rois_ = rois.contiguous(); auto rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "ROIAlign_forward", [&] { grad.scalar_type(), "roi_align_backward_cpu", [&] {
ROIAlignBackward<scalar_t>( roi_align_backward_kernel_impl<scalar_t>(
grad.numel(), grad.numel(),
grad.data_ptr<scalar_t>(), grad.data_ptr<scalar_t>(),
spatial_scale, spatial_scale,
...@@ -491,3 +497,6 @@ at::Tensor ROIAlign_backward_cpu( ...@@ -491,3 +497,6 @@ at::Tensor ROIAlign_backward_cpu(
}); });
return grad_input; return grad_input;
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API at::Tensor roi_align_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio,
bool aligned);
VISION_API at::Tensor roi_align_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t sampling_ratio,
bool aligned);
} // namespace ops
} // namespace vision
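
The calling convention for roi_align is the same, with no channel-divisibility constraint but with the extra aligned flag (the half-pixel coordinate correction exposed in Python). A brief sketch with invented sizes:

#include <torch/torch.h>
#include <torchvision/cpu/roi_align_kernel.h>

int main() {
  at::Tensor input = at::rand({1, 3, 32, 32});
  at::Tensor rois = torch::tensor({0.f, 4.f, 4.f, 20.f, 20.f}).view({1, 5}); // (batch_idx, x1, y1, x2, y2)

  at::Tensor output = vision::ops::roi_align_forward_cpu(
      input, rois, /*spatial_scale=*/1.0,
      /*pooled_height=*/7, /*pooled_width=*/7,
      /*sampling_ratio=*/2, /*aligned=*/true);
  // output: {1, 3, 7, 7}
  return 0;
}
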
#include <ATen/ATen.h> #include <float.h>
#include <ATen/TensorUtils.h>
#include <TH/TH.h> #include "roi_pool_kernel.h"
#include <algorithm>
namespace vision {
namespace ops {
namespace {
template <class T> template <class T>
inline void add(T* address, const T& val) { inline void add(T* address, const T& val) {
...@@ -9,7 +13,7 @@ inline void add(T* address, const T& val) { ...@@ -9,7 +13,7 @@ inline void add(T* address, const T& val) {
} }
template <typename T> template <typename T>
void RoIPoolForward( void roi_pool_forward_kernel_impl(
const T* input, const T* input,
const T spatial_scale, const T spatial_scale,
int channels, int channels,
...@@ -78,7 +82,7 @@ void RoIPoolForward( ...@@ -78,7 +82,7 @@ void RoIPoolForward(
} }
template <typename T> template <typename T>
void RoIPoolBackward( void roi_pool_backward_kernel_impl(
const T* grad_output, const T* grad_output,
const int* argmax_data, const int* argmax_data,
int num_rois, int num_rois,
...@@ -120,7 +124,9 @@ void RoIPoolBackward( ...@@ -120,7 +124,9 @@ void RoIPoolBackward(
} // num_rois } // num_rois
} }
std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cpu( } // namespace
std::tuple<at::Tensor, at::Tensor> roi_pool_forward_cpu(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& rois, const at::Tensor& rois,
double spatial_scale, double spatial_scale,
...@@ -131,7 +137,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cpu( ...@@ -131,7 +137,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cpu(
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIPool_forward_cpu"; at::CheckedFrom c = "roi_pool_forward_cpu";
at::checkAllSameType(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t});
int num_rois = rois.size(0); int num_rois = rois.size(0);
...@@ -151,8 +157,8 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cpu( ...@@ -151,8 +157,8 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cpu(
auto input_ = input.contiguous(), rois_ = rois.contiguous(); auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIPool_forward", [&] { input.scalar_type(), "roi_pool_forward_cpu", [&] {
RoIPoolForward<scalar_t>( roi_pool_forward_kernel_impl<scalar_t>(
input_.data_ptr<scalar_t>(), input_.data_ptr<scalar_t>(),
spatial_scale, spatial_scale,
channels, channels,
...@@ -168,7 +174,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cpu( ...@@ -168,7 +174,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cpu(
return std::make_tuple(output, argmax); return std::make_tuple(output, argmax);
} }
at::Tensor ROIPool_backward_cpu( at::Tensor roi_pool_backward_cpu(
const at::Tensor& grad, const at::Tensor& grad,
const at::Tensor& rois, const at::Tensor& rois,
const at::Tensor& argmax, const at::Tensor& argmax,
...@@ -188,7 +194,7 @@ at::Tensor ROIPool_backward_cpu( ...@@ -188,7 +194,7 @@ at::Tensor ROIPool_backward_cpu(
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIPool_backward_cpu"; at::CheckedFrom c = "roi_pool_backward_cpu";
at::checkAllSameType(c, {grad_t, rois_t}); at::checkAllSameType(c, {grad_t, rois_t});
auto num_rois = rois.size(0); auto num_rois = rois.size(0);
...@@ -209,8 +215,8 @@ at::Tensor ROIPool_backward_cpu( ...@@ -209,8 +215,8 @@ at::Tensor ROIPool_backward_cpu(
auto rois_ = rois.contiguous(); auto rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "ROIPool_backward", [&] { grad.scalar_type(), "roi_pool_backward_cpu", [&] {
RoIPoolBackward<scalar_t>( roi_pool_backward_kernel_impl<scalar_t>(
grad.data_ptr<scalar_t>(), grad.data_ptr<scalar_t>(),
argmax.data_ptr<int>(), argmax.data_ptr<int>(),
num_rois, num_rois,
...@@ -228,3 +234,6 @@ at::Tensor ROIPool_backward_cpu( ...@@ -228,3 +234,6 @@ at::Tensor ROIPool_backward_cpu(
}); });
return grad_input; return grad_input;
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API std::tuple<at::Tensor, at::Tensor> roi_pool_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width);
VISION_API at::Tensor roi_pool_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& argmax,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width);
} // namespace ops
} // namespace vision
-#ifndef REGISTER_H
-#define REGISTER_H
+#pragma once
 #include "Video.h"
@@ -15,4 +14,3 @@ static auto registerVideo =
         .def("next", &Video::Next);
 } // namespace
-#endif
#pragma once #pragma once
namespace vision {
namespace ops {
#define CUDA_1D_KERNEL_LOOP(i, n) \ #define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = (blockIdx.x * blockDim.x) + threadIdx.x; i < (n); \ for (int i = (blockIdx.x * blockDim.x) + threadIdx.x; i < (n); \
i += (blockDim.x * gridDim.x)) i += (blockDim.x * gridDim.x))
...@@ -8,3 +11,6 @@ template <typename integer> ...@@ -8,3 +11,6 @@ template <typename integer>
constexpr __host__ __device__ inline integer ceil_div(integer n, integer m) { constexpr __host__ __device__ inline integer ceil_div(integer n, integer m) {
return (n + m - 1) / m; return (n + m - 1) / m;
} }
} // namespace ops
} // namespace vision
...@@ -66,17 +66,17 @@ ...@@ -66,17 +66,17 @@
// modified from // modified from
// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <THC/THCAtomics.cuh> #include <THC/THCAtomics.cuh>
#include "cuda_helpers.h" #include "cuda_helpers.h"
#include "deform_conv2d_kernel.h"
#include <cmath> namespace vision {
#include <iostream> namespace ops {
#include <tuple>
namespace {
const int kMaxParallelImgs = 32; const int kMaxParallelImgs = 32;
...@@ -90,7 +90,9 @@ inline unsigned int GET_THREADS() { ...@@ -90,7 +90,9 @@ inline unsigned int GET_THREADS() {
return 512; return 512;
} }
inline unsigned int GET_BLOCKS(const unsigned int THREADS, const unsigned int N) { inline unsigned int GET_BLOCKS(
const unsigned int THREADS,
const unsigned int N) {
unsigned int kMaxGridNum = unsigned int kMaxGridNum =
at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
return std::min(kMaxGridNum, (N + THREADS - 1) / THREADS); return std::min(kMaxGridNum, (N + THREADS - 1) / THREADS);
...@@ -136,7 +138,7 @@ __device__ scalar_t bilinear_interpolate( ...@@ -136,7 +138,7 @@ __device__ scalar_t bilinear_interpolate(
} }
template <typename scalar_t> template <typename scalar_t>
__global__ void deformable_im2col_gpu_kernel( __global__ void deformable_im2col_kernel(
int n, int n,
const scalar_t* input_ptr, const scalar_t* input_ptr,
const scalar_t* offset_ptr, const scalar_t* offset_ptr,
...@@ -149,8 +151,8 @@ __global__ void deformable_im2col_gpu_kernel( ...@@ -149,8 +151,8 @@ __global__ void deformable_im2col_gpu_kernel(
int pad_w, int pad_w,
int stride_h, int stride_h,
int stride_w, int stride_w,
int dil_h, int dilation_h,
int dil_w, int dilation_w,
int batch_sz, int batch_sz,
int n_in_channels, int n_in_channels,
int n_offset_grps, int n_offset_grps,
@@ -198,8 +200,10 @@ __global__ void deformable_im2col_gpu_kernel(
            offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x];
        const scalar_t offset_w = offset_ptr
            [(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x];
        const scalar_t y =
            (out_y * stride_h - pad_h) + i * dilation_h + offset_h;
        const scalar_t x =
            (out_x * stride_w - pad_w) + j * dilation_w + offset_w;
        *columns_ptr =
            mask_value * bilinear_interpolate(input_ptr, height, width, y, x);
        columns_ptr += batch_sz * out_h * out_w;
@@ -208,7 +212,7 @@ __global__ void deformable_im2col_gpu_kernel(
  }
}
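Each sampling location above is the regular dilated-convolution tap shifted by the learned offset, and the (generally fractional) result is read back through bilinear_interpolate. A host-side restatement of that arithmetic, as a hypothetical helper for illustration only:

// Hypothetical restatement of the sampling-point arithmetic used by the kernel above.
template <typename scalar_t>
scalar_t sample_y(int out_y, int stride_h, int pad_h, int i, int dilation_h, scalar_t offset_h) {
  // base tap of a standard dilated convolution, shifted by the learned offset
  return (out_y * stride_h - pad_h) + i * dilation_h + offset_h;
}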
void deformable_im2col(
    const at::Tensor& input,
    const at::Tensor& data_offset,
    const at::Tensor& data_mask,
@@ -221,8 +225,8 @@ static void deformable_im2col(
    int pad_w,
    int stride_h,
    int stride_w,
    int dilation_h,
    int dilation_w,
    int out_h,
    int out_w,
    int parallel_imgs,
@@ -235,10 +239,8 @@ static void deformable_im2col(
  const unsigned int blocks = GET_BLOCKS(threads, num_kernels);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "deformable_im2col", ([&] {
        deformable_im2col_kernel<<<blocks, threads>>>(
            num_kernels,
            input.data_ptr<scalar_t>(),
            data_offset.data_ptr<scalar_t>(),
@@ -251,8 +253,8 @@ static void deformable_im2col(
            pad_w,
            stride_h,
            stride_w,
            dilation_h,
            dilation_w,
            parallel_imgs,
            n_in_channels,
            deformable_group,
@@ -268,7 +270,7 @@ static void deformable_im2col(
  }
}
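The net effect of this wrapper is to fill the columns buffer that the forward op further down multiplies against the weights: one row per (input channel, kernel tap) pair and one column per output location of the current batch block. Its layout, as allocated by the caller:

// columns layout assumed by deformable_im2col (allocation taken from the forward op below):
auto columns = at::zeros(
    {in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w},
    input.options());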
int get_greatest_divisor_below_bound(int n, int bound) {
  for (int k = bound; k > 1; --k) {
    if (n % k == 0) {
      return k;
@@ -277,217 +279,8 @@ static int get_greatest_divisor_below_bound(int n, int bound) {
  return 1;
}
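This divisor determines n_parallel_imgs, i.e. how many images are folded into one im2col/GEMM block; for instance a batch of 48 is processed as two blocks of 24, while a prime batch size such as 37 degrades to one image per block:

// Illustrative values only.
int n_parallel_imgs = get_greatest_divisor_below_bound(48, kMaxParallelImgs); // 24
int n_blocks = 48 / n_parallel_imgs;                                          // 2
int worst_case = get_greatest_divisor_below_bound(37, kMaxParallelImgs);      // 1 (37 is prime)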
at::Tensor DeformConv2d_forward_cuda(
const at::Tensor& input_param,
const at::Tensor& weight_param,
const at::Tensor& offset_param,
const at::Tensor& mask_param,
const at::Tensor& bias_param,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dil_h,
int64_t dil_w,
int64_t n_weight_grps,
int64_t n_offset_grps,
bool use_mask) {
at::Tensor input = input_param.contiguous();
at::Tensor offset = offset_param.contiguous();
at::Tensor weight = weight_param.contiguous();
at::Tensor mask = mask_param.contiguous();
at::Tensor bias = bias_param.contiguous();
TORCH_CHECK(input.ndimension() == 4);
TORCH_CHECK(offset.ndimension() == 4);
TORCH_CHECK(!use_mask || mask.ndimension() == 4);
TORCH_CHECK(weight.ndimension() == 4);
TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
at::DeviceGuard guard(input.device());
int batch_sz = input.size(0);
int in_channels = input.size(1);
int in_h = input.size(2);
int in_w = input.size(3);
int n_parallel_imgs =
get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs);
int out_channels = weight.size(0);
int weight_h = weight.size(2);
int weight_w = weight.size(3);
int ker_h = dil_h * (weight_h - 1) + 1;
int ker_w = dil_w * (weight_w - 1) + 1;
int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1;
int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1;
TORCH_CHECK(
weight_h > 0 && weight_w > 0,
"weight_h: ",
weight_h,
" weight_w: ",
weight_w);
TORCH_CHECK(
stride_h > 0 && stride_w > 0,
"stride_h: ",
stride_h,
" stride_w: ",
stride_w);
TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w);
TORCH_CHECK(dil_h > 0 && dil_w > 0, "dil_h: ", dil_h, " dil_w: ", dil_w);
TORCH_CHECK(weight.size(1) * n_weight_grps == input.size(1));
TORCH_CHECK(weight.size(0) % n_weight_grps == 0);
TORCH_CHECK(
(offset.size(1) == n_offset_grps * 2 * weight_h * weight_w),
"offset.shape[1] is not valid: got: ",
offset.size(1),
" expected: ",
n_offset_grps * 2 * weight_h * weight_w);
TORCH_CHECK(
(!use_mask || mask.size(1) == n_offset_grps * weight_h * weight_w),
"mask.shape[1] is not valid: got: ",
mask.size(1),
" expected: ",
n_offset_grps * weight_h * weight_w);
TORCH_CHECK(input.size(1) % n_offset_grps == 0);
TORCH_CHECK(
(offset.size(0) == input.size(0)), "invalid batch size of offset");
TORCH_CHECK(
(offset.size(2) == out_h && offset.size(3) == out_w),
"offset output dims: (",
offset.size(2),
", ",
offset.size(3),
") - ",
"computed output dims: (",
out_h,
", ",
out_w,
")");
TORCH_CHECK((mask.size(0) == input.size(0)), "invalid batch size of mask");
TORCH_CHECK(
(!use_mask || (mask.size(2) == out_h && mask.size(3) == out_w)),
"mask output dims: (",
mask.size(2),
", ",
mask.size(3),
") - ",
"computed output dims: (",
out_h,
", ",
out_w,
")");
TORCH_CHECK(
out_h > 0 && out_w > 0,
"Calculated output size too small - out_h: ",
out_h,
" out_w: ",
out_w);
auto out = at::zeros({batch_sz, out_channels, out_h, out_w}, input.options());
if (batch_sz == 0) {
return out;
}
// Separate batches into blocks
out = out.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
out_channels,
out_h,
out_w});
input = input.view(
{batch_sz / n_parallel_imgs, n_parallel_imgs, in_channels, in_h, in_w});
offset = offset.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
n_offset_grps * 2 * weight_h * weight_w,
out_h,
out_w});
if (use_mask) {
mask = mask.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
n_offset_grps * weight_h * weight_w,
out_h,
out_w});
}
at::Tensor out_buf = at::zeros(
{batch_sz / n_parallel_imgs,
out_channels,
n_parallel_imgs * out_h,
out_w},
out.options());
// Separate channels into convolution groups
out_buf = out_buf.view({out_buf.size(0),
n_weight_grps,
out_buf.size(1) / n_weight_grps,
out_buf.size(2),
out_buf.size(3)});
weight = weight.view({n_weight_grps,
weight.size(0) / n_weight_grps,
weight.size(1),
weight.size(2),
weight.size(3)});
// Sample points and perform convolution
auto columns = at::zeros(
{in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w},
input.options());
for (int b = 0; b < batch_sz / n_parallel_imgs; b++) {
deformable_im2col(
input[b],
offset[b],
mask[b],
in_channels,
in_h,
in_w,
weight_h,
weight_w,
pad_h,
pad_w,
stride_h,
stride_w,
dil_h,
dil_w,
out_h,
out_w,
n_parallel_imgs,
n_offset_grps,
use_mask,
columns);
columns = columns.view(
{n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
for (int g = 0; g < n_weight_grps; g++) {
out_buf[b][g] = out_buf[b][g]
.flatten(1)
.addmm_(weight[g].flatten(1), columns[g])
.view_as(out_buf[b][g]);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
}
out_buf = out_buf.view({batch_sz / n_parallel_imgs,
out_channels,
n_parallel_imgs,
out_h,
out_w});
out_buf.transpose_(1, 2);
out.copy_(out_buf);
out = out.view({batch_sz, out_channels, out_h, out_w});
return out + bias.view({1, out_channels, 1, 1});
}
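The out_h/out_w computation above is the standard dilated-convolution output-size formula; a worked instance with hypothetical sizes:

// Hypothetical sizes: 64x64 input, 3x3 kernel, pad 1, dilation 2, stride 1.
int ker_h = 2 * (3 - 1) + 1;           // effective kernel extent: 5
int out_h = (64 + 2 * 1 - 5) / 1 + 1;  // 62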
template <typename scalar_t>
__global__ void deformable_col2im_kernel(
    int n,
    const scalar_t* col,
    const scalar_t* offset_ptr,
@@ -560,7 +353,7 @@ __global__ void deformable_col2im_gpu_kernel(
  }
}

void compute_grad_input(
    const at::Tensor& columns,
    const at::Tensor& offset,
    const at::Tensor& mask,
@@ -590,10 +383,8 @@ static void compute_grad_input(
  const unsigned int blocks = GET_BLOCKS(threads, num_kernels);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      columns.scalar_type(), "compute_grad_input", ([&] {
        deformable_col2im_kernel<<<blocks, threads>>>(
            num_kernels,
            columns.data_ptr<scalar_t>(),
            offset.data_ptr<scalar_t>(),
@@ -657,7 +448,7 @@ __device__ scalar_t get_coordinate_weight(
}

template <typename scalar_t>
__global__ void deformable_col2im_coord_kernel(
    int n,
    const scalar_t* col_ptr,
    const scalar_t* im_ptr,
@@ -766,7 +557,7 @@ __global__ void deformable_col2im_coord_gpu_kernel(
  }
}

void compute_grad_offset_and_mask(
    const at::Tensor& columns,
    const at::Tensor& input,
    const at::Tensor& offset,
@@ -798,10 +589,8 @@ static void compute_grad_offset_and_mask(
  const unsigned int blocks = GET_BLOCKS(threads, num_kernels);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      columns.scalar_type(), "compute_grad_offset_and_mask", ([&] {
        deformable_col2im_coord_kernel<<<blocks, threads>>>(
            num_kernels,
            columns.data_ptr<scalar_t>(),
            input.data_ptr<scalar_t>(),
@@ -835,7 +624,7 @@ static void compute_grad_offset_and_mask(
  }
}
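Both col2im-style backward kernels scatter many column entries back onto the same input pixel or offset channel, so the accumulation has to be atomic (presumably the reason THC/THCAtomics.cuh is included above). A minimal sketch of that scatter-add pattern, not the actual kernel:

// Minimal scatter-add sketch (hypothetical); the real kernels additionally weight each
// contribution by the bilinear-interpolation coefficients before accumulating.
__global__ void scatter_add_kernel(
    int n,
    const float* grad_col,
    const int* target_idx,
    float* grad_im) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    atomicAdd(&grad_im[target_idx[i]], grad_col[i]);
  }
}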
std::tuple<at::Tensor, at::Tensor, at::Tensor> backward_gradient_inputs(
    at::Tensor input,
    at::Tensor weight,
    at::Tensor offset,
@@ -845,8 +634,8 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_inp
    int stride_w,
    int pad_h,
    int pad_w,
    int dilation_h,
    int dilation_w,
    int n_weight_grps,
    int n_offset_grps,
    int n_parallel_imgs,
@@ -864,8 +653,10 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_inp
  int weight_h = weight.size(2);
  int weight_w = weight.size(3);

  long out_w =
      (in_w + 2 * pad_w - (dilation_w * (weight_w - 1) + 1)) / stride_w + 1;
  long out_h =
      (in_h + 2 * pad_h - (dilation_h * (weight_h - 1) + 1)) / stride_h + 1;

  auto grad_input = at::zeros_like(input);
  auto grad_offset = at::zeros_like(offset);
@@ -948,8 +739,8 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_inp
        pad_w,
        stride_h,
        stride_w,
        dilation_h,
        dilation_w,
        n_parallel_imgs,
        n_offset_grps,
        use_mask,
@@ -969,8 +760,8 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_inp
        pad_w,
        stride_h,
        stride_w,
        dilation_h,
        dilation_w,
        n_parallel_imgs,
        n_offset_grps,
        use_mask,
@@ -989,7 +780,7 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_inp
  return std::make_tuple(grad_input, grad_offset, grad_mask);
}

at::Tensor backward_gradient_parameters(
    at::Tensor input,
    const at::Tensor& weight,
    at::Tensor offset,
@@ -999,8 +790,8 @@ static at::Tensor deform_conv2d_backward_parameters_cuda(
    int stride_w,
    int pad_h,
    int pad_w,
    int dilation_h,
    int dilation_w,
    int n_weight_grps,
    int n_offset_grps,
    int n_parallel_imgs,
@@ -1079,8 +870,8 @@ static at::Tensor deform_conv2d_backward_parameters_cuda(
        pad_w,
        stride_h,
        stride_w,
        dilation_h,
        dilation_w,
        out_h,
        out_w,
        n_parallel_imgs,
@@ -1105,46 +896,264 @@ static at::Tensor deform_conv2d_backward_parameters_cuda(
  return grad_weight;
}
} // namespace
at::Tensor deform_conv2d_forward_cuda(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t n_weight_grps,
int64_t n_offset_grps,
bool use_mask) {
at::Tensor input_c = input.contiguous();
at::Tensor offset_c = offset.contiguous();
at::Tensor weight_c = weight.contiguous();
at::Tensor mask_c = mask.contiguous();
at::Tensor bias_c = bias.contiguous();
TORCH_CHECK(input_c.ndimension() == 4);
TORCH_CHECK(offset_c.ndimension() == 4);
TORCH_CHECK(!use_mask || mask_c.ndimension() == 4);
TORCH_CHECK(weight_c.ndimension() == 4);
TORCH_CHECK(input_c.is_cuda(), "input must be a CUDA tensor");
at::DeviceGuard guard(input_c.device());
int batch_sz = input_c.size(0);
int in_channels = input_c.size(1);
int in_h = input_c.size(2);
int in_w = input_c.size(3);
int n_parallel_imgs =
get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs);
int out_channels = weight_c.size(0);
int weight_h = weight_c.size(2);
int weight_w = weight_c.size(3);
int ker_h = dilation_h * (weight_h - 1) + 1;
int ker_w = dilation_w * (weight_w - 1) + 1;
int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1;
int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1;
TORCH_CHECK(
weight_h > 0 && weight_w > 0,
"weight_h: ",
weight_h,
" weight_w: ",
weight_w);
TORCH_CHECK(
stride_h > 0 && stride_w > 0,
"stride_h: ",
stride_h,
" stride_w: ",
stride_w);
TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w);
TORCH_CHECK(
dilation_h > 0 && dilation_w > 0,
"dilation_h: ",
dilation_h,
" dilation_w: ",
dilation_w);
TORCH_CHECK(weight_c.size(1) * n_weight_grps == input_c.size(1));
TORCH_CHECK(weight_c.size(0) % n_weight_grps == 0);
TORCH_CHECK(
(offset_c.size(1) == n_offset_grps * 2 * weight_h * weight_w),
"offset.shape[1] is not valid: got: ",
offset_c.size(1),
" expected: ",
n_offset_grps * 2 * weight_h * weight_w);
TORCH_CHECK(
(!use_mask || mask_c.size(1) == n_offset_grps * weight_h * weight_w),
"mask.shape[1] is not valid: got: ",
mask_c.size(1),
" expected: ",
n_offset_grps * weight_h * weight_w);
TORCH_CHECK(input_c.size(1) % n_offset_grps == 0);
TORCH_CHECK(
(offset_c.size(0) == input_c.size(0)), "invalid batch size of offset");
TORCH_CHECK(
(offset_c.size(2) == out_h && offset_c.size(3) == out_w),
"offset output dims: (",
offset_c.size(2),
", ",
offset_c.size(3),
") - ",
"computed output dims: (",
out_h,
", ",
out_w,
")");
TORCH_CHECK(
(mask_c.size(0) == input_c.size(0)), "invalid batch size of mask");
TORCH_CHECK(
(!use_mask || (mask_c.size(2) == out_h && mask_c.size(3) == out_w)),
"mask output dims: (",
mask_c.size(2),
", ",
mask_c.size(3),
") - ",
"computed output dims: (",
out_h,
", ",
out_w,
")");
TORCH_CHECK(
out_h > 0 && out_w > 0,
"Calculated output size too small - out_h: ",
out_h,
" out_w: ",
out_w);
auto out =
at::zeros({batch_sz, out_channels, out_h, out_w}, input_c.options());
if (batch_sz == 0) {
return out;
}
// Separate batches into blocks
out = out.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
out_channels,
out_h,
out_w});
input_c = input_c.view(
{batch_sz / n_parallel_imgs, n_parallel_imgs, in_channels, in_h, in_w});
offset_c = offset_c.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
n_offset_grps * 2 * weight_h * weight_w,
out_h,
out_w});
if (use_mask) {
mask_c = mask_c.view({batch_sz / n_parallel_imgs,
n_parallel_imgs,
n_offset_grps * weight_h * weight_w,
out_h,
out_w});
}
at::Tensor out_buf = at::zeros(
{batch_sz / n_parallel_imgs,
out_channels,
n_parallel_imgs * out_h,
out_w},
out.options());
// Separate channels into convolution groups
out_buf = out_buf.view({out_buf.size(0),
n_weight_grps,
out_buf.size(1) / n_weight_grps,
out_buf.size(2),
out_buf.size(3)});
weight_c = weight_c.view({n_weight_grps,
weight_c.size(0) / n_weight_grps,
weight_c.size(1),
weight_c.size(2),
weight_c.size(3)});
// Sample points and perform convolution
auto columns = at::zeros(
{in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w},
input_c.options());
for (int b = 0; b < batch_sz / n_parallel_imgs; b++) {
deformable_im2col(
input_c[b],
offset_c[b],
mask_c[b],
in_channels,
in_h,
in_w,
weight_h,
weight_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
out_h,
out_w,
n_parallel_imgs,
n_offset_grps,
use_mask,
columns);
columns = columns.view(
{n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
for (int g = 0; g < n_weight_grps; g++) {
out_buf[b][g] = out_buf[b][g]
.flatten(1)
.addmm_(weight_c[g].flatten(1), columns[g])
.view_as(out_buf[b][g]);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
}
out_buf = out_buf.view({batch_sz / n_parallel_imgs,
out_channels,
n_parallel_imgs,
out_h,
out_w});
out_buf.transpose_(1, 2);
out.copy_(out_buf);
out = out.view({batch_sz, out_channels, out_h, out_w});
return out + bias_c.view({1, out_channels, 1, 1});
}
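The inner loop above performs the deformable convolution as a grouped GEMM over the im2col buffer; spelled out for a single batch block b and weight group g (shapes in the comments, G = n_weight_grps, P = n_parallel_imgs):

auto w2d = weight_c[g].flatten(1);    // [out_channels / G, in_channels / G * weight_h * weight_w]
auto acc = out_buf[b][g].flatten(1);  // [out_channels / G, P * out_h * out_w], a view into out_buf
acc.addmm_(w2d, columns[g]);          // acc += w2d @ columns[g], accumulated in place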
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
deform_conv2d_backward_cuda(
    const at::Tensor& grad_out,
    const at::Tensor& input,
    const at::Tensor& weight,
    const at::Tensor& offset,
    const at::Tensor& mask,
    const at::Tensor& bias,
    int64_t stride_h,
    int64_t stride_w,
    int64_t pad_h,
    int64_t pad_w,
    int64_t dilation_h,
    int64_t dilation_w,
    int64_t n_weight_grps,
    int64_t n_offset_grps,
    bool use_mask) {
  at::Tensor grad_out_c = grad_out.contiguous();
  at::Tensor input_c = input.contiguous();
  at::Tensor weight_c = weight.contiguous();
  at::Tensor offset_c = offset.contiguous();
  at::Tensor mask_c = mask.contiguous();
  at::Tensor bias_c = bias.contiguous();

  const int batch_sz = input_c.size(0);
  const int n_parallel_imgs =
      get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs);

  auto grad_input_and_offset_and_mask = backward_gradient_inputs(
      input_c,
      weight_c,
      offset_c,
      mask_c,
      grad_out_c,
      stride_h,
      stride_w,
      pad_h,
      pad_w,
      dilation_h,
      dilation_w,
      n_weight_grps,
      n_offset_grps,
      n_parallel_imgs,
@@ -1154,26 +1163,29 @@ DeformConv2d_backward_cuda(
  auto grad_offset = std::get<1>(grad_input_and_offset_and_mask);
  auto grad_mask = std::get<2>(grad_input_and_offset_and_mask);

  auto grad_weight = backward_gradient_parameters(
      input_c,
      weight_c,
      offset_c,
      mask_c,
      grad_out_c,
      stride_h,
      stride_w,
      pad_h,
      pad_w,
      dilation_h,
      dilation_w,
      n_weight_grps,
      n_offset_grps,
      n_parallel_imgs,
      use_mask);

  auto value = grad_out_c.sum({0, 2, 3});
  auto grad_bias = at::ones_like(bias_c) * value;

  return std::make_tuple(
      grad_input, grad_weight, grad_offset, grad_mask, grad_bias);
}
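Because the bias enters the forward pass only as a per-channel broadcast add, its gradient is the incoming gradient summed over the batch and spatial dimensions; the ones_like multiplication merely stamps that vector into the bias' shape and dtype. A rough equivalent, as a sketch:

// Sketch: per-channel reduction over N, H, W; equivalent in spirit to the two lines above.
auto grad_bias = grad_out_c.sum({0, 2, 3}).to(bias_c.options());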
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API at::Tensor deform_conv2d_forward_cuda(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t n_weight_grps,
int64_t n_offset_grps,
bool use_mask);
VISION_API std::
tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
deform_conv2d_backward_cuda(
const at::Tensor& grad_out,
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t n_weight_grps,
int64_t n_offset_grps,
bool use_mask);
} // namespace ops
} // namespace vision
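As a rough usage sketch of the forward entry point declared above (all sizes are illustrative and the operator-registration glue is omitted), it can be exercised directly from C++:

// Hypothetical smoke test for the CUDA forward op; shapes chosen to satisfy the checks
// in deform_conv2d_forward_cuda (offset channels = 2 * kh * kw, mask channels = kh * kw).
#include <ATen/ATen.h>

void smoke_test() {
  auto input  = at::randn({2, 4, 16, 16}, at::kCUDA);          // N, C, H, W
  auto weight = at::randn({8, 4, 3, 3}, at::kCUDA);            // out_ch, in_ch / groups, kh, kw
  auto offset = at::randn({2, 2 * 3 * 3, 16, 16}, at::kCUDA);  // learned sampling offsets
  auto mask   = at::ones({2, 3 * 3, 16, 16}, at::kCUDA);       // modulation mask
  auto bias   = at::zeros({8}, at::kCUDA);

  auto out = vision::ops::deform_conv2d_forward_cuda(
      input, weight, offset, mask, bias,
      /*stride_h=*/1, /*stride_w=*/1,
      /*pad_h=*/1, /*pad_w=*/1,
      /*dilation_h=*/1, /*dilation_w=*/1,
      /*n_weight_grps=*/1, /*n_offset_grps=*/1,
      /*use_mask=*/true);
  // out has shape {2, 8, 16, 16}: stride 1 and padding 1 with a 3x3 kernel preserve H and W.
}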