"vscode:/vscode.git/clone" did not exist on "5fa8ae041cef2b5f5587d4eb076dbaeb5bf992f6"
Unverified Commit 0ebbb0ab authored by Vasilis Vryniotis's avatar Vasilis Vryniotis Committed by GitHub
Browse files

Encapsulate and Standardise C++ Ops (#3097)

* Encapsulate and standardize deform_conv2d (#3074)

* Rename files.

* Standardizing method names.

* Adding anonymous namespaces.

* Applying C++ naming rules and aligning variable names across headers and cpp files.

* Syncing names across implementations.

* Rename deform_conv2d.h to deform_conv2d.cpp

* Use header files:
- Create header files for kernel implementation and remove definitions from vision_*.h files.
- Eliminate unnecessary headers and ensure all cpp include their headers.

* Change the naming convention for kernel implementations.

* Remove the _param postfix from the variables and standardizing names.

* Exposing public forward/backward methods to the C++ API and moving methods around to minimize git blame changes.

* Encapsulate and standardize nms (#3081)

* Syncing, where possible, the names of functions across devices.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create, for each cpp file, a separate header file with "public" functions.

* Removing unnecessary repeated includes.

* Update CMakeLists.txt to include all headers.

* Encapsulate and standardize ps_roi_align (#3082)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.
Syncing, where possible, the names of functions across devices.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create, for each cpp file, a separate header file with "public" functions.

* Removing unnecessary repeated includes.

* Encapsulate and standardize ps_roi_pool (#3084)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create, for each cpp file, a separate header file with "public" functions.

* Removing unnecessary repeated includes.

* Encapsulate and standardize roi_align (#3085)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create, for each cpp file, a separate header file with "public" functions.

* Removing unnecessary repeated includes.

* Encapsulate and standardize roi_pool  (#3088)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Adding all internal functions in anonymous namespaces.

* Syncing variable names between the cpp files and their header files.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create, for each cpp file, a separate header file with "public" functions.

* Removing unnecessary repeated includes.

* Encapsulate and standardize new_empty_tensor_op (#3089)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Create, for each cpp file, a separate header file with "public" functions.

* Adding all internal functions in anonymous namespaces.

* Convert to const ref all possible parameters.

* Removing unnecessary repeated includes.

* Encapsulate and standardize C++ Ops - Clean up (#3094)

* Removing unnecessary repeated includes.

* Remove unnecessary vision_cpu.h, vision_cuda.h, autocast.h.

* Fixing naming convention and correcting method names on macros.

* Turn on clang formatter for cu files and fixing broken styles.

* Replace "#ifndef ... #define ... #endif" with "#pragma once" on header files.

* Adding operator methods in vision::ops namespace. (#3096)

* Adding operator methods in vision::ops namespace.

* Replace general.h with macros.h

* Adding vision.h to the necessary cpp files.
parent 8520f0be
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include "cuda_helpers.h" #include "cuda_helpers.h"
#include "nms_kernel.h"
#include <iostream> namespace vision {
#include <vector> namespace ops {
namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8; int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename T> template <typename T>
__device__ inline bool devIoU(T const* const a, T const* const b, const float threshold) { __device__ inline bool devIoU(
T const* const a,
T const* const b,
const float threshold) {
T left = max(a[0], b[0]), right = min(a[2], b[2]); T left = max(a[0], b[0]), right = min(a[2], b[2]);
T top = max(a[1], b[1]), bottom = min(a[3], b[3]); T top = max(a[1], b[1]), bottom = min(a[3], b[3]);
T width = max(right - left, (T)0), height = max(bottom - top, (T)0); T width = max(right - left, (T)0), height = max(bottom - top, (T)0);
...@@ -21,7 +26,7 @@ __device__ inline bool devIoU(T const* const a, T const* const b, const float th ...@@ -21,7 +26,7 @@ __device__ inline bool devIoU(T const* const a, T const* const b, const float th
} }
template <typename T> template <typename T>
__global__ void nms_kernel( __global__ void nms_kernel_impl(
int n_boxes, int n_boxes,
double iou_threshold, double iou_threshold,
const T* dev_boxes, const T* dev_boxes,
...@@ -29,7 +34,8 @@ __global__ void nms_kernel( ...@@ -29,7 +34,8 @@ __global__ void nms_kernel(
const int row_start = blockIdx.y; const int row_start = blockIdx.y;
const int col_start = blockIdx.x; const int col_start = blockIdx.x;
if (row_start > col_start) return; if (row_start > col_start)
return;
const int row_size = const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
...@@ -68,7 +74,10 @@ __global__ void nms_kernel( ...@@ -68,7 +74,10 @@ __global__ void nms_kernel(
} }
} }
at::Tensor nms_cuda(const at::Tensor& dets, } // namespace
at::Tensor nms_cuda(
const at::Tensor& dets,
const at::Tensor& scores, const at::Tensor& scores,
double iou_threshold) { double iou_threshold) {
TORCH_CHECK(dets.is_cuda(), "dets must be a CUDA tensor"); TORCH_CHECK(dets.is_cuda(), "dets must be a CUDA tensor");
...@@ -118,8 +127,8 @@ at::Tensor nms_cuda(const at::Tensor& dets, ...@@ -118,8 +127,8 @@ at::Tensor nms_cuda(const at::Tensor& dets,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
dets_sorted.scalar_type(), "nms_kernel_cuda", [&] { dets_sorted.scalar_type(), "nms_cuda", [&] {
nms_kernel<scalar_t><<<blocks, threads, 0, stream>>>( nms_kernel_impl<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num, dets_num,
iou_threshold, iou_threshold,
dets_sorted.data_ptr<scalar_t>(), dets_sorted.data_ptr<scalar_t>(),
...@@ -127,7 +136,8 @@ at::Tensor nms_cuda(const at::Tensor& dets, ...@@ -127,7 +136,8 @@ at::Tensor nms_cuda(const at::Tensor& dets,
}); });
at::Tensor mask_cpu = mask.to(at::kCPU); at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr<int64_t>(); unsigned long long* mask_host =
(unsigned long long*)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv(col_blocks); std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
...@@ -155,3 +165,6 @@ at::Tensor nms_cuda(const at::Tensor& dets, ...@@ -155,3 +165,6 @@ at::Tensor nms_cuda(const at::Tensor& dets,
{keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep) {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
.to(order_t.device(), keep.scalar_type())}); .to(order_t.device(), keep.scalar_type())});
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
// CUDA implementation of non-maximum suppression (NMS).
//
// dets:          CUDA tensor of boxes; each box is addressed as four
//                coordinates (x1, y1, x2, y2) by the IoU kernel.
//                NOTE(review): presumably shaped [N, 4] following the
//                torchvision convention — confirm against the kernel file.
// scores:        per-box scores used to order the boxes before suppression.
// iou_threshold: boxes whose IoU with an already-kept box exceeds this
//                value are suppressed.
//
// Returns the indices of the kept boxes (moved back to the device and
// dtype expected by the caller).
VISION_API at::Tensor nms_cuda(
const at::Tensor& dets,
const at::Tensor& scores,
double iou_threshold);
} // namespace ops
} // namespace vision
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <THC/THCAtomics.cuh> #include <THC/THCAtomics.cuh>
#include <stdio.h>
#include "cuda_helpers.h" #include "cuda_helpers.h"
#include "ps_roi_align_kernel.h"
namespace vision {
namespace ops {
namespace {
template <typename T> template <typename T>
__device__ T bilinear_interpolate( __device__ T bilinear_interpolate(
...@@ -62,7 +65,7 @@ __device__ T bilinear_interpolate( ...@@ -62,7 +65,7 @@ __device__ T bilinear_interpolate(
} }
template <typename T> template <typename T>
__global__ void PSROIAlignForwardCUDA( __global__ void ps_roi_align_forward_kernel_impl(
int nthreads, int nthreads,
const T* input, const T* input,
const T spatial_scale, const T spatial_scale,
...@@ -195,7 +198,7 @@ __device__ void bilinear_interpolate_gradient( ...@@ -195,7 +198,7 @@ __device__ void bilinear_interpolate_gradient(
} }
template <typename T> template <typename T>
__global__ void PSROIAlignBackwardCUDA( __global__ void ps_roi_align_backward_kernel_impl(
int nthreads, int nthreads,
const T* grad_output, const T* grad_output,
const int* channel_mapping, const int* channel_mapping,
...@@ -292,7 +295,9 @@ __global__ void PSROIAlignBackwardCUDA( ...@@ -292,7 +295,9 @@ __global__ void PSROIAlignBackwardCUDA(
} }
} }
std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda( } // namespace
std::tuple<at::Tensor, at::Tensor> ps_roi_align_forward_cuda(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& rois, const at::Tensor& rois,
double spatial_scale, double spatial_scale,
...@@ -307,7 +312,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda( ...@@ -307,7 +312,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda(
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "PSROIAlign_forward_cuda"; at::CheckedFrom c = "ps_roi_align_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t}); at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t});
...@@ -337,15 +342,14 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda( ...@@ -337,15 +342,14 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min( dim3 grid(std::min(
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)), ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096))); static_cast<int64_t>(4096)));
dim3 block(512); dim3 block(512);
auto input_ = input.contiguous(), auto input_ = input.contiguous(), rois_ = rois.contiguous();
rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "PSROIAlign_forward", [&] { input.scalar_type(), "ps_roi_align_forward_cuda", [&] {
PSROIAlignForwardCUDA<scalar_t><<<grid, block, 0, stream>>>( ps_roi_align_forward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
output_size, output_size,
input_.data_ptr<scalar_t>(), input_.data_ptr<scalar_t>(),
spatial_scale, spatial_scale,
...@@ -365,7 +369,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda( ...@@ -365,7 +369,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda(
return std::make_tuple(output, channel_mapping); return std::make_tuple(output, channel_mapping);
} }
at::Tensor PSROIAlign_backward_cuda( at::Tensor ps_roi_align_backward_cuda(
const at::Tensor& grad, const at::Tensor& grad,
const at::Tensor& rois, const at::Tensor& rois,
const at::Tensor& channel_mapping, const at::Tensor& channel_mapping,
...@@ -381,13 +385,12 @@ at::Tensor PSROIAlign_backward_cuda( ...@@ -381,13 +385,12 @@ at::Tensor PSROIAlign_backward_cuda(
TORCH_CHECK(grad.is_cuda(), "grad must be a CUDA tensor"); TORCH_CHECK(grad.is_cuda(), "grad must be a CUDA tensor");
TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor"); TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor");
TORCH_CHECK( TORCH_CHECK(
channel_mapping.is_cuda(), channel_mapping.is_cuda(), "channel_mapping must be a CUDA tensor");
"channel_mapping must be a CUDA tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}, at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2},
channel_mapping_t{channel_mapping, "channel_mapping", 3}; channel_mapping_t{channel_mapping, "channel_mapping", 3};
at::CheckedFrom c = "PSROIAlign_backward_cuda"; at::CheckedFrom c = "ps_roi_align_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t, channel_mapping_t}); at::checkAllSameGPU(c, {grad_t, rois_t, channel_mapping_t});
at::checkAllSameType(c, {grad_t, rois_t}); at::checkAllSameType(c, {grad_t, rois_t});
...@@ -400,7 +403,7 @@ at::Tensor PSROIAlign_backward_cuda( ...@@ -400,7 +403,7 @@ at::Tensor PSROIAlign_backward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min( dim3 grid(std::min(
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)), ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096))); static_cast<int64_t>(4096)));
dim3 block(512); dim3 block(512);
...@@ -412,11 +415,10 @@ at::Tensor PSROIAlign_backward_cuda( ...@@ -412,11 +415,10 @@ at::Tensor PSROIAlign_backward_cuda(
int channels_out = channels / (pooled_height * pooled_width); int channels_out = channels / (pooled_height * pooled_width);
auto grad_ = grad.contiguous(), auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "PSROIAlign_backward", [&] { grad.scalar_type(), "ps_roi_align_backward_cuda", [&] {
PSROIAlignBackwardCUDA<scalar_t><<<grid, block, 0, stream>>>( ps_roi_align_backward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(), grad.numel(),
grad_.data_ptr<scalar_t>(), grad_.data_ptr<scalar_t>(),
channel_mapping.data_ptr<int>(), channel_mapping.data_ptr<int>(),
...@@ -435,3 +437,6 @@ at::Tensor PSROIAlign_backward_cuda( ...@@ -435,3 +437,6 @@ at::Tensor PSROIAlign_backward_cuda(
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
return grad_input; return grad_input;
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
// CUDA forward pass of position-sensitive RoI Align.
//
// input: CUDA feature tensor; rois: CUDA tensor of regions of interest.
// spatial_scale maps RoI coordinates to feature-map coordinates;
// pooled_height/pooled_width set the output bin grid; sampling_ratio is
// the number of bilinear sampling points per bin (NOTE(review): <= 0
// usually means adaptive — confirm in the kernel).
//
// Returns a pair (output, channel_mapping); channel_mapping records, per
// output element, which input channel it was pooled from and is consumed
// by the backward pass below.
VISION_API std::tuple<at::Tensor, at::Tensor> ps_roi_align_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio);
// CUDA backward pass of position-sensitive RoI Align.
//
// grad, rois and channel_mapping must all be CUDA tensors (checked in the
// implementation). channel_mapping is the second output of the forward
// pass. batch_size/channels/height/width describe the shape of the
// gradient tensor to materialize for the original input.
//
// Returns grad_input, the gradient with respect to the forward input.
VISION_API at::Tensor ps_roi_align_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& channel_mapping,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width);
} // namespace ops
} // namespace vision
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <THC/THCAtomics.cuh> #include <THC/THCAtomics.cuh>
#include "cuda_helpers.h" #include "cuda_helpers.h"
#include "ps_roi_pool_kernel.h"
namespace vision {
namespace ops {
namespace {
template <typename T> template <typename T>
__global__ void PSROIPoolForward( __global__ void ps_roi_pool_forward_kernel_impl(
int nthreads, int nthreads,
const T* input, const T* input,
const T spatial_scale, const T spatial_scale,
...@@ -73,7 +77,7 @@ __global__ void PSROIPoolForward( ...@@ -73,7 +77,7 @@ __global__ void PSROIPoolForward(
} }
template <typename T> template <typename T>
__global__ void PSROIPoolBackward( __global__ void ps_roi_pool_backward_kernel_impl(
int nthreads, int nthreads,
const T* grad_output, const T* grad_output,
const int* channel_mapping, const int* channel_mapping,
...@@ -132,7 +136,9 @@ __global__ void PSROIPoolBackward( ...@@ -132,7 +136,9 @@ __global__ void PSROIPoolBackward(
} }
} }
std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda( } // namespace
std::tuple<at::Tensor, at::Tensor> ps_roi_pool_forward_cuda(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& rois, const at::Tensor& rois,
double spatial_scale, double spatial_scale,
...@@ -146,7 +152,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda( ...@@ -146,7 +152,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda(
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "PSROIPool_forward_cuda"; at::CheckedFrom c = "ps_roi_pool_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t}); at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t});
...@@ -176,15 +182,14 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda( ...@@ -176,15 +182,14 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min( dim3 grid(std::min(
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)), ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096))); static_cast<int64_t>(4096)));
dim3 block(512); dim3 block(512);
auto input_ = input.contiguous(), auto input_ = input.contiguous(), rois_ = rois.contiguous();
rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "PSROIPool_forward", [&] { input.scalar_type(), "ps_roi_pool_forward_cuda", [&] {
PSROIPoolForward<scalar_t><<<grid, block, 0, stream>>>( ps_roi_pool_forward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
output_size, output_size,
input_.data_ptr<scalar_t>(), input_.data_ptr<scalar_t>(),
spatial_scale, spatial_scale,
...@@ -202,7 +207,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda( ...@@ -202,7 +207,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda(
return std::make_tuple(output, channel_mapping); return std::make_tuple(output, channel_mapping);
} }
at::Tensor PSROIPool_backward_cuda( at::Tensor ps_roi_pool_backward_cuda(
const at::Tensor& grad, const at::Tensor& grad,
const at::Tensor& rois, const at::Tensor& rois,
const at::Tensor& channel_mapping, const at::Tensor& channel_mapping,
...@@ -217,13 +222,12 @@ at::Tensor PSROIPool_backward_cuda( ...@@ -217,13 +222,12 @@ at::Tensor PSROIPool_backward_cuda(
TORCH_CHECK(grad.is_cuda(), "grad must be a CUDA tensor"); TORCH_CHECK(grad.is_cuda(), "grad must be a CUDA tensor");
TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor"); TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor");
TORCH_CHECK( TORCH_CHECK(
channel_mapping.is_cuda(), channel_mapping.is_cuda(), "channel_mapping must be a CUDA tensor");
"channel_mapping must be a CUDA tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}, at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2},
channel_mapping_t{channel_mapping, "channel_mapping", 3}; channel_mapping_t{channel_mapping, "channel_mapping", 3};
at::CheckedFrom c = "PSROIPool_backward_cuda"; at::CheckedFrom c = "ps_roi_pool_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t, channel_mapping_t}); at::checkAllSameGPU(c, {grad_t, rois_t, channel_mapping_t});
at::checkAllSameType(c, {grad_t, rois_t}); at::checkAllSameType(c, {grad_t, rois_t});
...@@ -236,7 +240,7 @@ at::Tensor PSROIPool_backward_cuda( ...@@ -236,7 +240,7 @@ at::Tensor PSROIPool_backward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min( dim3 grid(std::min(
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)), ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096))); static_cast<int64_t>(4096)));
dim3 block(512); dim3 block(512);
...@@ -248,11 +252,10 @@ at::Tensor PSROIPool_backward_cuda( ...@@ -248,11 +252,10 @@ at::Tensor PSROIPool_backward_cuda(
int channels_out = channels / (pooled_height * pooled_width); int channels_out = channels / (pooled_height * pooled_width);
auto grad_ = grad.contiguous(), auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "PSROIPool_backward", [&] { grad.scalar_type(), "ps_roi_pool_backward_cuda", [&] {
PSROIPoolBackward<scalar_t><<<grid, block, 0, stream>>>( ps_roi_pool_backward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(), grad.numel(),
grad_.data_ptr<scalar_t>(), grad_.data_ptr<scalar_t>(),
channel_mapping.data_ptr<int>(), channel_mapping.data_ptr<int>(),
...@@ -270,3 +273,6 @@ at::Tensor PSROIPool_backward_cuda( ...@@ -270,3 +273,6 @@ at::Tensor PSROIPool_backward_cuda(
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
return grad_input; return grad_input;
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
// CUDA forward pass of position-sensitive RoI Pooling.
//
// input: CUDA feature tensor; rois: CUDA tensor of regions of interest.
// spatial_scale maps RoI coordinates onto the feature map;
// pooled_height/pooled_width set the output bin grid.
//
// Returns a pair (output, channel_mapping); channel_mapping records which
// input channel each output element was pooled from and is required by
// the backward pass below.
VISION_API std::tuple<at::Tensor, at::Tensor> ps_roi_pool_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width);
// CUDA backward pass of position-sensitive RoI Pooling.
//
// grad, rois and channel_mapping must all be CUDA tensors (checked in the
// implementation). channel_mapping is the second output of the forward
// pass. batch_size/channels/height/width give the shape of the gradient
// tensor to materialize for the original input.
//
// Returns grad_input, the gradient with respect to the forward input.
VISION_API at::Tensor ps_roi_pool_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& channel_mapping,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width);
} // namespace ops
} // namespace vision
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <THC/THCAtomics.cuh> #include <THC/THCAtomics.cuh>
#include "cuda_helpers.h" #include "cuda_helpers.h"
#include "roi_align_kernel.h"
namespace vision {
namespace ops {
namespace {
template <typename T> template <typename T>
__device__ T bilinear_interpolate( __device__ T bilinear_interpolate(
...@@ -61,7 +65,7 @@ __device__ T bilinear_interpolate( ...@@ -61,7 +65,7 @@ __device__ T bilinear_interpolate(
} }
template <typename T> template <typename T>
__global__ void RoIAlignForward( __global__ void roi_align_forward_kernel_impl(
int nthreads, int nthreads,
const T* input, const T* input,
const T spatial_scale, const T spatial_scale,
...@@ -197,7 +201,7 @@ __device__ void bilinear_interpolate_gradient( ...@@ -197,7 +201,7 @@ __device__ void bilinear_interpolate_gradient(
} }
template <typename T> template <typename T>
__global__ void RoIAlignBackward( __global__ void roi_align_backward_kernel_impl(
int nthreads, int nthreads,
const T* grad_output, const T* grad_output,
const T spatial_scale, const T spatial_scale,
...@@ -308,9 +312,11 @@ __global__ void RoIAlignBackward( ...@@ -308,9 +312,11 @@ __global__ void RoIAlignBackward(
} // ix } // ix
} // iy } // iy
} // CUDA_1D_KERNEL_LOOP } // CUDA_1D_KERNEL_LOOP
} // RoIAlignBackward }
at::Tensor ROIAlign_forward_cuda( } // namespace
at::Tensor roi_align_forward_cuda(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& rois, const at::Tensor& rois,
double spatial_scale, double spatial_scale,
...@@ -320,12 +326,11 @@ at::Tensor ROIAlign_forward_cuda( ...@@ -320,12 +326,11 @@ at::Tensor ROIAlign_forward_cuda(
bool aligned) { bool aligned) {
TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor"); TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor"); TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor");
TORCH_CHECK( TORCH_CHECK(rois.size(1) == 5, "rois must have shape as Tensor[K, 5]");
rois.size(1) == 5, "rois must have shape as Tensor[K, 5]");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_forward_cuda"; at::CheckedFrom c = "roi_align_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t}); at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t});
...@@ -343,7 +348,7 @@ at::Tensor ROIAlign_forward_cuda( ...@@ -343,7 +348,7 @@ at::Tensor ROIAlign_forward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min( dim3 grid(std::min(
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)), ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096))); static_cast<int64_t>(4096)));
dim3 block(512); dim3 block(512);
...@@ -352,28 +357,28 @@ at::Tensor ROIAlign_forward_cuda( ...@@ -352,28 +357,28 @@ at::Tensor ROIAlign_forward_cuda(
return output; return output;
} }
auto input_ = input.contiguous(), auto input_ = input.contiguous(), rois_ = rois.contiguous();
rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF(
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "ROIAlign_forward", [&] { input.scalar_type(), "roi_align_forward_cuda", [&] {
RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>( roi_align_forward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
output_size, output_size,
input_.data_ptr<scalar_t>(), input_.data_ptr<scalar_t>(),
spatial_scale, spatial_scale,
channels, channels,
height, height,
width, width,
pooled_height, pooled_height,
pooled_width, pooled_width,
sampling_ratio, sampling_ratio,
aligned, aligned,
rois_.data_ptr<scalar_t>(), rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>()); output.data_ptr<scalar_t>());
}); });
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
return output; return output;
} }
at::Tensor ROIAlign_backward_cuda( at::Tensor roi_align_backward_cuda(
const at::Tensor& grad, const at::Tensor& grad,
const at::Tensor& rois, const at::Tensor& rois,
double spatial_scale, double spatial_scale,
...@@ -390,7 +395,7 @@ at::Tensor ROIAlign_backward_cuda( ...@@ -390,7 +395,7 @@ at::Tensor ROIAlign_backward_cuda(
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_backward_cuda"; at::CheckedFrom c = "roi_align_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t}); at::checkAllSameGPU(c, {grad_t, rois_t});
at::checkAllSameType(c, {grad_t, rois_t}); at::checkAllSameType(c, {grad_t, rois_t});
...@@ -402,7 +407,7 @@ at::Tensor ROIAlign_backward_cuda( ...@@ -402,7 +407,7 @@ at::Tensor ROIAlign_backward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min( dim3 grid(std::min(
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)), ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096))); static_cast<int64_t>(4096)));
dim3 block(512); dim3 block(512);
...@@ -418,25 +423,29 @@ at::Tensor ROIAlign_backward_cuda( ...@@ -418,25 +423,29 @@ at::Tensor ROIAlign_backward_cuda(
int w_stride = grad.stride(3); int w_stride = grad.stride(3);
auto rois_ = rois.contiguous(); auto rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "ROIAlign_backward", [&] { AT_DISPATCH_FLOATING_TYPES_AND_HALF(
RoIAlignBackward<scalar_t><<<grid, block, 0, stream>>>( grad.scalar_type(), "roi_align_backward_cuda", [&] {
grad.numel(), roi_align_backward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
grad.data_ptr<scalar_t>(), grad.numel(),
spatial_scale, grad.data_ptr<scalar_t>(),
channels, spatial_scale,
height, channels,
width, height,
pooled_height, width,
pooled_width, pooled_height,
sampling_ratio, pooled_width,
aligned, sampling_ratio,
grad_input.data_ptr<scalar_t>(), aligned,
rois_.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
n_stride, rois_.data_ptr<scalar_t>(),
c_stride, n_stride,
h_stride, c_stride,
w_stride); h_stride,
}); w_stride);
});
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
return grad_input; return grad_input;
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
// CUDA forward pass of RoI Align.
//
// input: CUDA feature tensor; rois: CUDA tensor of regions of interest,
// required to have shape [K, 5] (checked in the implementation — first
// column is the batch index by torchvision convention; confirm).
// spatial_scale maps RoI coordinates onto the feature map;
// pooled_height/pooled_width set the output grid; sampling_ratio is the
// number of bilinear sampling points per bin; aligned selects the
// pixel-center alignment ("aligned=True") variant.
//
// Returns the pooled output tensor.
VISION_API at::Tensor roi_align_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio,
bool aligned);
// CUDA backward pass of RoI Align.
//
// grad is the gradient of the loss w.r.t. the forward output; rois must
// match the forward call. batch_size/channels/height/width give the shape
// of the gradient tensor to materialize for the original input;
// sampling_ratio and aligned must match the forward pass.
//
// Returns grad_input, the gradient with respect to the forward input.
VISION_API at::Tensor roi_align_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t sampling_ratio,
bool aligned);
} // namespace ops
} // namespace vision
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <float.h>
#include <THC/THCAtomics.cuh> #include <THC/THCAtomics.cuh>
#include "cuda_helpers.h" #include "cuda_helpers.h"
#include "roi_pool_kernel.h"
namespace vision {
namespace ops {
namespace {
template <typename T> template <typename T>
__global__ void RoIPoolForward( __global__ void roi_pool_forward_kernel_impl(
int nthreads, int nthreads,
const T* input, const T* input,
const T spatial_scale, const T spatial_scale,
...@@ -72,7 +77,7 @@ __global__ void RoIPoolForward( ...@@ -72,7 +77,7 @@ __global__ void RoIPoolForward(
} }
template <typename T> template <typename T>
__global__ void RoIPoolBackward( __global__ void roi_pool_backward_kernel_impl(
int nthreads, int nthreads,
const T* grad_output, const T* grad_output,
const int* argmax_data, const int* argmax_data,
...@@ -115,7 +120,9 @@ __global__ void RoIPoolBackward( ...@@ -115,7 +120,9 @@ __global__ void RoIPoolBackward(
} }
} }
std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda( } // namespace
std::tuple<at::Tensor, at::Tensor> roi_pool_forward_cuda(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& rois, const at::Tensor& rois,
double spatial_scale, double spatial_scale,
...@@ -128,7 +135,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda( ...@@ -128,7 +135,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIPool_forward_cuda"; at::CheckedFrom c = "roi_pool_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t}); at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t});
...@@ -149,7 +156,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda( ...@@ -149,7 +156,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min( dim3 grid(std::min(
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)), ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096))); static_cast<int64_t>(4096)));
dim3 block(512); dim3 block(512);
...@@ -158,27 +165,27 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda( ...@@ -158,27 +165,27 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
return std::make_tuple(output, argmax); return std::make_tuple(output, argmax);
} }
auto input_ = input.contiguous(), auto input_ = input.contiguous(), rois_ = rois.contiguous();
rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF(
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "ROIPool_forward", [&] { input.scalar_type(), "roi_pool_forward_cuda", [&] {
RoIPoolForward<scalar_t><<<grid, block, 0, stream>>>( roi_pool_forward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
output_size, output_size,
input_.data_ptr<scalar_t>(), input_.data_ptr<scalar_t>(),
spatial_scale, spatial_scale,
channels, channels,
height, height,
width, width,
pooled_height, pooled_height,
pooled_width, pooled_width,
rois_.data_ptr<scalar_t>(), rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
argmax.data_ptr<int>()); argmax.data_ptr<int>());
}); });
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(output, argmax); return std::make_tuple(output, argmax);
} }
at::Tensor ROIPool_backward_cuda( at::Tensor roi_pool_backward_cuda(
const at::Tensor& grad, const at::Tensor& grad,
const at::Tensor& rois, const at::Tensor& rois,
const at::Tensor& argmax, const at::Tensor& argmax,
...@@ -197,7 +204,7 @@ at::Tensor ROIPool_backward_cuda( ...@@ -197,7 +204,7 @@ at::Tensor ROIPool_backward_cuda(
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}, at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2},
argmax_t{argmax, "argmax", 3}; argmax_t{argmax, "argmax", 3};
at::CheckedFrom c = "ROIPool_backward_cuda"; at::CheckedFrom c = "roi_pool_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t, argmax_t}); at::checkAllSameGPU(c, {grad_t, rois_t, argmax_t});
at::checkAllSameType(c, {grad_t, rois_t}); at::checkAllSameType(c, {grad_t, rois_t});
...@@ -211,7 +218,7 @@ at::Tensor ROIPool_backward_cuda( ...@@ -211,7 +218,7 @@ at::Tensor ROIPool_backward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min( dim3 grid(std::min(
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)), ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096))); static_cast<int64_t>(4096)));
dim3 block(512); dim3 block(512);
...@@ -226,27 +233,30 @@ at::Tensor ROIPool_backward_cuda( ...@@ -226,27 +233,30 @@ at::Tensor ROIPool_backward_cuda(
int h_stride = grad.stride(2); int h_stride = grad.stride(2);
int w_stride = grad.stride(3); int w_stride = grad.stride(3);
auto argmax_ = argmax.contiguous(), auto argmax_ = argmax.contiguous(), rois_ = rois.contiguous();
rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF(
AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "ROIPool_backward", [&] { grad.scalar_type(), "roi_pool_backward_cuda", [&] {
RoIPoolBackward<scalar_t><<<grid, block, 0, stream>>>( roi_pool_backward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(), grad.numel(),
grad.data_ptr<scalar_t>(), grad.data_ptr<scalar_t>(),
argmax_.data_ptr<int>(), argmax_.data_ptr<int>(),
num_rois, num_rois,
spatial_scale, spatial_scale,
channels, channels,
height, height,
width, width,
pooled_height, pooled_height,
pooled_width, pooled_width,
grad_input.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>(), rois_.data_ptr<scalar_t>(),
n_stride, n_stride,
c_stride, c_stride,
h_stride, h_stride,
w_stride); w_stride);
}); });
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
return grad_input; return grad_input;
} }
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {

// CUDA forward pass for RoIPool.
// input: feature map tensor; rois: region-of-interest boxes
//   (presumably torchvision's (K, 5) [batch_idx, x1, y1, x2, y2] layout —
//   TODO confirm against the kernel implementation).
// spatial_scale maps roi coordinates from input-image space to feature-map space.
// Returns (output, argmax): pooled features plus the flat index of the max
// element per output cell, which the backward pass consumes.
VISION_API std::tuple<at::Tensor, at::Tensor> roi_pool_forward_cuda(
    const at::Tensor& input,
    const at::Tensor& rois,
    double spatial_scale,
    int64_t pooled_height,
    int64_t pooled_width);

// CUDA backward pass for RoIPool.
// grad: gradient w.r.t. the forward output; argmax: max-index tensor produced
// by roi_pool_forward_cuda. batch_size/channels/height/width give the shape of
// the grad_input tensor to materialize. Returns grad w.r.t. the input.
VISION_API at::Tensor roi_pool_backward_cuda(
    const at::Tensor& grad,
    const at::Tensor& rois,
    const at::Tensor& argmax,
    double spatial_scale,
    int64_t pooled_height,
    int64_t pooled_width,
    int64_t batch_size,
    int64_t channels,
    int64_t height,
    int64_t width);

} // namespace ops
} // namespace vision
#pragma once
#include <torch/extension.h>
#include "../macros.h"
// CUDA forward pass of modulated deformable convolution.
// offset supplies per-position sampling offsets; mask supplies modulation
// scalars (used only when use_mask is true — presumably, per the DCNv2
// formulation; confirm against the kernel).
VISION_API at::Tensor DeformConv2d_forward_cuda(
    const at::Tensor& input,
    const at::Tensor& weight,
    const at::Tensor& offset,
    const at::Tensor& mask,
    const at::Tensor& bias,
    int64_t stride_h,
    int64_t stride_w,
    int64_t pad_h,
    int64_t pad_w,
    int64_t dilation_h,
    int64_t dilation_w,
    int64_t groups,
    int64_t deformable_groups,
    bool use_mask);

// CUDA backward pass of deformable convolution. Returns gradients w.r.t.
// (input, weight, offset, mask, bias), in that order.
VISION_API std::
    tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
    DeformConv2d_backward_cuda(
        const at::Tensor& grad_out,
        const at::Tensor& input,
        const at::Tensor& weight,
        const at::Tensor& offset,
        const at::Tensor& mask,
        const at::Tensor& bias,
        int64_t stride_h,
        int64_t stride_w,
        int64_t pad_h,
        int64_t pad_w,
        int64_t dilation_h,
        int64_t dilation_w,
        int64_t groups,
        int64_t deformable_groups,
        bool use_mask);

// CUDA non-maximum suppression: keeps boxes from dets (scored by scores)
// whose pairwise IoU does not exceed iou_threshold. Returns indices of the
// kept boxes.
VISION_API at::Tensor nms_cuda(
    const at::Tensor& dets,
    const at::Tensor& scores,
    double iou_threshold);

// CUDA forward pass of position-sensitive RoIAlign.
// Returns (output, channel_mapping); channel_mapping is consumed by the
// backward pass below.
VISION_API std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda(
    const at::Tensor& input,
    const at::Tensor& rois,
    double spatial_scale,
    int64_t pooled_height,
    int64_t pooled_width,
    int64_t sampling_ratio);

// CUDA backward pass of position-sensitive RoIAlign.
VISION_API at::Tensor PSROIAlign_backward_cuda(
    const at::Tensor& grad,
    const at::Tensor& rois,
    const at::Tensor& channel_mapping,
    double spatial_scale,
    int64_t pooled_height,
    int64_t pooled_width,
    int64_t sampling_ratio,
    int64_t batch_size,
    int64_t channels,
    int64_t height,
    int64_t width);

// CUDA forward pass of position-sensitive RoIPool.
// Returns (output, channel_mapping).
VISION_API std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda(
    const at::Tensor& input,
    const at::Tensor& rois,
    double spatial_scale,
    int64_t pooled_height,
    int64_t pooled_width);

// CUDA backward pass of position-sensitive RoIPool.
VISION_API at::Tensor PSROIPool_backward_cuda(
    const at::Tensor& grad,
    const at::Tensor& rois,
    const at::Tensor& channel_mapping,
    double spatial_scale,
    int64_t pooled_height,
    int64_t pooled_width,
    int64_t batch_size,
    int64_t channels,
    int64_t height,
    int64_t width);

// CUDA forward pass of RoIAlign. sampling_ratio controls the number of
// interpolation samples per bin; aligned toggles the half-pixel coordinate
// correction.
VISION_API at::Tensor ROIAlign_forward_cuda(
    const at::Tensor& input,
    const at::Tensor& rois,
    double spatial_scale,
    int64_t pooled_height,
    int64_t pooled_width,
    int64_t sampling_ratio,
    bool aligned);

// CUDA backward pass of RoIAlign.
VISION_API at::Tensor ROIAlign_backward_cuda(
    const at::Tensor& grad,
    const at::Tensor& rois,
    double spatial_scale,
    int64_t pooled_height,
    int64_t pooled_width,
    int64_t batch_size,
    int64_t channels,
    int64_t height,
    int64_t width,
    int64_t sampling_ratio,
    bool aligned);

// CUDA forward pass of RoIPool; returns (output, argmax) where argmax feeds
// the backward pass.
VISION_API std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
    const at::Tensor& input,
    const at::Tensor& rois,
    const double spatial_scale,
    const int64_t pooled_height,
    const int64_t pooled_width);

// CUDA backward pass of RoIPool.
VISION_API at::Tensor ROIPool_backward_cuda(
    const at::Tensor& grad,
    const at::Tensor& rois,
    const at::Tensor& argmax,
    const double spatial_scale,
    const int64_t pooled_height,
    const int64_t pooled_width,
    const int64_t batch_size,
    const int64_t channels,
    const int64_t height,
    const int64_t width);
#pragma once #include "deform_conv2d.h"
#include <torch/extension.h>
#include "cpu/vision_cpu.h" #if defined(WITH_CUDA) || defined(WITH_HIP)
#include <ATen/autocast_mode.h>
#ifdef WITH_CUDA
#include "autocast.h"
#include "cuda/vision_cuda.h"
#endif
#ifdef WITH_HIP
#include "autocast.h"
#include "hip/vision_cuda.h"
#endif #endif
// TODO: put this stuff in torchvision namespace namespace vision {
namespace ops {
at::Tensor deform_conv2d( at::Tensor deform_conv2d(
const at::Tensor& input, const at::Tensor& input,
...@@ -49,7 +44,7 @@ at::Tensor deform_conv2d( ...@@ -49,7 +44,7 @@ at::Tensor deform_conv2d(
} }
#if defined(WITH_CUDA) || defined(WITH_HIP) #if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor DeformConv2d_autocast( at::Tensor deform_conv2d_autocast(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& weight, const at::Tensor& weight,
const at::Tensor& offset, const at::Tensor& offset,
...@@ -123,6 +118,8 @@ _deform_conv2d_backward( ...@@ -123,6 +118,8 @@ _deform_conv2d_backward(
use_mask); use_mask);
} }
namespace {
class DeformConv2dFunction class DeformConv2dFunction
: public torch::autograd::Function<DeformConv2dFunction> { : public torch::autograd::Function<DeformConv2dFunction> {
public: public:
...@@ -297,7 +294,9 @@ class DeformConv2dBackwardFunction ...@@ -297,7 +294,9 @@ class DeformConv2dBackwardFunction
} }
}; };
at::Tensor DeformConv2d_autograd( } // namespace
at::Tensor deform_conv2d_autograd(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& weight, const at::Tensor& weight,
const at::Tensor& offset, const at::Tensor& offset,
...@@ -330,7 +329,7 @@ at::Tensor DeformConv2d_autograd( ...@@ -330,7 +329,7 @@ at::Tensor DeformConv2d_autograd(
} }
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
DeformConv2d_backward_autograd( deform_conv2d_backward_autograd(
const at::Tensor& grad, const at::Tensor& grad,
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& weight, const at::Tensor& weight,
...@@ -365,3 +364,6 @@ DeformConv2d_backward_autograd( ...@@ -365,3 +364,6 @@ DeformConv2d_backward_autograd(
return std::make_tuple(result[0], result[1], result[2], result[3], result[4]); return std::make_tuple(result[0], result[1], result[2], result[3], result[4]);
} }
} // namespace ops
} // namespace vision
#pragma once
#include "cpu/deform_conv2d_kernel.h"
#ifdef WITH_CUDA
#include "cuda/deform_conv2d_kernel.h"
#endif
#ifdef WITH_HIP
#include "hip/deform_conv2d_kernel.h"
#endif
namespace vision {
namespace ops {

// C++ Forward
// Device-dispatching entry point for deformable convolution. offset_groups
// splits channels into groups that share sampling offsets; mask modulates
// sampled values only when use_mask is true (presumably DCNv2 semantics —
// confirm against the kernel implementations included above).
at::Tensor deform_conv2d(
    const at::Tensor& input,
    const at::Tensor& weight,
    const at::Tensor& offset,
    const at::Tensor& mask,
    const at::Tensor& bias,
    int64_t stride_h,
    int64_t stride_w,
    int64_t pad_h,
    int64_t pad_w,
    int64_t dilation_h,
    int64_t dilation_w,
    int64_t groups,
    int64_t offset_groups,
    bool use_mask);

// Autocast Forward
// Wrapper registered for the Autocast dispatch key; only meaningful when a
// GPU backend (CUDA or HIP) is compiled in.
#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor deform_conv2d_autocast(
    const at::Tensor& input,
    const at::Tensor& weight,
    const at::Tensor& offset,
    const at::Tensor& mask,
    const at::Tensor& bias,
    int64_t stride_h,
    int64_t stride_w,
    int64_t pad_h,
    int64_t pad_w,
    int64_t dilation_h,
    int64_t dilation_w,
    int64_t groups,
    int64_t offset_groups,
    bool use_mask);
#endif

// C++ Backward
// Returns gradients w.r.t. (input, weight, offset, mask, bias), in that
// order. Leading underscore marks it as internal to the ops layer.
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
_deform_conv2d_backward(
    const at::Tensor& grad,
    const at::Tensor& input,
    const at::Tensor& weight,
    const at::Tensor& offset,
    const at::Tensor& mask,
    const at::Tensor& bias,
    int64_t stride_h,
    int64_t stride_w,
    int64_t pad_h,
    int64_t pad_w,
    int64_t dilation_h,
    int64_t dilation_w,
    int64_t groups,
    int64_t offset_groups,
    bool use_mask);

// Autograd Forward and Backward
// Autograd-aware entry points registered for the Autograd dispatch key;
// same signatures as the plain C++ versions above.
at::Tensor deform_conv2d_autograd(
    const at::Tensor& input,
    const at::Tensor& weight,
    const at::Tensor& offset,
    const at::Tensor& mask,
    const at::Tensor& bias,
    int64_t stride_h,
    int64_t stride_w,
    int64_t pad_h,
    int64_t pad_w,
    int64_t dilation_h,
    int64_t dilation_w,
    int64_t groups,
    int64_t offset_groups,
    bool use_mask);

std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
deform_conv2d_backward_autograd(
    const at::Tensor& grad,
    const at::Tensor& input,
    const at::Tensor& weight,
    const at::Tensor& offset,
    const at::Tensor& mask,
    const at::Tensor& bias,
    int64_t stride_h,
    int64_t stride_w,
    int64_t pad_h,
    int64_t pad_w,
    int64_t dilation_h,
    int64_t dilation_w,
    int64_t groups,
    int64_t offset_groups,
    bool use_mask);

} // namespace ops
} // namespace vision
#ifndef TORCHVISION_MACROS_H #pragma once
#define TORCHVISION_MACROS_H
#ifdef _WIN32 #ifdef _WIN32
#if defined(torchvision_EXPORTS) #if defined(torchvision_EXPORTS)
...@@ -20,5 +19,3 @@ ...@@ -20,5 +19,3 @@
#define VISION_INLINE_VARIABLE __attribute__((weak)) #define VISION_INLINE_VARIABLE __attribute__((weak))
#endif #endif
#endif #endif
#endif // TORCHVISION_MACROS_H
#ifndef ALEXNET_H #pragma once
#define ALEXNET_H
#include <torch/torch.h> #include <torch/torch.h>
#include "general.h" #include "../macros.h"
namespace vision { namespace vision {
namespace models { namespace models {
...@@ -20,5 +19,3 @@ TORCH_MODULE(AlexNet); ...@@ -20,5 +19,3 @@ TORCH_MODULE(AlexNet);
} // namespace models } // namespace models
} // namespace vision } // namespace vision
#endif // ALEXNET_H
#ifndef DENSENET_H #pragma once
#define DENSENET_H
#include <torch/torch.h> #include <torch/torch.h>
#include "general.h" #include "../macros.h"
namespace vision { namespace vision {
namespace models { namespace models {
...@@ -82,5 +81,3 @@ TORCH_MODULE(DenseNet161); ...@@ -82,5 +81,3 @@ TORCH_MODULE(DenseNet161);
} // namespace models } // namespace models
} // namespace vision } // namespace vision
#endif // DENSENET_H
#ifndef VISION_GENERAL_H
#define VISION_GENERAL_H

// VISION_API annotates symbols exported from the torchvision shared library.
#ifdef _WIN32
// On Windows, export when building the library itself (torchvision_EXPORTS
// is defined by the build system), import when consuming it.
#if defined(torchvision_EXPORTS)
#define VISION_API __declspec(dllexport)
#else
#define VISION_API __declspec(dllimport)
#endif
#else
// Non-Windows toolchains need no import/export annotation.
#define VISION_API
#endif

#endif // VISION_GENERAL_H
\ No newline at end of file
#ifndef GOOGLENET_H #pragma once
#define GOOGLENET_H
#include <torch/torch.h> #include <torch/torch.h>
#include "general.h" #include "../macros.h"
namespace vision { namespace vision {
namespace models { namespace models {
...@@ -86,5 +85,3 @@ TORCH_MODULE(GoogLeNet); ...@@ -86,5 +85,3 @@ TORCH_MODULE(GoogLeNet);
} // namespace models } // namespace models
} // namespace vision } // namespace vision
#endif // GOOGLENET_H
#ifndef INCEPTION_H #pragma once
#define INCEPTION_H
#include <torch/torch.h> #include <torch/torch.h>
#include "general.h" #include "../macros.h"
namespace vision { namespace vision {
namespace models { namespace models {
...@@ -124,5 +123,3 @@ TORCH_MODULE(InceptionV3); ...@@ -124,5 +123,3 @@ TORCH_MODULE(InceptionV3);
} // namespace models } // namespace models
} // namespace vision } // namespace vision
#endif // INCEPTION_H
#ifndef MNASNET_H #pragma once
#define MNASNET_H
#include <torch/torch.h> #include <torch/torch.h>
#include "general.h" #include "../macros.h"
namespace vision { namespace vision {
namespace models { namespace models {
...@@ -43,5 +42,3 @@ TORCH_MODULE(MNASNet1_3); ...@@ -43,5 +42,3 @@ TORCH_MODULE(MNASNet1_3);
} // namespace models } // namespace models
} // namespace vision } // namespace vision
#endif // MNASNET_H
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment