Unverified Commit 0ebbb0ab authored by Vasilis Vryniotis, committed by GitHub

Encapsulate and Standardise C++ Ops (#3097)

* Encapsulate and standardize deform_conv2d (#3074)

* Rename files.

* Standardizing method names.

* Adding anonymous namespaces.

* Applying C++ naming rules and aligning variable names across headers and cpp files.

* Syncing names across implementations.

* Rename deform_conv2d.h to deform_conv2d.cpp

* Use header files:
- Create header files for the kernel implementations and remove definitions from vision_*.h files.
- Eliminate unnecessary headers and ensure all cpp files include their headers.

* Change the naming convention for kernel implementations.

* Removing the _param postfix from variables and standardizing names.

* Exposing public forward/backward methods to the C++ API and moving methods around to minimize git blame changes.

* Encapsulate and standardize nms (#3081)

* Syncing, where possible, the names of functions across devices.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create a separate header file with "public" functions for each cpp file.

* Removing unnecessary repeated includes.

* Update CMakeLists.txt to include all headers.

* Encapsulate and standardize ps_roi_align (#3082)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.
Syncing, where possible, the names of functions across devices.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create a separate header file with "public" functions for each cpp file.

* Removing unnecessary repeated includes.

* Encapsulate and standardize ps_roi_pool (#3084)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create a separate header file with "public" functions for each cpp file.

* Removing unnecessary repeated includes.

* Encapsulate and standardize roi_align (#3085)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Adding all internal functions in anonymous namespaces.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create a separate header file with "public" functions for each cpp file.

* Removing unnecessary repeated includes.

* Encapsulate and standardize roi_pool  (#3088)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Adding all internal functions in anonymous namespaces.

* Syncing variable names between the cpp files and their header files.

* Renaming C++/CUDA kernel files and moving operator code from header to cpp file.

* Create a separate header file with "public" functions for each cpp file.

* Removing unnecessary repeated includes.

* Encapsulate and standardize new_empty_tensor_op (#3089)

* Renaming C++ files & methods according to recommended naming conventions and aligning them with Python's API.

* Create a separate header file with "public" functions for each cpp file.

* Adding all internal functions in anonymous namespaces.

* Convert to const ref all possible parameters.

* Removing unnecessary repeated includes.

* Encapsulate and standardize C++ Ops - Clean up (#3094)

* Removing unnecessary repeated includes.

* Remove unnecessary vision_cpu.h, vision_cuda.h, autocast.h.

* Fixing naming conventions and correcting method names in macros.

* Turning on the clang formatter for cu files and fixing broken styles.

* Replace "#ifndef ... #define ... #endif" with "#pragma once" on header files.

* Adding operator methods in vision::ops namespace. (#3096)

* Adding operator methods in vision::ops namespace.

* Replace general.h with macros.h

* Adding vision.h to the necessary cpp files.
parent 8520f0be
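
Taken together, the commits above converge on one layout per operator, which the diffs below instantiate for nms, ps_roi_align, ps_roi_pool, roi_align, roi_pool and deform_conv2d: a small "public" header per kernel file, and an implementation file whose kernels hide in an anonymous namespace inside vision::ops. A minimal sketch of that layout (scale_cuda and its file names are invented for illustration; they are not part of this commit):

// my_op_kernel.h -- a hypothetical "public" header, mirroring those below
#pragma once

#include <ATen/ATen.h>
#include "../macros.h"

namespace vision {
namespace ops {

VISION_API at::Tensor scale_cuda(const at::Tensor& input, double scale);

} // namespace ops
} // namespace vision

// my_op_kernel.cu -- the matching implementation file
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include "my_op_kernel.h"

namespace vision {
namespace ops {

namespace {

// Kernels take the _impl suffix and live in an anonymous namespace,
// giving them internal linkage so names cannot clash across files.
template <typename T>
__global__ void scale_kernel_impl(int n, const T* in, T s, T* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = in[i] * s;
}

} // namespace

// Only the operator entry point is visible outside this file.
at::Tensor scale_cuda(const at::Tensor& input, double scale) {
  TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
  auto input_ = input.contiguous();
  auto output = at::empty_like(input_);
  int n = static_cast<int>(input_.numel());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "scale_cuda", [&] {
        scale_kernel_impl<scalar_t><<<(n + 511) / 512, 512, 0, stream>>>(
            n,
            input_.data_ptr<scalar_t>(),
            static_cast<scalar_t>(scale),
            output.data_ptr<scalar_t>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
  return output;
}

} // namespace ops
} // namespace vision
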
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_helpers.h"
#include "nms_kernel.h"
#include <iostream>
#include <vector>
namespace vision {
namespace ops {
namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename T>
__device__ inline bool devIoU(T const* const a, T const* const b, const float threshold) {
__device__ inline bool devIoU(
T const* const a,
T const* const b,
const float threshold) {
T left = max(a[0], b[0]), right = min(a[2], b[2]);
T top = max(a[1], b[1]), bottom = min(a[3], b[3]);
T width = max(right - left, (T)0), height = max(bottom - top, (T)0);
@@ -21,7 +26,7 @@ __device__ inline bool devIoU(T const* const a, T const* const b, const float th
}
template <typename T>
__global__ void nms_kernel(
__global__ void nms_kernel_impl(
int n_boxes,
double iou_threshold,
const T* dev_boxes,
@@ -29,7 +34,8 @@ __global__ void nms_kernel(
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
if (row_start > col_start) return;
if (row_start > col_start)
return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
@@ -68,7 +74,10 @@ __global__ void nms_kernel(
}
}
at::Tensor nms_cuda(const at::Tensor& dets,
} // namespace
at::Tensor nms_cuda(
const at::Tensor& dets,
const at::Tensor& scores,
double iou_threshold) {
TORCH_CHECK(dets.is_cuda(), "dets must be a CUDA tensor");
@@ -118,8 +127,8 @@ at::Tensor nms_cuda(const at::Tensor& dets,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
dets_sorted.scalar_type(), "nms_kernel_cuda", [&] {
nms_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
dets_sorted.scalar_type(), "nms_cuda", [&] {
nms_kernel_impl<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num,
iou_threshold,
dets_sorted.data_ptr<scalar_t>(),
@@ -127,7 +136,8 @@ at::Tensor nms_cuda(const at::Tensor& dets,
});
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr<int64_t>();
unsigned long long* mask_host =
(unsigned long long*)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
@@ -155,3 +165,6 @@ at::Tensor nms_cuda(const at::Tensor& dets,
{keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
.to(order_t.device(), keep.scalar_type())});
}
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API at::Tensor nms_cuda(
const at::Tensor& dets,
const at::Tensor& scores,
double iou_threshold);
} // namespace ops
} // namespace vision
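
A note on the mask layout used by nms_cuda above: threadsPerBlock is 64, the bit width of an unsigned long long, so each box stores one 64-bit word per block of 64 boxes, with bit j set when its IoU with box j of that block exceeds iou_threshold. The host-side sweep over mask_cpu then reduces to the following standalone sketch (simplified from the remv loop above; nms_sweep is an invented name, and boxes are assumed already sorted by score):

#include <cstdint>
#include <vector>

// mask[i * col_blocks + b] holds the overlap bits of box i against the
// 64 boxes of block b.
std::vector<int64_t> nms_sweep(
    const std::vector<uint64_t>& mask,
    int64_t n_boxes,
    int64_t col_blocks) {
  std::vector<uint64_t> remv(col_blocks, 0); // bits of suppressed boxes
  std::vector<int64_t> keep;
  for (int64_t i = 0; i < n_boxes; i++) {
    int64_t block = i / 64, bit = i % 64;
    if (!(remv[block] & (1ULL << bit))) { // box i was not suppressed
      keep.push_back(i);
      const uint64_t* p = &mask[i * col_blocks];
      for (int64_t b = 0; b < col_blocks; b++)
        remv[b] |= p[b]; // suppress everything box i overlaps
    }
  }
  return keep;
}
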
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <THC/THCAtomics.cuh>
#include <stdio.h>
#include "cuda_helpers.h"
#include "ps_roi_align_kernel.h"
namespace vision {
namespace ops {
namespace {
template <typename T>
__device__ T bilinear_interpolate(
@@ -62,7 +65,7 @@ __device__ T bilinear_interpolate(
}
template <typename T>
__global__ void PSROIAlignForwardCUDA(
__global__ void ps_roi_align_forward_kernel_impl(
int nthreads,
const T* input,
const T spatial_scale,
@@ -195,7 +198,7 @@ __device__ void bilinear_interpolate_gradient(
}
template <typename T>
__global__ void PSROIAlignBackwardCUDA(
__global__ void ps_roi_align_backward_kernel_impl(
int nthreads,
const T* grad_output,
const int* channel_mapping,
@@ -292,7 +295,9 @@ __global__ void PSROIAlignBackwardCUDA(
}
}
std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda(
} // namespace
std::tuple<at::Tensor, at::Tensor> ps_roi_align_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
@@ -307,7 +312,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda(
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "PSROIAlign_forward_cuda";
at::CheckedFrom c = "ps_roi_align_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t});
@@ -337,15 +342,14 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
auto input_ = input.contiguous(),
rois_ = rois.contiguous();
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "PSROIAlign_forward", [&] {
PSROIAlignForwardCUDA<scalar_t><<<grid, block, 0, stream>>>(
input.scalar_type(), "ps_roi_align_forward_cuda", [&] {
ps_roi_align_forward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
@@ -365,7 +369,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda(
return std::make_tuple(output, channel_mapping);
}
at::Tensor PSROIAlign_backward_cuda(
at::Tensor ps_roi_align_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& channel_mapping,
@@ -381,13 +385,12 @@ at::Tensor PSROIAlign_backward_cuda(
TORCH_CHECK(grad.is_cuda(), "grad must be a CUDA tensor");
TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor");
TORCH_CHECK(
channel_mapping.is_cuda(),
"channel_mapping must be a CUDA tensor");
channel_mapping.is_cuda(), "channel_mapping must be a CUDA tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2},
channel_mapping_t{channel_mapping, "channel_mapping", 3};
at::CheckedFrom c = "PSROIAlign_backward_cuda";
at::CheckedFrom c = "ps_roi_align_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t, channel_mapping_t});
at::checkAllSameType(c, {grad_t, rois_t});
@@ -400,7 +403,7 @@ at::Tensor PSROIAlign_backward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
@@ -412,11 +415,10 @@ at::Tensor PSROIAlign_backward_cuda(
int channels_out = channels / (pooled_height * pooled_width);
auto grad_ = grad.contiguous(),
rois_ = rois.contiguous();
auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "PSROIAlign_backward", [&] {
PSROIAlignBackwardCUDA<scalar_t><<<grid, block, 0, stream>>>(
grad.scalar_type(), "ps_roi_align_backward_cuda", [&] {
ps_roi_align_backward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad_.data_ptr<scalar_t>(),
channel_mapping.data_ptr<int>(),
@@ -435,3 +437,6 @@ at::Tensor PSROIAlign_backward_cuda(
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API std::tuple<at::Tensor, at::Tensor> ps_roi_align_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio);
VISION_API at::Tensor ps_roi_align_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& channel_mapping,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width);
} // namespace ops
} // namespace vision
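
The grid computation that keeps recurring in these kernel files, here and below, is plain rounding-up division capped at 4096 blocks of 512 threads. Schematically (a standalone sketch with invented names; ceil_div is assumed to be the usual rounding-up integer division from ATen):

#include <algorithm>
#include <cstdint>

// Rounding-up integer division, as used by the grid computations above.
int64_t ceil_div(int64_t n, int64_t d) {
  return (n + d - 1) / d;
}

// One thread per output element in blocks of 512, capped at 4096 blocks;
// CUDA_1D_KERNEL_LOOP-style kernels then stride over the remainder.
int64_t grid_blocks(int64_t output_size) {
  return std::min(
      ceil_div(output_size, static_cast<int64_t>(512)),
      static_cast<int64_t>(4096));
}
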
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <THC/THCAtomics.cuh>
#include "cuda_helpers.h"
#include "ps_roi_pool_kernel.h"
namespace vision {
namespace ops {
namespace {
template <typename T>
__global__ void PSROIPoolForward(
__global__ void ps_roi_pool_forward_kernel_impl(
int nthreads,
const T* input,
const T spatial_scale,
@@ -73,7 +77,7 @@ __global__ void PSROIPoolForward(
}
template <typename T>
__global__ void PSROIPoolBackward(
__global__ void ps_roi_pool_backward_kernel_impl(
int nthreads,
const T* grad_output,
const int* channel_mapping,
@@ -132,7 +136,9 @@ __global__ void PSROIPoolBackward(
}
}
std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda(
} // namespace
std::tuple<at::Tensor, at::Tensor> ps_roi_pool_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
@@ -146,7 +152,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda(
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "PSROIPool_forward_cuda";
at::CheckedFrom c = "ps_roi_pool_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t});
@@ -176,15 +182,14 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
auto input_ = input.contiguous(),
rois_ = rois.contiguous();
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "PSROIPool_forward", [&] {
PSROIPoolForward<scalar_t><<<grid, block, 0, stream>>>(
input.scalar_type(), "ps_roi_pool_forward_cuda", [&] {
ps_roi_pool_forward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
@@ -202,7 +207,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda(
return std::make_tuple(output, channel_mapping);
}
at::Tensor PSROIPool_backward_cuda(
at::Tensor ps_roi_pool_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& channel_mapping,
@@ -217,13 +222,12 @@ at::Tensor PSROIPool_backward_cuda(
TORCH_CHECK(grad.is_cuda(), "grad must be a CUDA tensor");
TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor");
TORCH_CHECK(
channel_mapping.is_cuda(),
"channel_mapping must be a CUDA tensor");
channel_mapping.is_cuda(), "channel_mapping must be a CUDA tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2},
channel_mapping_t{channel_mapping, "channel_mapping", 3};
at::CheckedFrom c = "PSROIPool_backward_cuda";
at::CheckedFrom c = "ps_roi_pool_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t, channel_mapping_t});
at::checkAllSameType(c, {grad_t, rois_t});
@@ -236,7 +240,7 @@ at::Tensor PSROIPool_backward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
@@ -248,11 +252,10 @@ at::Tensor PSROIPool_backward_cuda(
int channels_out = channels / (pooled_height * pooled_width);
auto grad_ = grad.contiguous(),
rois_ = rois.contiguous();
auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "PSROIPool_backward", [&] {
PSROIPoolBackward<scalar_t><<<grid, block, 0, stream>>>(
grad.scalar_type(), "ps_roi_pool_backward_cuda", [&] {
ps_roi_pool_backward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad_.data_ptr<scalar_t>(),
channel_mapping.data_ptr<int>(),
@@ -270,3 +273,6 @@ at::Tensor PSROIPool_backward_cuda(
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API std::tuple<at::Tensor, at::Tensor> ps_roi_pool_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width);
VISION_API at::Tensor ps_roi_pool_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& channel_mapping,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width);
} // namespace ops
} // namespace vision
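
The channel_mapping tensor threaded through the ps_roi_pool (and ps_roi_align) signatures above records, for every output element, which input channel its position-sensitive bin read from, so the backward kernel can route gradients without recomputing the mapping. The forward mapping itself is deterministic; a sketch of the indexing convention (ps_input_channel is an invented helper name, not code from this commit):

#include <cstdint>

// Position-sensitive pooling expects channels == channels_out *
// pooled_height * pooled_width; output channel c_out at bin (ph, pw)
// always reads this input channel.
int64_t ps_input_channel(
    int64_t c_out,
    int64_t ph,
    int64_t pw,
    int64_t pooled_height,
    int64_t pooled_width) {
  return (c_out * pooled_height + ph) * pooled_width + pw;
}
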
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <THC/THCAtomics.cuh>
#include "cuda_helpers.h"
#include "roi_align_kernel.h"
namespace vision {
namespace ops {
namespace {
template <typename T>
__device__ T bilinear_interpolate(
@@ -61,7 +65,7 @@ __device__ T bilinear_interpolate(
}
template <typename T>
__global__ void RoIAlignForward(
__global__ void roi_align_forward_kernel_impl(
int nthreads,
const T* input,
const T spatial_scale,
@@ -197,7 +201,7 @@ __device__ void bilinear_interpolate_gradient(
}
template <typename T>
__global__ void RoIAlignBackward(
__global__ void roi_align_backward_kernel_impl(
int nthreads,
const T* grad_output,
const T spatial_scale,
@@ -308,9 +312,11 @@ __global__ void RoIAlignBackward(
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // RoIAlignBackward
}
at::Tensor ROIAlign_forward_cuda(
} // namespace
at::Tensor roi_align_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
@@ -320,12 +326,11 @@ at::Tensor ROIAlign_forward_cuda(
bool aligned) {
TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor");
TORCH_CHECK(
rois.size(1) == 5, "rois must have shape as Tensor[K, 5]");
TORCH_CHECK(rois.size(1) == 5, "rois must have shape as Tensor[K, 5]");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_forward_cuda";
at::CheckedFrom c = "roi_align_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t});
@@ -343,7 +348,7 @@ at::Tensor ROIAlign_forward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
@@ -352,28 +357,28 @@ at::Tensor ROIAlign_forward_cuda(
return output;
}
auto input_ = input.contiguous(),
rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "ROIAlign_forward", [&] {
RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
aligned,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "roi_align_forward_cuda", [&] {
roi_align_forward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
aligned,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
at::Tensor ROIAlign_backward_cuda(
at::Tensor roi_align_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
double spatial_scale,
@@ -390,7 +395,7 @@ at::Tensor ROIAlign_backward_cuda(
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_backward_cuda";
at::CheckedFrom c = "roi_align_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t});
at::checkAllSameType(c, {grad_t, rois_t});
@@ -402,7 +407,7 @@ at::Tensor ROIAlign_backward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
@@ -418,25 +423,29 @@ at::Tensor ROIAlign_backward_cuda(
int w_stride = grad.stride(3);
auto rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "ROIAlign_backward", [&] {
RoIAlignBackward<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
aligned,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>(),
n_stride,
c_stride,
h_stride,
w_stride);
});
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "roi_align_backward_cuda", [&] {
roi_align_backward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
aligned,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>(),
n_stride,
c_stride,
h_stride,
w_stride);
});
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API at::Tensor roi_align_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio,
bool aligned);
VISION_API at::Tensor roi_align_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t sampling_ratio,
bool aligned);
} // namespace ops
} // namespace vision
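
For reference, the bilinear_interpolate device helper renamed above samples the input at a fractional (y, x) by blending the four surrounding pixels. A simplified host-side sketch of the same math (bilinear_sample is an invented name, and the real kernel's boundary handling differs slightly):

#include <algorithm>

// Sample one H x W channel plane at fractional coordinates (y, x),
// weighting each of the four neighbours by the opposite fractional
// distances.
template <typename T>
T bilinear_sample(const T* data, int height, int width, T y, T x) {
  if (y < -1.0 || y > height || x < -1.0 || x > width)
    return 0; // (y, x) falls entirely outside the image
  y = std::min(std::max(y, T(0)), T(height - 1));
  x = std::min(std::max(x, T(0)), T(width - 1));
  int y_low = static_cast<int>(y);
  int x_low = static_cast<int>(x);
  int y_high = std::min(y_low + 1, height - 1);
  int x_high = std::min(x_low + 1, width - 1);
  T ly = y - y_low, lx = x - x_low; // fractional parts
  T hy = T(1) - ly, hx = T(1) - lx;
  return hy * hx * data[y_low * width + x_low] +
      hy * lx * data[y_low * width + x_high] +
      ly * hx * data[y_high * width + x_low] +
      ly * lx * data[y_high * width + x_high];
}
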
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <float.h>
#include <THC/THCAtomics.cuh>
#include "cuda_helpers.h"
#include "roi_pool_kernel.h"
namespace vision {
namespace ops {
namespace {
template <typename T>
__global__ void RoIPoolForward(
__global__ void roi_pool_forward_kernel_impl(
int nthreads,
const T* input,
const T spatial_scale,
@@ -72,7 +77,7 @@ __global__ void RoIPoolForward(
}
template <typename T>
__global__ void RoIPoolBackward(
__global__ void roi_pool_backward_kernel_impl(
int nthreads,
const T* grad_output,
const int* argmax_data,
@@ -115,7 +120,9 @@ __global__ void RoIPoolBackward(
}
}
std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
} // namespace
std::tuple<at::Tensor, at::Tensor> roi_pool_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
@@ -128,7 +135,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIPool_forward_cuda";
at::CheckedFrom c = "roi_pool_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t});
@@ -149,7 +156,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
@@ -158,27 +165,27 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
return std::make_tuple(output, argmax);
}
auto input_ = input.contiguous(),
rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "ROIPool_forward", [&] {
RoIPoolForward<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
argmax.data_ptr<int>());
});
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "roi_pool_forward_cuda", [&] {
roi_pool_forward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
argmax.data_ptr<int>());
});
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(output, argmax);
}
at::Tensor ROIPool_backward_cuda(
at::Tensor roi_pool_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& argmax,
@@ -197,7 +204,7 @@ at::Tensor ROIPool_backward_cuda(
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2},
argmax_t{argmax, "argmax", 3};
at::CheckedFrom c = "ROIPool_backward_cuda";
at::CheckedFrom c = "roi_pool_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t, argmax_t});
at::checkAllSameType(c, {grad_t, rois_t});
@@ -211,7 +218,7 @@ at::Tensor ROIPool_backward_cuda(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
@@ -226,27 +233,30 @@ at::Tensor ROIPool_backward_cuda(
int h_stride = grad.stride(2);
int w_stride = grad.stride(3);
auto argmax_ = argmax.contiguous(),
rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "ROIPool_backward", [&] {
RoIPoolBackward<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad.data_ptr<scalar_t>(),
argmax_.data_ptr<int>(),
num_rois,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>(),
n_stride,
c_stride,
h_stride,
w_stride);
});
auto argmax_ = argmax.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "roi_pool_backward_cuda", [&] {
roi_pool_backward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad.data_ptr<scalar_t>(),
argmax_.data_ptr<int>(),
num_rois,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>(),
n_stride,
c_stride,
h_stride,
w_stride);
});
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
} // namespace ops
} // namespace vision
#pragma once
#include <ATen/ATen.h>
#include "../macros.h"
namespace vision {
namespace ops {
VISION_API std::tuple<at::Tensor, at::Tensor> roi_pool_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width);
VISION_API at::Tensor roi_pool_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& argmax,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width);
} // namespace ops
} // namespace vision
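
With one header per kernel file, C++ consumers can now include just what they need. A hypothetical caller of the roi_pool header above might look like this (the include path and build setup are assumptions, and a CUDA device is required):

#include <tuple>
#include <ATen/ATen.h>
#include "roi_pool_kernel.h" // assumed path within torchvision's csrc tree

int main() {
  // One single-channel 4x4 feature map and one RoI covering all of it;
  // each RoI row is (batch_index, x1, y1, x2, y2).
  at::Tensor input = at::rand({1, 1, 4, 4}, at::kCUDA);
  at::Tensor rois =
      at::tensor({0.f, 0.f, 0.f, 3.f, 3.f}, at::kCUDA).view({1, 5});
  auto result = vision::ops::roi_pool_forward_cuda(
      input,
      rois,
      /*spatial_scale=*/1.0,
      /*pooled_height=*/2,
      /*pooled_width=*/2);
  at::Tensor output = std::get<0>(result); // shape [1, 1, 2, 2]
  at::Tensor argmax = std::get<1>(result); // flat input index of each max
  return 0;
}
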
#pragma once
#include <torch/extension.h>
#include "../macros.h"
VISION_API at::Tensor DeformConv2d_forward_cuda(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t groups,
int64_t deformable_groups,
bool use_mask);
VISION_API std::
tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
DeformConv2d_backward_cuda(
const at::Tensor& grad_out,
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t groups,
int64_t deformable_groups,
bool use_mask);
VISION_API at::Tensor nms_cuda(
const at::Tensor& dets,
const at::Tensor& scores,
double iou_threshold);
VISION_API std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio);
VISION_API at::Tensor PSROIAlign_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& channel_mapping,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width);
VISION_API std::tuple<at::Tensor, at::Tensor> PSROIPool_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width);
VISION_API at::Tensor PSROIPool_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& channel_mapping,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width);
VISION_API at::Tensor ROIAlign_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t sampling_ratio,
bool aligned);
VISION_API at::Tensor ROIAlign_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
double spatial_scale,
int64_t pooled_height,
int64_t pooled_width,
int64_t batch_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t sampling_ratio,
bool aligned);
VISION_API std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
const double spatial_scale,
const int64_t pooled_height,
const int64_t pooled_width);
VISION_API at::Tensor ROIPool_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const at::Tensor& argmax,
const double spatial_scale,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t batch_size,
const int64_t channels,
const int64_t height,
const int64_t width);
#pragma once
#include "deform_conv2d.h"
#include <torch/extension.h>
#include "cpu/vision_cpu.h"
#ifdef WITH_CUDA
#include "autocast.h"
#include "cuda/vision_cuda.h"
#endif
#ifdef WITH_HIP
#include "autocast.h"
#include "hip/vision_cuda.h"
#if defined(WITH_CUDA) || defined(WITH_HIP)
#include <ATen/autocast_mode.h>
#endif
// TODO: put this stuff in torchvision namespace
namespace vision {
namespace ops {
at::Tensor deform_conv2d(
const at::Tensor& input,
@@ -49,7 +44,7 @@ at::Tensor deform_conv2d(
}
#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor DeformConv2d_autocast(
at::Tensor deform_conv2d_autocast(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
@@ -123,6 +118,8 @@ _deform_conv2d_backward(
use_mask);
}
namespace {
class DeformConv2dFunction
: public torch::autograd::Function<DeformConv2dFunction> {
public:
@@ -297,7 +294,9 @@ class DeformConv2dBackwardFunction
}
};
at::Tensor DeformConv2d_autograd(
} // namespace
at::Tensor deform_conv2d_autograd(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
@@ -330,7 +329,7 @@ at::Tensor DeformConv2d_autograd(
}
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
DeformConv2d_backward_autograd(
deform_conv2d_backward_autograd(
const at::Tensor& grad,
const at::Tensor& input,
const at::Tensor& weight,
@@ -365,3 +364,6 @@ DeformConv2d_backward_autograd(
return std::make_tuple(result[0], result[1], result[2], result[3], result[4]);
}
} // namespace ops
} // namespace vision
#pragma once
#include "cpu/deform_conv2d_kernel.h"
#ifdef WITH_CUDA
#include "cuda/deform_conv2d_kernel.h"
#endif
#ifdef WITH_HIP
#include "hip/deform_conv2d_kernel.h"
#endif
namespace vision {
namespace ops {
// C++ Forward
at::Tensor deform_conv2d(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t groups,
int64_t offset_groups,
bool use_mask);
// Autocast Forward
#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor deform_conv2d_autocast(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t groups,
int64_t offset_groups,
bool use_mask);
#endif
// C++ Backward
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
_deform_conv2d_backward(
const at::Tensor& grad,
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t groups,
int64_t offset_groups,
bool use_mask);
// Autograd Forward and Backward
at::Tensor deform_conv2d_autograd(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t groups,
int64_t offset_groups,
bool use_mask);
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
deform_conv2d_backward_autograd(
const at::Tensor& grad,
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& offset,
const at::Tensor& mask,
const at::Tensor& bias,
int64_t stride_h,
int64_t stride_w,
int64_t pad_h,
int64_t pad_w,
int64_t dilation_h,
int64_t dilation_w,
int64_t groups,
int64_t offset_groups,
bool use_mask);
} // namespace ops
} // namespace vision
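
The autocast wrappers renamed in this commit (deform_conv2d_autocast above) follow the usual ATen pattern: step out of the Autocast dispatch key, cast tensor arguments to fp32, run the regular op, and cast the result back to the caller's dtype. A minimal sketch with an invented unary op square_op (the real wrapper does the same with deform_conv2d's full argument list):

#include <ATen/ATen.h>
#include <ATen/autocast_mode.h>
#include <c10/core/impl/LocalDispatchKeySet.h>

namespace vision {
namespace ops {

// The regular op, assumed to exist for this sketch.
at::Tensor square_op(const at::Tensor& input);

#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor square_op_autocast(const at::Tensor& input) {
  // Prevent re-entering the autocast kernel while calling the real op.
  c10::impl::ExcludeDispatchKeyGuard no_autocast(
      c10::DispatchKey::Autocast);
  // Run in fp32 regardless of the autocast dtype, then restore the
  // caller's dtype on the way out.
  return square_op(at::autocast::cached_cast(at::kFloat, input))
      .to(input.scalar_type());
}
#endif

} // namespace ops
} // namespace vision
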
#ifndef TORCHVISION_MACROS_H
#define TORCHVISION_MACROS_H
#pragma once
#ifdef _WIN32
#if defined(torchvision_EXPORTS)
@@ -20,5 +19,3 @@
#define VISION_INLINE_VARIABLE __attribute__((weak))
#endif
#endif
#endif // TORCHVISION_MACROS_H
#ifndef ALEXNET_H
#define ALEXNET_H
#pragma once
#include <torch/torch.h>
#include "general.h"
#include "../macros.h"
namespace vision {
namespace models {
@@ -20,5 +19,3 @@ TORCH_MODULE(AlexNet);
} // namespace models
} // namespace vision
#endif // ALEXNET_H
#ifndef DENSENET_H
#define DENSENET_H
#pragma once
#include <torch/torch.h>
#include "general.h"
#include "../macros.h"
namespace vision {
namespace models {
@@ -82,5 +81,3 @@ TORCH_MODULE(DenseNet161);
} // namespace models
} // namespace vision
#endif // DENSENET_H
#ifndef VISION_GENERAL_H
#define VISION_GENERAL_H
#ifdef _WIN32
#if defined(torchvision_EXPORTS)
#define VISION_API __declspec(dllexport)
#else
#define VISION_API __declspec(dllimport)
#endif
#else
#define VISION_API
#endif
#endif // VISION_GENERAL_H
\ No newline at end of file
#ifndef GOOGLENET_H
#define GOOGLENET_H
#pragma once
#include <torch/torch.h>
#include "general.h"
#include "../macros.h"
namespace vision {
namespace models {
@@ -86,5 +85,3 @@ TORCH_MODULE(GoogLeNet);
} // namespace models
} // namespace vision
#endif // GOOGLENET_H
#ifndef INCEPTION_H
#define INCEPTION_H
#pragma once
#include <torch/torch.h>
#include "general.h"
#include "../macros.h"
namespace vision {
namespace models {
@@ -124,5 +123,3 @@ TORCH_MODULE(InceptionV3);
} // namespace models
} // namespace vision
#endif // INCEPTION_H
#ifndef MNASNET_H
#define MNASNET_H
#pragma once
#include <torch/torch.h>
#include "general.h"
#include "../macros.h"
namespace vision {
namespace models {
@@ -43,5 +42,3 @@ TORCH_MODULE(MNASNet1_3);
} // namespace models
} // namespace vision
#endif // MNASNET_H