Unverified Commit 230f9a3b authored by q.yao, committed by GitHub

Refactor csrc with device dispatcher (#1463)

* Add device registry for pytorch ops

* add declaration of CheckDeviceConsistency

* fix for torch130

* assert with torch check

* Refactor ops with dispatch

* update rest ops

* faster install

* update compatibility

* update compatibility, rename parameter

* move cpu implement to pytorch/cpu

* update ops/csrc/README.md

* fix rocm support

* update cn document

* update docs

* list instead of map
parent ef8ba752
### v1.3.18
Some ops have different implementations on different devices. Lots of macros and type checks are scattered in several files, which makes the code hard to maintain. For example:
```c++
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
```
A registry and a dispatcher are added to manage these implementations.
```c++
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
}
// register cuda implementation
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);
// roi_align.cpp
// use the dispatcher to invoke different implementations depending on the device type of the input tensors.
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
argmax_x, aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
```
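With this mechanism, supporting another backend only needs one more registration. Below is a minimal sketch (hedged; it assumes a `roi_align_forward_cpu` wrapper exists under `ops/csrc/pytorch/cpu`, mirroring the CPU branch in the old code above) showing how the CPU implementation plugs into the same dispatcher:
```c++
// pytorch/cpu/roi_align.cpp (sketch)
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax_y, Tensor argmax_x,
                           int aligned_height, int aligned_width,
                           float spatial_scale, int sampling_ratio,
                           int pool_mode, bool aligned);

// declaration of the shared interface, identical to the CUDA side
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

// register cpu implementation; DISPATCH_DEVICE_IMPL in roi_align.cpp will now
// route CPU tensors here and CUDA tensors to roi_align_forward_cuda
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CPU, roi_align_forward_cpu);
```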
### v1.3.11
In order to flexibly support more backends and hardware such as `NVIDIA GPUs` and `AMD GPUs`, the `mmcv/ops/csrc` directory has been refactored. Note that this refactoring will not affect the API usage. For related information, please refer to [PR1206](https://github.com/open-mmlab/mmcv/pull/1206).
......
### v1.3.18
Some custom ops have different implementations on different devices. The large number of macros and type checks added for this makes the code hard to maintain. For example:
```c++
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
```
To address this, we designed a registration and dispatch mechanism to better manage these op implementations.
```c++
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
}
// register the cuda implementation of the op
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);
// roi_align.cpp
// use the dispatcher to select the implementation according to the device type of the tensor arguments
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
argmax_x, aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
```
### v1.3.11
In order to flexibly support more backends and hardware such as `NVIDIA GPUs` and `AMD GPUs`, we refactored the `mmcv/ops/csrc` directory. Note that this refactoring will not affect the API usage. For more information, please refer to [PR1206](https://github.com/open-mmlab/mmcv/pull/1206).
......
......@@ -12,6 +12,7 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│ ├── parrots_cuda_helper.hpp
│ ├── pytorch_cpp_helper.hpp
│ ├── pytorch_cuda_helper.hpp
│ ├── pytorch_device_registry.hpp
│   └── cuda
│   ├── common_cuda_helper.hpp
│   ├── parrots_cudawarpfunction.cuh
......@@ -37,9 +38,12 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│   ├── pybind.cpp
│   ├── ...
│   ├── ops.cpp
│   ├── cuda
│   │   ├── ...
│   │   └── ops_cuda.cu
│   └── cpu
│      ├── ...
│      └── ops.cpp
└── tensorrt
├── trt_cuda_helper.cuh
├── trt_plugin_helper.hpp
......@@ -64,6 +68,7 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory.
- `cuda`: This directory contains cuda kernel launchers, which feed tensor memory pointers to the cuda kernels in `common/cuda`. The launchers provide the c++ interface to the cuda implementations of the corresponding custom ops (see the sketch after this list).
- `cpu`: This directory contains cpu implementations of the corresponding custom ops.
- `tensorrt`: **TensorRT** support for custom ops.
- `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`.
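To make the launcher pattern concrete, here is a hedged, self-contained sketch (a toy `toy_relu` op, not an actual MMCV op): the launcher extracts raw pointers from the tensors and feeds them to a kernel that would normally live under `common/cuda`.
```c++
#include <ATen/cuda/CUDAContext.h>
#include <torch/types.h>

// toy kernel; real kernels live in common/cuda/*_cuda_kernel.cuh
template <typename T>
__global__ void toy_relu_forward_cuda_kernel(const int n, const T* in, T* out) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] > T(0) ? in[i] : T(0);
}

// the launcher: a plain c++ interface that feeds tensor memory pointers to the
// cuda kernel on the current stream
void ToyReLUForwardCUDAKernelLauncher(at::Tensor input, at::Tensor output) {
  const int num = input.numel();
  const int threads = 256;
  const int blocks = (num + threads - 1) / threads;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "toy_relu_forward_cuda_kernel", [&] {
        toy_relu_forward_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            num, input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());
      });
}
```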
......@@ -102,42 +107,38 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
}
```
2. Register implementation for different devices.
```c++
// src/pytorch/cuda/cudabind.cpp
...
Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
// implement cuda forward here
// use `NewOpsForwardCUDAKernelLauncher` here
}
// declare interface here.
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
// register the implementation for given device (CUDA here).
REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
```
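If the op also has a CPU implementation, it is registered the same way from the `cpu` directory (a hedged sketch; `new_ops_forward_cpu` is the hypothetical CPU counterpart of the op above):
```c++
// src/pytorch/cpu/new_ops.cpp (sketch)
Tensor new_ops_forward_cpu(Tensor input, Tensor output, ...){
  // implement cpu forward here
}

// declare interface here.
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);

// register the implementation for the CPU device.
REGISTER_DEVICE_IMPL(new_ops_forward_impl, CPU, new_ops_forward_cpu);
```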
3. Add ops implementation in `pytorch` directory. Select different implementations according to device type.
```c++
// src/pytorch/new_ops.cpp
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
// dispatch the implementation according to the device type of input.
DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
}
...
Tensor new_ops_forward(Tensor input, Tensor output, ...){
return new_ops_forward_impl(input, output, ...);
}
```
4. Binding the implementation in `pytorch/pybind.cpp`
```c++
// src/pytorch/pybind.cpp
......@@ -156,7 +157,7 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
```
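The binding itself is elided in the snippet above; a hedged sketch of what such an entry usually looks like for the hypothetical op (argument names are illustrative):
```c++
// inside PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) in src/pytorch/pybind.cpp
m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
      py::arg("input"), py::arg("output"));
```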
5. Build MMCV again. Enjoy new ops in python
```python
from ..utils import ext_loader
......
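# A hedged usage sketch (not part of the diff): `new_ops_forward` is the
# hypothetical op bound above; mmcv ops load the compiled extension through
# ext_loader and then call the bound function.
ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
# output = input.new_zeros(...)  # allocate the output tensor first
# ext_module.new_ops_forward(input, output)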
......@@ -14,7 +14,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/types.h>
#include <torch/extension.h>
#include <iostream>
#include <vector>
......
......@@ -217,7 +217,6 @@ __global__ void ms_deformable_im2col_gpu_kernel(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -278,7 +277,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -369,7 +367,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -463,7 +460,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -555,7 +551,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -658,7 +653,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......@@ -757,7 +751,6 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
......
......@@ -34,6 +34,14 @@ __device__ __forceinline__ static void reduceMax(double *address, double val) {
}
// get rid of meaningless warnings when compiling host code
#ifdef HIP_DIFF
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
atomicAdd(address, val);
}
__device__ __forceinline__ static void reduceAdd(double *address, double val) {
atomicAdd(address, val);
}
#else
#ifdef __CUDA_ARCH__
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
#if (__CUDA_ARCH__ < 200)
......@@ -77,7 +85,8 @@ __device__ __forceinline__ static void reduceAdd(double *address, double val) {
atomicAdd(address, val);
#endif
}
#endif // __CUDA_ARCH__
#endif // HIP_DIFF
template <typename T>
__global__ void feats_reduce_kernel(
......
#ifndef PYTORCH_DEVICE_REGISTRY_H
#define PYTORCH_DEVICE_REGISTRY_H
#include <torch/extension.h>
#include <cassert>
#include <functional>
#include <map>
#include <type_traits>
inline std::string GetDeviceStr(const at::Device& device) {
std::string str = DeviceTypeName(device.type(), true);
if (device.has_index()) {
str.push_back(':');
str.append(std::to_string(device.index()));
}
return str;
}
// Registry
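// DeviceRegistry keeps one function pointer per at::DeviceType for a single
// dispatched function f. A fixed-size array indexed by the device type is used
// instead of a map, so lookup is a plain array access.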
template <typename F, F f>
class DeviceRegistry;
template <typename Ret, typename... Args, Ret (*f)(Args...)>
class DeviceRegistry<Ret (*)(Args...), f> {
public:
using FunctionType = Ret (*)(Args...);
static const int MAX_DEVICE_TYPES =
int8_t(at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);
void Register(at::DeviceType device, FunctionType function) {
funcs_[int8_t(device)] = function;
}
FunctionType Find(at::DeviceType device) const {
return funcs_[int8_t(device)];
}
static DeviceRegistry& instance() {
static DeviceRegistry inst;
return inst;
}
private:
DeviceRegistry() {
for (size_t i = 0; i < MAX_DEVICE_TYPES; ++i) {
funcs_[i] = nullptr;
}
};
FunctionType funcs_[MAX_DEVICE_TYPES];
};
// get device of first tensor param
template <typename T, typename... Args,
std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,
bool> = true>
at::Device GetFirstTensorDevice(T&& t, Args&&... args) {
return std::forward<T>(t).device();
}
template <typename T, typename... Args,
std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,
bool> = true>
at::Device GetFirstTensorDevice(T&& t, Args&&... args) {
return GetFirstTensorDevice(std::forward<Args>(args)...);
}
// check device consistency
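// Returns {index_of_first_mismatching_argument, its_device}. If every tensor
// argument is on `device`, the returned index equals sizeof...(Args), which is
// what Dispatch() checks below.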
inline std::pair<int, at::Device> CheckDeviceConsistency(
const at::Device& device, int index) {
return {index, device};
}
template <typename T, typename... Args,
std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,
bool> = true>
std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
int index, T&& t,
Args&&... args);
template <typename T, typename... Args,
std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,
bool> = true>
std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
int index, T&& t,
Args&&... args) {
auto new_device = std::forward<T>(t).device();
if (new_device.type() != device.type() ||
new_device.index() != device.index()) {
return {index, new_device};
}
return CheckDeviceConsistency(device, index + 1, std::forward<Args>(args)...);
}
template <
typename T, typename... Args,
std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>
std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
int index, T&& t,
Args&&... args) {
return CheckDeviceConsistency(device, index + 1, std::forward<Args>(args)...);
}
// dispatch
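// Dispatch: check that all tensor arguments share the device of the first
// tensor argument, look up the implementation registered for that device type,
// and forward the arguments to it.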
template <typename R, typename... Args>
auto Dispatch(const R& registry, const char* name, Args&&... args) {
auto device = GetFirstTensorDevice(std::forward<Args>(args)...);
auto inconsist =
CheckDeviceConsistency(device, 0, std::forward<Args>(args)...);
TORCH_CHECK(inconsist.first >= int(sizeof...(Args)), name, ": at param ",
inconsist.first,
", inconsistent device: ", GetDeviceStr(inconsist.second).c_str(),
" vs ", GetDeviceStr(device).c_str(), "\n")
auto f_ptr = registry.Find(device.type());
TORCH_CHECK(f_ptr != nullptr, name, ": implementation for device ",
GetDeviceStr(device).c_str(), " not found.\n")
return f_ptr(std::forward<Args>(args)...);
}
// helper macro
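// DEVICE_REGISTRY(key) names the singleton registry of the function `key`.
// REGISTER_DEVICE_IMPL defines a static object whose constructor registers
// `value` as the implementation of `key` for `device` at load time.
// DISPATCH_DEVICE_IMPL routes a call through Dispatch() above.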
#define DEVICE_REGISTRY(key) DeviceRegistry<decltype(&(key)), key>::instance()
#define REGISTER_DEVICE_IMPL(key, device, value) \
struct key##_##device##_registerer { \
key##_##device##_registerer() { \
DEVICE_REGISTRY(key).Register(at::k##device, value); \
} \
}; \
static key##_##device##_registerer _##key##_##device##_registerer;
#define DISPATCH_DEVICE_IMPL(key, ...) \
Dispatch(DEVICE_REGISTRY(key), #key, __VA_ARGS__)
#endif // PYTORCH_DEVICE_REGISTRY
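// A minimal usage sketch of this registry (toy names, not MMCV ops), assuming
// this header and the usual torch headers are available:
//
//   // 1. declare the dispatchable interface
//   at::Tensor toy_add_impl(at::Tensor a, at::Tensor b);
//   // 2. provide and register a backend implementation (CPU here)
//   at::Tensor toy_add_cpu(at::Tensor a, at::Tensor b) { return a + b; }
//   REGISTER_DEVICE_IMPL(toy_add_impl, CPU, toy_add_cpu);
//   // 3. define the interface: dispatch on the device of the first tensor
//   at::Tensor toy_add_impl(at::Tensor a, at::Tensor b) {
//     return DISPATCH_DEVICE_IMPL(toy_add_impl, a, b);
//   }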
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
int aggregate, const Tensor& points,
const Tensor& centers,
const Tensor& scores,
const Tensor& knn_idx, Tensor& output) {
DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O,
aggregate, points, centers, scores, knn_idx, output);
}
void assign_score_withk_backward_impl(
int B, int N0, int N1, int M, int K, int O, int aggregate,
const Tensor& grad_out, const Tensor& points, const Tensor& centers,
const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
Tensor& grad_centers, Tensor& grad_scores) {
DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O,
aggregate, grad_out, points, centers, scores, knn_idx,
grad_points, grad_centers, grad_scores);
}
void assign_score_withk_forward(const Tensor& points, const Tensor& centers,
const Tensor& scores, const Tensor& knn_idx,
Tensor& output, int B, int N0, int N1, int M,
int K, int O, int aggregate) {
assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points,
centers, scores, knn_idx, output);
}
void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
......@@ -62,24 +36,7 @@ void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
Tensor& grad_centers, Tensor& grad_scores,
int B, int N0, int N1, int M, int K, int O,
int aggregate) {
assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out,
points, centers, scores, knn_idx,
grad_points, grad_centers, grad_scores);
}
......@@ -2,36 +2,19 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void ball_query_forward_impl(int b, int n, int m, float min_radius,
float max_radius, int nsample,
const Tensor new_xyz, const Tensor xyz,
Tensor idx) {
DISPATCH_DEVICE_IMPL(ball_query_forward_impl, b, n, m, min_radius, max_radius,
nsample, new_xyz, xyz, idx);
}
void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
Tensor idx_tensor, int b, int n, int m,
float min_radius, float max_radius, int nsample) {
ball_query_forward_impl(b, n, m, min_radius, max_radius, nsample,
new_xyz_tensor, xyz_tensor, idx_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
DISPATCH_DEVICE_IMPL(bbox_overlaps_impl, bboxes1, bboxes2, ious, mode,
aligned, offset);
}
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void border_align_forward_impl(const Tensor &input, const Tensor &boxes,
Tensor output, Tensor argmax_idx,
const int pool_size) {
DISPATCH_DEVICE_IMPL(border_align_forward_impl, input, boxes, output,
argmax_idx, pool_size);
}
void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
const Tensor &argmax_idx, Tensor grad_input,
const int pool_size) {
DISPATCH_DEVICE_IMPL(border_align_backward_impl, grad_output, boxes,
argmax_idx, grad_input, pool_size);
}
void border_align_forward(const Tensor &input, const Tensor &boxes,
Tensor output, Tensor argmax_idx,
const int pool_size) {
border_align_forward_impl(input, boxes, output, argmax_idx, pool_size);
}
void border_align_backward(const Tensor &grad_output, const Tensor &boxes,
const Tensor &argmax_idx, Tensor grad_input,
const int pool_size) {
border_align_backward_impl(grad_output, boxes, argmax_idx, grad_input,
pool_size);
}
......@@ -2,28 +2,18 @@
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, boxes1, boxes2, ious, mode_flag,
aligned);
}
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
box_iou_rotated_impl(boxes1, boxes2, ious, mode_flag, aligned);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
DISPATCH_DEVICE_IMPL(carafe_forward_impl, features, masks, rfeatures, routput,
rmasks, output, kernel_size, group_size, scale_factor);
}
void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad,
Tensor bottom_grad, Tensor mask_grad, int kernel_size,
int group_size, int scale_factor) {
DISPATCH_DEVICE_IMPL(carafe_backward_impl, top_grad, rfeatures, masks,
rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,
bottom_grad, mask_grad, kernel_size, group_size,
scale_factor);
}
void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
carafe_forward_impl(features, masks, rfeatures, routput, rmasks, output,
kernel_size, group_size, scale_factor);
}
void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
......@@ -61,24 +32,7 @@ void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
Tensor mask_grad, int kernel_size, int group_size,
int scale_factor) {
carafe_backward_impl(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
rbottom_grad, rmask_grad, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size,
int scale_factor) {
DISPATCH_DEVICE_IMPL(carafe_naive_forward_impl, features, masks, output,
kernel_size, group_size, scale_factor);
}
void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size,
int scale_factor) {
DISPATCH_DEVICE_IMPL(carafe_naive_backward_impl, top_grad, features, masks,
bottom_grad, mask_grad, kernel_size, group_size,
scale_factor);
}
void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
carafe_naive_forward_impl(features, masks, output, kernel_size, group_size,
scale_factor);
}
void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size, int scale_factor) {
carafe_naive_backward_impl(top_grad, features, masks, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
}
......@@ -2,65 +2,37 @@
#include <iostream>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,
int kH, int kW, int patchH, int patchW, int padH,
int padW, int dilationH, int dilationW,
int dilation_patchH, int dilation_patchW, int dH,
int dW) {
DISPATCH_DEVICE_IMPL(correlation_forward_impl, input1, input2, output, kH, kW,
patchH, patchW, padH, padW, dilationH, dilationW,
dilation_patchH, dilation_patchW, dH, dW);
}
void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,
Tensor grad_input1, Tensor grad_input2, int kH,
int kW, int patchH, int patchW, int padH,
int padW, int dilationH, int dilationW,
int dilation_patchH, int dilation_patchW, int dH,
int dW) {
DISPATCH_DEVICE_IMPL(correlation_backward_impl, grad_output, input1, input2,
grad_input1, grad_input2, kH, kW, patchH, patchW, padH,
padW, dilationH, dilationW, dilation_patchH,
dilation_patchW, dH, dW);
}
void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,
int kW, int patchH, int patchW, int padH, int padW,
int dilationH, int dilationW, int dilation_patchH,
int dilation_patchW, int dH, int dW) {
correlation_forward_impl(input1, input2, output, kH, kW, patchH, patchW, padH,
padW, dilationH, dilationW, dilation_patchH,
dilation_patchW, dH, dW);
}
void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
......@@ -68,20 +40,8 @@ void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
int kW, int patchH, int patchW, int padH, int padW,
int dilationH, int dilationW, int dilation_patchH,
int dilation_patchW, int dH, int dW) {
correlation_backward_impl(grad_output, input1, input2, grad_input1,
grad_input2, kH, kW, patchH, patchW, padH, padW,
dilationH, dilationW, dilation_patchH,
dilation_patchW, dH, dW);
}
......@@ -3,6 +3,7 @@
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T>
void box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
......@@ -31,3 +32,7 @@ void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
}
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
REGISTER_DEVICE_IMPL(box_iou_rotated_impl, CPU, box_iou_rotated_cpu);
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T>
T deformable_im2col_bilinear_cpu(const T *input, const int data_width,
......@@ -375,3 +376,33 @@ void deformable_col2im_coord_cpu(
height_col, width_col, grad_offset_);
}));
}
void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col);
void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im);
void deformable_col2im_coord_impl(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset);
REGISTER_DEVICE_IMPL(deformable_im2col_impl, CPU, deformable_im2col_cpu);
REGISTER_DEVICE_IMPL(deformable_col2im_impl, CPU, deformable_col2im_cpu);
REGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CPU,
deformable_col2im_coord_cpu);
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T>
T dmcn_im2col_bilinear_cpu(const T *input, const int data_width,
......@@ -322,7 +323,7 @@ void modulated_deformable_im2col_cpu(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col) {
// num_axes should be smaller than block size
......@@ -338,7 +339,7 @@ void modulated_deformable_im2col_cpu(
modulated_deformable_im2col_cpu_kernel(
num_kernels, data_im_, data_offset_, data_mask_, height_im,
width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, batch_size,
channels, deformable_group, height_col, width_col, data_col_);
}));
......@@ -401,3 +402,35 @@ void modulated_deformable_col2im_coord_cpu(
height_col, width_col, grad_offset_, grad_mask_);
}));
}
void modulated_deformable_im2col_impl(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col);
void modulated_deformable_col2im_impl(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im);
void modulated_deformable_col2im_coord_impl(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask);
REGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CPU,
modulated_deformable_im2col_cpu);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CPU,
modulated_deformable_col2im_cpu);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CPU,
modulated_deformable_col2im_coord_cpu);
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto nboxes = boxes.size(0);
Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));
auto select = select_t.data_ptr<bool>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold) select[_j] = false;
}
}
return order_t.masked_select(select_t);
}
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);
REGISTER_DEVICE_IMPL(nms_impl, CPU, nms_cpu);
Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
auto scores_t = scores.clone();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto nboxes = boxes.size(0);
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto sc = scores_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
auto de = dets.data_ptr<float>();
int64_t pos = 0;
Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
auto inds = inds_t.data_ptr<int64_t>();
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < sc[pos]) {
max_score = sc[pos];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = x1[max_pos];
auto iy1 = de[i * 5 + 1] = y1[max_pos];
auto ix2 = de[i * 5 + 2] = x2[max_pos];
auto iy2 = de[i * 5 + 3] = y2[max_pos];
auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = areas[max_pos];
auto iind = inds[max_pos];
x1[max_pos] = x1[i];
y1[max_pos] = y1[i];
x2[max_pos] = x2[i];
y2[max_pos] = y2[i];
sc[max_pos] = sc[i];
areas[max_pos] = areas[i];
inds[max_pos] = inds[i];
x1[i] = ix1;
y1[i] = iy1;
x2[i] = ix2;
y2[i] = iy2;
sc[i] = iscore;
areas[i] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = std::max(ix1, x1[pos]);
auto yy1 = std::max(iy1, y1[pos]);
auto xx2 = std::min(ix2, x2[pos]);
auto yy2 = std::min(iy2, y2[pos]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[pos] - inter);
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = std::exp(-(ovr * ovr) / sigma);
}
sc[pos] *= weight;
// if box score falls below threshold, discard the box by
// swapping with last box update N
if (sc[pos] < min_score) {
x1[pos] = x1[nboxes - 1];
y1[pos] = y1[nboxes - 1];
x2[pos] = x2[nboxes - 1];
y2[pos] = y2[nboxes - 1];
sc[pos] = sc[nboxes - 1];
areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
return inds_t.slice(0, 0, nboxes);
}
Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset);
REGISTER_DEVICE_IMPL(softnms_impl, CPU, softnms_cpu);
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
std::vector<int> keep;
std::vector<std::vector<int> > matched;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
keep.push_back(i);
std::vector<int> v_i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
v_i.push_back(j);
}
}
matched.push_back(v_i);
}
for (size_t i = 0; i < keep.size(); i++)
matched[i].insert(matched[i].begin(), keep[i]);
return matched;
}
std::vector<std::vector<int> > nms_match_impl(Tensor dets, float iou_threshold);
REGISTER_DEVICE_IMPL(nms_match_impl, CPU, nms_match_cpu);
......@@ -11,9 +11,9 @@ Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
// however, the code in this function is much shorter because
// we delegate the IoU computation for rotated boxes to
// the single_box_iou_rotated function in box_iou_rotated_utils.h
AT_ASSERTM(!dets.is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.scalar_type() == scores.scalar_type(),
"dets should have the same type as scores");
if (dets.numel() == 0) {
......