Unverified Commit 230f9a3b authored by q.yao, committed by GitHub

Refactor csrc with device dispatcher (#1463)

* Add device registry for pytorch ops

* add declaration of CheckDeviceConsistency

* fix for torch130

* assert with torch check

* Refactor ops with dispatch

* update rest ops

* faster install

* update compatibility

* update compatibility, rename parameter

* move cpu implement to pytorch/cpu

* update ops/csrc/README.md

* fix rocm support

* update cn document

* update docs

* list instead of map
parent ef8ba752
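For readers skimming the diff below: the heart of this refactor is a small device registry that replaces the per-backend #ifdef MMCV_WITH_CUDA branches in the op front-ends. The following is a minimal, hypothetical sketch of the pattern (the op name my_op and its body are invented for illustration); the macro usage mirrors what appears in the changed files, while the macro definitions themselves live in pytorch_device_registry.hpp, which is not shown in this excerpt.

// Sketch only: a hypothetical op wired through the new dispatcher.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Device-agnostic entry point. DISPATCH_DEVICE_IMPL inspects the device of
// the tensor arguments and forwards to whichever implementation has been
// registered for that device.
void my_op_impl(Tensor input, Tensor output) {
  DISPATCH_DEVICE_IMPL(my_op_impl, input, output);
}

// Backend implementation; after this PR, CPU code of this kind lives under
// ops/csrc/pytorch/cpu/.
void my_op_cpu(Tensor input, Tensor output) {
  output.copy_(input);  // placeholder body for the sketch
}

// Bind the CPU implementation to the dispatcher key. In the real ops the
// *_impl definition and the registration sit in separate files; they are
// shown together here only for brevity.
REGISTER_DEVICE_IMPL(my_op_impl, CPU, my_op_cpu);

With the registry in place, op front-ends such as deform_roi_pool_forward further down in this diff simply call the corresponding *_impl function and no longer need any device branching.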
// Copyright (c) OpenMMLab. All rights reserved
// It is modified from https://github.com/WenmuZhou/PAN.pytorch
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
std::vector<std::vector<float>> estimate_confidence(int32_t* label,
float* score, int label_num,
int height, int width) {
std::vector<std::vector<float>> point_vector;
for (int i = 0; i < label_num; i++) {
std::vector<float> point;
point.push_back(0);
point.push_back(0);
point_vector.push_back(point);
}
for (int y = 0; y < height; y++) {
auto label_tmp = label + y * width;
auto score_tmp = score + y * width;
for (int x = 0; x < width; x++) {
auto l = label_tmp[x];
if (l > 0) {
float confidence = score_tmp[x];
point_vector[l].push_back(x);
point_vector[l].push_back(y);
point_vector[l][0] += confidence;
point_vector[l][1] += 1;
}
}
}
for (size_t l = 0; l < point_vector.size(); l++)
if (point_vector[l][1] > 0) {
point_vector[l][0] /= point_vector[l][1];
}
return point_vector;
}
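// Group text pixels around their kernels (PANet post-processing): first
// average the embedding of every kernel region, then grow each region with a
// BFS that starts from the kernel contour pixels and absorbs a neighbouring
// masked, unlabelled pixel whenever its embedding lies within dis_threshold
// (L2 distance) of the kernel's mean embedding.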
std::vector<std::vector<float>> pixel_group_cpu(
Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
assert(score.dim() == 2);
assert(mask.dim() == 2);
assert(embedding.dim() == 3);
int height = score.size(0);
int width = score.size(1);
assert(height == mask.size(0) && height == embedding.size(0) && height == kernel_label.size(0));
assert(width == mask.size(1) && width == embedding.size(1) && width == kernel_label.size(1));
auto threshold_square = dis_threshold * dis_threshold;
auto ptr_score = score.data_ptr<float>();
auto ptr_mask = mask.data_ptr<bool>();
auto ptr_kernel_contour = kernel_contour.data_ptr<uint8_t>();
auto ptr_embedding = embedding.data_ptr<float>();
auto ptr_kernel_label = kernel_label.data_ptr<int32_t>();
std::queue<std::tuple<int, int, int32_t>> contour_pixels;
auto embedding_dim = embedding.size(2);
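// kernel_vector[l] accumulates the embedding sum of kernel l in its first
// embedding_dim entries and the member-pixel count in the last entry; the sums
// are normalised to means after the scan below.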
std::vector<std::vector<float>> kernel_vector(
kernel_region_num, std::vector<float>(embedding_dim + 1, 0));
Tensor text_label;
text_label = kernel_label.clone();
auto ptr_text_label = text_label.data_ptr<int32_t>();
for (int i = 0; i < height; i++) {
auto ptr_embedding_tmp = ptr_embedding + i * width * embedding_dim;
auto ptr_kernel_label_tmp = ptr_kernel_label + i * width;
auto ptr_kernel_contour_tmp = ptr_kernel_contour + i * width;
for (int j = 0, k = 0; j < width && k < width * embedding_dim;
j++, k += embedding_dim) {
int32_t label = ptr_kernel_label_tmp[j];
if (label > 0) {
for (int d = 0; d < embedding_dim; d++)
kernel_vector[label][d] += ptr_embedding_tmp[k + d];
kernel_vector[label][embedding_dim] += 1;
// kernel pixel number
if (ptr_kernel_contour_tmp[j]) {
contour_pixels.push(std::make_tuple(i, j, label));
}
}
}
}
for (int i = 0; i < kernel_region_num; i++) {
for (int j = 0; j < embedding_dim; j++) {
kernel_vector[i][j] /= kernel_vector[i][embedding_dim];
}
}
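// 4-connected neighbourhood offsets used by the BFS region growing below.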
int dx[4] = {-1, 1, 0, 0};
int dy[4] = {0, 0, -1, 1};
while (!contour_pixels.empty()) {
auto query_pixel = contour_pixels.front();
contour_pixels.pop();
int y = std::get<0>(query_pixel);
int x = std::get<1>(query_pixel);
int32_t l = std::get<2>(query_pixel);
auto kernel_cv = kernel_vector[l];
for (int idx = 0; idx < 4; idx++) {
int tmpy = y + dy[idx];
int tmpx = x + dx[idx];
auto ptr_text_label_tmp = ptr_text_label + tmpy * width;
if (tmpy < 0 || tmpy >= height || tmpx < 0 || tmpx >= width) continue;
if (!ptr_mask[tmpy * width + tmpx] || ptr_text_label_tmp[tmpx] > 0)
continue;
float dis = 0;
auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;
for (size_t i = 0; i < embedding_dim; i++) {
dis +=
pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);
// ignore further computing if dis is big enough
if (dis >= threshold_square) break;
}
if (dis >= threshold_square) continue;
contour_pixels.push(std::make_tuple(tmpy, tmpx, l));
ptr_text_label_tmp[tmpx] = l;
}
}
return estimate_confidence(ptr_text_label, ptr_score, kernel_region_num,
height, width);
}
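// pixel_group_impl is the device-agnostic entry point used by the dispatcher;
// the CPU implementation above is registered for it below.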
std::vector<std::vector<float>> pixel_group_impl(
Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
Tensor kernel_contour, int kernel_region_num, float dis_threshold);
REGISTER_DEVICE_IMPL(pixel_group_impl, CPU, pixel_group_cpu);
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
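// CPU kernels for PSAMask. A (h_mask x w_mask) attention window is centred at
// each feature position (h, w), and only the part of the window that falls
// inside the feature map is copied. In COLLECT mode the buffer is indexed by
// (n, mask position mapped to feature coordinates, h, w); DISTRIBUTE mode
// swaps the two spatial index groups.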
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor buffer_diff,
Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w];
}
}
}
}
}
}
void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const Tensor buffer_diff, Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
}
}
}
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
}
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (psa_type == 0)
psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output, grad_input);
else
psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output,
grad_input);
}
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
REGISTER_DEVICE_IMPL(psamask_forward_impl, CPU, psamask_forward_cpu);
REGISTER_DEVICE_IMPL(psamask_backward_impl, CPU, psamask_backward_cpu);
@@ -5,6 +5,7 @@
#include <ATen/TensorUtils.h>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
// implementation taken from Caffe2
template <typename T>
@@ -429,3 +430,37 @@ void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
n_stride, c_stride, h_stride, w_stride);
});
}
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
REGISTER_DEVICE_IMPL(roi_align_forward_impl, CPU, roi_align_forward_cpu);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, CPU, roi_align_backward_cpu);
@@ -5,6 +5,7 @@
#include <ATen/TensorUtils.h>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
// implementation taken from Caffe2
template <typename T>
@@ -415,3 +416,43 @@ void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
n_stride, c_stride, h_stride, w_stride);
});
}
void roi_align_rotated_forward_cpu(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
ROIAlignRotatedForwardCPULauncher(input, rois, output, aligned_height,
aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
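// Rotated RoIs are expected to have 6 values per row:
// (batch_index, center_x, center_y, width, height, angle).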
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
ROIAlignRotatedBackwardCPULauncher(
top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sample_ratio,
bool aligned, bool clockwise);
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sample_ratio, bool aligned,
bool clockwise);
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CPU,
roi_align_rotated_forward_cpu);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CPU,
roi_align_rotated_backward_cpu);
// Copyright (c) OpenMMLab. All rights reserved.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T, typename T_int>
void dynamic_voxelize_forward_cpu_kernel(
@@ -150,3 +151,20 @@ int hard_voxelize_forward_cpu(const at::Tensor& points, at::Tensor& voxels,
return voxel_num;
}
int hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors,
at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim);
void dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim);
REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CPU,
hard_voxelize_forward_cpu);
REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CPU,
dynamic_voxelize_forward_cpu);
...@@ -7,8 +7,8 @@ ...@@ -7,8 +7,8 @@
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious, void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) { const int mode_flag, const bool aligned) {
using scalar_t = float; using scalar_t = float;
AT_ASSERTM(boxes1.type().is_cuda(), "boxes1 must be a CUDA tensor"); AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
AT_ASSERTM(boxes2.type().is_cuda(), "boxes2 must be a CUDA tensor"); AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");
int output_size = ious.numel(); int output_size = ious.numel();
int num_boxes1 = boxes1.size(0); int num_boxes1 = boxes1.size(0);
......
This diff is collapsed.
...@@ -2,8 +2,9 @@ ...@@ -2,8 +2,9 @@
#include "deform_conv_cuda_kernel.cuh" #include "deform_conv_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels, void deformable_im2col_cuda(Tensor data_im, Tensor data_offset,
const int height, const int width, const int ksize_h, const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
...@@ -35,8 +36,9 @@ void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels, ...@@ -35,8 +36,9 @@ void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels,
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels, void deformable_col2im_cuda(Tensor data_col, Tensor data_offset,
const int height, const int width, const int ksize_h, const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
...@@ -68,7 +70,7 @@ void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels, ...@@ -68,7 +70,7 @@ void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels,
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void deformable_col2im_coord( void deformable_col2im_coord_cuda(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels, Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w, const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w,
......
...@@ -14,8 +14,8 @@ inline int opt_n_threads(int work_size) { ...@@ -14,8 +14,8 @@ inline int opt_n_threads(int work_size) {
} }
void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m, void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
const float *dataset, const float* dataset,
float *temp, int *idxs) { float* temp, int* idxs) {
// dataset: (B, N, 3) // dataset: (B, N, 3)
// tmp: (B, N) // tmp: (B, N)
// output: // output:
...@@ -79,7 +79,7 @@ void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m, ...@@ -79,7 +79,7 @@ void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
} }
void FurthestPointSamplingWithDistForwardCUDAKernelLauncher( void FurthestPointSamplingWithDistForwardCUDAKernelLauncher(
int b, int n, int m, const float *dataset, float *temp, int *idxs) { int b, int n, int m, const float* dataset, float* temp, int* idxs) {
// dataset: (B, N, N) // dataset: (B, N, N)
// temp: (B, N) // temp: (B, N)
// output: // output:
......
...@@ -6,7 +6,7 @@ void modulated_deformable_im2col_cuda( ...@@ -6,7 +6,7 @@ void modulated_deformable_im2col_cuda(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask, const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im, const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col, const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h, const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col) { const int dilation_w, const int deformable_group, Tensor data_col) {
// num_axes should be smaller than block size // num_axes should be smaller than block size
...@@ -24,7 +24,7 @@ void modulated_deformable_im2col_cuda( ...@@ -24,7 +24,7 @@ void modulated_deformable_im2col_cuda(
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>( at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_im_, data_offset_, data_mask_, height_im, num_kernels, data_im_, data_offset_, data_mask_, height_im,
width_im, kernel_h, kenerl_w, pad_h, pad_w, stride_h, stride_w, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, batch_size, dilation_h, dilation_w, channel_per_deformable_group, batch_size,
channels, deformable_group, height_col, width_col, data_col_); channels, deformable_group, height_col, width_col, data_col_);
})); }));
......
...@@ -232,14 +232,12 @@ at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value, ...@@ -232,14 +232,12 @@ at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value,
AT_ASSERTM(attn_weight.is_contiguous(), AT_ASSERTM(attn_weight.is_contiguous(),
"attn_weight tensor has to be contiguous"); "attn_weight tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
"spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.is_cuda(),
AT_ASSERTM(level_start_index.type().is_cuda(),
"level_start_index must be a CUDA tensor"); "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
"sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
const int batch = value.size(0); const int batch = value.size(0);
const int spatial_size = value.size(1); const int spatial_size = value.size(1);
...@@ -268,17 +266,18 @@ at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value, ...@@ -268,17 +266,18 @@ at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value,
for (int n = 0; n < batch / im2col_step_; ++n) { for (int n = 0; n < batch / im2col_step_; ++n) {
auto columns = output_n.select(0, n); auto columns = output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES(
value.type(), "ms_deform_attn_forward_cuda", ([&] { value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
ms_deformable_im2col_cuda( ms_deformable_im2col_cuda(
at::cuda::getCurrentCUDAStream(), at::cuda::getCurrentCUDAStream(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size, value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(), level_start_index.data<int64_t>(), spatial_shapes.data_ptr<int64_t>(),
sampling_loc.data<scalar_t>() + level_start_index.data_ptr<int64_t>(),
sampling_loc.data_ptr<scalar_t>() +
n * im2col_step_ * per_sample_loc_size, n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + attn_weight.data_ptr<scalar_t>() +
n * im2col_step_ * per_attn_weight_size, n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, batch_n, spatial_size, num_heads, channels, num_levels, num_query,
num_point, columns.data<scalar_t>()); num_point, columns.data_ptr<scalar_t>());
})); }));
} }
...@@ -305,15 +304,13 @@ void ms_deform_attn_cuda_backward( ...@@ -305,15 +304,13 @@ void ms_deform_attn_cuda_backward(
AT_ASSERTM(grad_output.is_contiguous(), AT_ASSERTM(grad_output.is_contiguous(),
"grad_output tensor has to be contiguous"); "grad_output tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
"spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.is_cuda(),
AT_ASSERTM(level_start_index.type().is_cuda(),
"level_start_index must be a CUDA tensor"); "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
"sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
const int batch = value.size(0); const int batch = value.size(0);
const int spatial_size = value.size(1); const int spatial_size = value.size(1);
...@@ -340,21 +337,24 @@ void ms_deform_attn_cuda_backward( ...@@ -340,21 +337,24 @@ void ms_deform_attn_cuda_backward(
for (int n = 0; n < batch / im2col_step_; ++n) { for (int n = 0; n < batch / im2col_step_; ++n) {
auto grad_output_g = grad_output_n.select(0, n); auto grad_output_g = grad_output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES(
value.type(), "ms_deform_attn_backward_cuda", ([&] { value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
ms_deformable_col2im_cuda( ms_deformable_col2im_cuda(
at::cuda::getCurrentCUDAStream(), grad_output_g.data<scalar_t>(), at::cuda::getCurrentCUDAStream(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size, grad_output_g.data_ptr<scalar_t>(),
spatial_shapes.data<int64_t>(), level_start_index.data<int64_t>(), value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
sampling_loc.data<scalar_t>() + spatial_shapes.data_ptr<int64_t>(),
level_start_index.data_ptr<int64_t>(),
sampling_loc.data_ptr<scalar_t>() +
n * im2col_step_ * per_sample_loc_size, n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + attn_weight.data_ptr<scalar_t>() +
n * im2col_step_ * per_attn_weight_size, n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, batch_n, spatial_size, num_heads, channels, num_levels, num_query,
num_point, num_point,
grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size, grad_value.data_ptr<scalar_t>() +
grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_value_size,
grad_sampling_loc.data_ptr<scalar_t>() +
n * im2col_step_ * per_sample_loc_size, n * im2col_step_ * per_sample_loc_size,
grad_attn_weight.data<scalar_t>() + grad_attn_weight.data_ptr<scalar_t>() +
n * im2col_step_ * per_attn_weight_size); n * im2col_step_ * per_attn_weight_size);
})); }));
} }
......
...@@ -8,8 +8,8 @@ Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores, ...@@ -8,8 +8,8 @@ Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
const Tensor order_t, const Tensor dets_sorted, const Tensor order_t, const Tensor dets_sorted,
float iou_threshold, const int multi_label) { float iou_threshold, const int multi_label) {
// using scalar_t = float; // using scalar_t = float;
AT_ASSERTM(dets.type().is_cuda(), "dets must be a CUDA tensor"); AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
AT_ASSERTM(scores.type().is_cuda(), "scores must be a CUDA tensor"); AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(dets.device()); at::cuda::CUDAGuard device_guard(dets.device());
int dets_num = dets.size(0); int dets_num = dets.size(0);
...@@ -24,21 +24,22 @@ Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores, ...@@ -24,21 +24,22 @@ Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
dets_sorted.type(), "nms_rotated_kernel_cuda", [&] { dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {
nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>( nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num, iou_threshold, dets_sorted.data<scalar_t>(), dets_num, iou_threshold, dets_sorted.data_ptr<scalar_t>(),
(unsigned long long*)mask.data<int64_t>(), multi_label); (unsigned long long*)mask.data_ptr<int64_t>(), multi_label);
}); });
Tensor mask_cpu = mask.to(at::kCPU); Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host = (unsigned long long*)mask_cpu.data<int64_t>(); unsigned long long* mask_host =
(unsigned long long*)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv(col_blocks); std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
Tensor keep = Tensor keep =
at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU)); at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data<int64_t>(); int64_t* keep_out = keep.data_ptr<int64_t>();
int num_to_keep = 0; int num_to_keep = 0;
for (int i = 0; i < dets_num; i++) { for (int i = 0; i < dets_num; i++) {
......
...@@ -9,10 +9,10 @@ void ROIAlignRotatedForwardCUDAKernelLauncher( ...@@ -9,10 +9,10 @@ void ROIAlignRotatedForwardCUDAKernelLauncher(
const int pooled_height, const int pooled_width, at::Tensor output) { const int pooled_height, const int pooled_width, at::Tensor output) {
const int output_size = num_rois * pooled_height * pooled_width * channels; const int output_size = num_rois * pooled_height * pooled_width * channels;
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.type(), "ROIAlignRotatedLaucherForward", ([&] { features.scalar_type(), "ROIAlignRotatedLaucherForward", ([&] {
const scalar_t *bottom_data = features.data<scalar_t>(); const scalar_t *bottom_data = features.data_ptr<scalar_t>();
const scalar_t *rois_data = rois.data<scalar_t>(); const scalar_t *rois_data = rois.data_ptr<scalar_t>();
scalar_t *top_data = output.data<scalar_t>(); scalar_t *top_data = output.data_ptr<scalar_t>();
roi_align_rotated_forward_cuda_kernel<scalar_t> roi_align_rotated_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
...@@ -31,10 +31,10 @@ void ROIAlignRotatedBackwardCUDAKernelLauncher( ...@@ -31,10 +31,10 @@ void ROIAlignRotatedBackwardCUDAKernelLauncher(
const int pooled_height, const int pooled_width, at::Tensor bottom_grad) { const int pooled_height, const int pooled_width, at::Tensor bottom_grad) {
const int output_size = num_rois * pooled_height * pooled_width * channels; const int output_size = num_rois * pooled_height * pooled_width * channels;
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.type(), "ROIAlignLaucherBackward", ([&] { top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] {
const scalar_t *top_diff = top_grad.data<scalar_t>(); const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
const scalar_t *rois_data = rois.data<scalar_t>(); const scalar_t *rois_data = rois.data_ptr<scalar_t>();
scalar_t *bottom_diff = bottom_grad.data<scalar_t>(); scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();
roi_align_rotated_backward_cuda_kernel<scalar_t> roi_align_rotated_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
output_size, top_diff, rois_data, spatial_scale, sample_num, output_size, top_diff, rois_data, spatial_scale, sample_num,
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cuda_helper.hpp"
#include "pytorch_device_registry.hpp"
#include "tin_shift_cuda_kernel.cuh"
void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,
......
...@@ -6,8 +6,8 @@ ...@@ -6,8 +6,8 @@
#include "voxelization_cuda_kernel.cuh" #include "voxelization_cuda_kernel.cuh"
int HardVoxelizeForwardCUDAKernelLauncher( int HardVoxelizeForwardCUDAKernelLauncher(
const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors, const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size, at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const int max_points, const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim = 3) { const int max_voxels, const int NDim = 3) {
// current version tooks about 0.04s for one frame on cpu // current version tooks about 0.04s for one frame on cpu
...@@ -146,7 +146,7 @@ int HardVoxelizeForwardCUDAKernelLauncher( ...@@ -146,7 +146,7 @@ int HardVoxelizeForwardCUDAKernelLauncher(
} }
void DynamicVoxelizeForwardCUDAKernelLauncher( void DynamicVoxelizeForwardCUDAKernelLauncher(
const at::Tensor& points, at::Tensor& coors, const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size, const std::vector<float> coors_range, const std::vector<float> voxel_size, const std::vector<float> coors_range,
const int NDim = 3) { const int NDim = 3) {
// current version tooks about 0.04s for one frame on cpu // current version tooks about 0.04s for one frame on cpu
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col);
void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im);
void deformable_col2im_coord(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset);
#endif
void deformable_im2col_cpu(Tensor data_im, Tensor data_offset,
const int channels, const int height, const int channels, const int height,
const int width, const int ksize_h, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, const int parallel_imgs, const int deformable_group,
Tensor data_col); Tensor data_col) {
DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels,
height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, parallel_imgs,
deformable_group, data_col);
}
void deformable_col2im_cpu(Tensor data_col, Tensor data_offset, void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
const int channels, const int height, const int channels, const int height,
const int width, const int ksize_h, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, const int parallel_imgs, const int deformable_group,
Tensor grad_im); Tensor grad_im) {
DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels,
height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, parallel_imgs,
deformable_group, grad_im);
}
void deformable_col2im_coord_cpu( void deformable_col2im_coord_impl(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels, Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w, const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs, const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset); const int deformable_group, Tensor grad_offset) {
DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im,
data_offset, channels, height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
parallel_imgs, deformable_group, grad_offset);
}
void deform_conv_shape_check(at::Tensor input, at::Tensor offset, void deform_conv_shape_check(at::Tensor input, at::Tensor offset,
at::Tensor *gradOutput, at::Tensor weight, int kH, at::Tensor *gradOutput, at::Tensor weight, int kH,
...@@ -227,17 +216,9 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset, ...@@ -227,17 +216,9 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
output_buffer.size(2), output_buffer.size(3)}); output_buffer.size(2), output_buffer.size(3)});
for (int elt = 0; elt < batchSize / im2col_step; elt++) { for (int elt = 0; elt < batchSize / im2col_step; elt++) {
if (input.device().is_cuda()) { deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
#ifdef MMCV_WITH_CUDA
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
#endif
} else {
deformable_im2col_cpu(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns); dilationW, im2col_step, deformable_group, columns);
}
columns = columns.view({group, columns.size(0) / group, columns.size(1)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1), weight = weight.view({group, weight.size(0) / group, weight.size(1),
...@@ -373,29 +354,15 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput, ...@@ -373,29 +354,15 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
{gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
if (input.device().is_cuda()) { deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,
#ifdef MMCV_WITH_CUDA
deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW, dH,
dW, dilationH, dilationW, im2col_step,
deformable_group, gradOffset[elt]);
deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group,
gradInput[elt]);
#endif
} else {
deformable_col2im_coord_cpu(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW, inputHeight, inputWidth, kH, kW, padH, padW,
dH, dW, dilationH, dilationW, im2col_step, dH, dW, dilationH, dilationW, im2col_step,
deformable_group, gradOffset[elt]); deformable_group, gradOffset[elt]);
deformable_col2im_cpu(columns, offset[elt], nInputPlane, inputHeight, deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, dilationW, im2col_step, deformable_group,
gradInput[elt]); gradInput[elt]);
}
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)}); weight.size(3), weight.size(4)});
...@@ -508,17 +475,9 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset, ...@@ -508,17 +475,9 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset,
deformable_group * 2 * kH * kW, outputHeight, outputWidth}); deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) { for (int elt = 0; elt < batchSize / im2col_step; elt++) {
if (input.device().is_cuda()) { deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
#ifdef MMCV_WITH_CUDA
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns); dilationW, im2col_step, deformable_group, columns);
#endif
} else {
deformable_im2col_cpu(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
}
// divide into group // divide into group
gradOutputBuffer = gradOutputBuffer.view( gradOutputBuffer = gradOutputBuffer.view(
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor offset, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma);
void DeformRoIPoolBackwardCUDAKernelLauncher(
Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float gamma);
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, Tensor output, int pooled_height,
int pooled_width, float spatial_scale, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) { int sampling_ratio, float gamma) {
DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output, DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,
pooled_height, pooled_width, output, pooled_height, pooled_width, spatial_scale,
spatial_scale, sampling_ratio, gamma); sampling_ratio, gamma);
} }
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input, void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
Tensor rois, Tensor offset, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, Tensor grad_input, Tensor grad_offset,
int pooled_height, int pooled_width, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float spatial_scale, int sampling_ratio,
float gamma) { float gamma) {
DeformRoIPoolBackwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma); pooled_width, spatial_scale, sampling_ratio, gamma);
} }
#endif
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset, void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, int pooled_width, Tensor output, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float spatial_scale, int sampling_ratio,
float gamma) { float gamma) {
if (input.device().is_cuda()) { deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(output);
deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
pooled_width, spatial_scale, sampling_ratio, pooled_width, spatial_scale, sampling_ratio,
gamma); gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
} }
void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois, void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
...@@ -61,22 +36,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois, ...@@ -61,22 +36,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
Tensor grad_offset, int pooled_height, Tensor grad_offset, int pooled_height,
int pooled_width, float spatial_scale, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) { int sampling_ratio, float gamma) {
if (grad_output.device().is_cuda()) { deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_offset);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width, grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma); spatial_scale, sampling_ratio, gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
} }
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha);
void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight,
Tensor grad_input,
const float gamma,
const float alpha);
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input,
const float gamma,
const float alpha);
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) { Tensor output, float gamma, float alpha) {
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output, DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,
gamma, alpha); output, gamma, alpha);
} }
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target, void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor grad_input, Tensor weight, Tensor grad_input,
float gamma, float alpha) { float gamma, float alpha) {
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input, DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,
gamma, alpha); grad_input, gamma, alpha);
} }
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight, void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) { Tensor output, float gamma, float alpha) {
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output, DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,
gamma, alpha); output, gamma, alpha);
} }
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target, void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor buff, Tensor weight, Tensor buff,
Tensor grad_input, float gamma, Tensor grad_input, float gamma,
float alpha) { float alpha) {
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff, DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,
grad_input, gamma, alpha); buff, grad_input, gamma, alpha);
} }
#endif
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight, void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) { Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) { sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
} }
void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight, void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma, float alpha) { Tensor grad_input, float gamma, float alpha) {
if (input.device().is_cuda()) { sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_input);
sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
alpha); alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
} }
void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight, void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) { Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) { softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
softmax_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
} }
void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight, void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input, float gamma, Tensor buff, Tensor grad_input, float gamma,
float alpha) { float alpha) {
if (input.device().is_cuda()) { softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(buff);
CHECK_CUDA_INPUT(grad_input);
softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
gamma, alpha); gamma, alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
} }
@@ -2,61 +2,33 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void furthest_point_sampling_forward_impl(Tensor points_tensor,
void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m, Tensor temp_tensor, Tensor idx_tensor,
const float *dataset, int b, int n, int m) {
float *temp, int *idxs); DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,
temp_tensor, idx_tensor, b, n, m);
void furthest_point_sampling_forward_cuda(int b, int n, int m,
const float *dataset, float *temp,
int *idxs) {
FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, dataset, temp, idxs);
} }
void FurthestPointSamplingWithDistForwardCUDAKernelLauncher( void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
int b, int n, int m, const float *dataset, float *temp, int *idxs); Tensor temp_tensor,
Tensor idx_tensor, int b,
void furthest_point_sampling_with_dist_forward_cuda(int b, int n, int m, int n, int m) {
const float *dataset, DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,
float *temp, int *idxs) { points_tensor, temp_tensor, idx_tensor, b, n, m);
FurthestPointSamplingWithDistForwardCUDAKernelLauncher(b, n, m, dataset, temp,
idxs);
} }
#endif
void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor, void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
Tensor idx_tensor, int b, int n, int m) { Tensor idx_tensor, int b, int n, int m) {
if (points_tensor.device().is_cuda()) { furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA b, n, m);
const float *points = points_tensor.data_ptr<float>();
float *temp = temp_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
furthest_point_sampling_forward_cuda(b, n, m, points, temp, idx);
#else
AT_ERROR("furthest_point_sampling is not compiled with GPU support");
#endif
} else {
AT_ERROR("furthest_point_sampling is not implemented on CPU");
}
} }
void furthest_point_sampling_with_dist_forward(Tensor points_tensor, void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
Tensor temp_tensor, Tensor temp_tensor,
Tensor idx_tensor, int b, int n, Tensor idx_tensor, int b, int n,
int m) { int m) {
if (points_tensor.device().is_cuda()) { furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,
#ifdef MMCV_WITH_CUDA idx_tensor, b, n, m);
const float *points = points_tensor.data<float>();
float *temp = temp_tensor.data<float>();
int *idx = idx_tensor.data<int>();
furthest_point_sampling_with_dist_forward_cuda(b, n, m, points, temp, idx);
#else
AT_ERROR(
"furthest_point_sampling_with_dist is not compiled with GPU support");
#endif
} else {
AT_ERROR("furthest_point_sampling_with_dist is not implemented on CPU");
}
} }