Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void ModulatedDeformConvForwardCUDAKernelLauncher(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias);
void ModulatedDeformConvBackwardCUDAKernelLauncher(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias);
void modulated_deform_conv_forward_cuda(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias) {
ModulatedDeformConvForwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
}
void modulated_deform_conv_backward_cuda(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
ModulatedDeformConvBackwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
}
#endif
void modulated_deform_conv_forward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(columns);
modulated_deform_conv_forward_cuda(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
group, deformable_group, with_bias);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("ModulatedDeformConv is not implemented on CPU");
}
}
void modulated_deform_conv_backward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(grad_offset);
CHECK_CUDA_INPUT(grad_mask);
CHECK_CUDA_INPUT(grad_output);
modulated_deform_conv_backward_cuda(
input, weight, bias, ones, offset, mask, columns, grad_input,
grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
group, deformable_group, with_bias);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("ModulatedDeformConv is not implemented on CPU");
}
}
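// A shape sketch for the entry points above (a hedged sketch: the channel
// counts follow the usual modulated deformable convolution (DCNv2) convention
// and match what the CUDA launcher indexes into):
//   input  : (N, C_in, H, W)
//   weight : (C_out, C_in / group, kernel_h, kernel_w)
//   offset : (N, 2 * deformable_group * kernel_h * kernel_w, H_out, W_out)
//   mask   : (N, deformable_group * kernel_h * kernel_w, H_out, W_out)
// with the output spatial size computed exactly as in the launcher:
static inline int conv_out_size_sketch(int in_size, int pad, int dilation,
                                       int kernel, int stride) {
  return (in_size + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}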
#include "modulated_deform_conv_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void modulated_deformable_im2col_cuda(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col) {
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
modulated_deformable_im2col_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_im_, data_offset_, data_mask_, height_im,
width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, batch_size,
channels, deformable_group, height_col, width_col, data_col_);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
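// GET_BLOCKS above is assumed to be the usual ceil-division grid-size helper
// from pytorch_cuda_helper.hpp; conceptually, with one thread per element, it
// behaves like this sketch:
static inline int get_blocks_sketch(int num_kernels,
                                    int threads_per_block = THREADS_PER_BLOCK) {
  return (num_kernels + threads_per_block - 1) / threads_per_block;
}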
void modulated_deformable_col2im_cuda(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im) {
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels =
channels * kernel_h * kernel_w * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
modulated_deformable_col2im_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_col_, data_offset_, data_mask_, channels,
height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, channel_per_deformable_group,
batch_size, deformable_group, height_col, width_col, grad_im_);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void modulated_deformable_col2im_coord_cuda(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask) {
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
kernel_w * deformable_group;
const int channel_per_deformable_group =
channels * kernel_h * kernel_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
modulated_deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_col_, data_im_, data_offset_, data_mask_,
channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, batch_size,
2 * kernel_h * kernel_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_, grad_mask_);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void ModulatedDeformConvForwardCUDAKernelLauncher(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias) {
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_out = weight.size(0);
const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
kernel_h_, kernel_w, kernel_h_, kernel_w_);
if (channels != channels_kernel * group)
AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
channels, channels_kernel * group);
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = at::ones({height_out, width_out}, input.options());
}
// resize output
output = output.view({batch, channels_out, height_out, width_out}).zero_();
// resize temporary columns
columns =
at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
input.options());
output = output.view({output.size(0), group, output.size(1) / group,
output.size(2), output.size(3)});
for (int b = 0; b < batch; b++) {
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
// divide into group
weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.size(2), weight.size(3)});
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
for (int g = 0; g < group; g++) {
output[b][g] = output[b][g]
.flatten(1)
.addmm_(weight[g].flatten(1), columns[g])
.view_as(output[b][g]);
}
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
}
output = output.view({output.size(0), output.size(1) * output.size(2),
output.size(3), output.size(4)});
if (with_bias) {
output += bias.view({1, bias.size(0), 1, 1});
}
}
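// Shapes in the grouped GEMM above, spelled out (with C_in = channels and
// C_out = channels_out):
//   weight[g].flatten(1)    : (C_out / group, (C_in / group) * kernel_h * kernel_w)
//   columns[g]              : ((C_in / group) * kernel_h * kernel_w, height_out * width_out)
//   output[b][g].flatten(1) : (C_out / group, height_out * width_out)
// so each addmm_ call is an ordinary per-group, per-sample matrix multiply.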
void ModulatedDeformConvBackwardCUDAKernelLauncher(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
kernel_h_, kernel_w, kernel_h_, kernel_w_);
if (channels != channels_kernel * group)
AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
channels, channels_kernel * group);
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = at::ones({height_out, width_out}, input.options());
}
grad_input = grad_input.view({batch, channels, height, width});
columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
input.options());
grad_output =
grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
grad_output.size(2), grad_output.size(3)});
for (int b = 0; b < batch; b++) {
// divide into group
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.size(2), weight.size(3)});
for (int g = 0; g < group; g++) {
columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
grad_output[b][g].flatten(1), 0.0f, 1.0f);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
grad_weight.size(1), grad_weight.size(2),
grad_weight.size(3)});
if (with_bias)
grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
for (int g = 0; g < group; g++) {
grad_weight[g] =
grad_weight[g]
.flatten(1)
.addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
.view_as(grad_weight[g]);
if (with_bias) {
grad_bias[g] =
grad_bias[g]
.view({-1, 1})
.addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
.view(-1);
}
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
grad_weight.size(2), grad_weight.size(3),
grad_weight.size(4)});
if (with_bias)
grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
}
grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
grad_output.size(2), grad_output.size(3),
grad_output.size(4)});
}
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset);
Tensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return NMSCUDAKernelLauncher(boxes, scores, iou_threshold, offset);
}
#endif
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto nboxes = boxes.size(0);
Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));
auto select = select_t.data_ptr<bool>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) select[_j] = false;
}
}
return order_t.masked_select(select_t);
}
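// The `offset` argument selects the box-size convention used throughout this
// file: width = x2 - x1 + offset (offset = 0 for half-open coordinates,
// offset = 1 for pixel-inclusive ones). A standalone sketch of the pairwise
// IoU computed in the inner loop above, for boxes laid out as (x1, y1, x2, y2):
static inline float iou_sketch(const float* a, const float* b, int offset) {
  float area_a = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
  float area_b = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
  float w = std::max(0.f, std::min(a[2], b[2]) - std::max(a[0], b[0]) + offset);
  float h = std::max(0.f, std::min(a[3], b[3]) - std::max(a[1], b[1]) + offset);
  float inter = w * h;
  return inter / (area_a + area_b - inter);
}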
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CUDA_INPUT(scores);
return nms_cuda(boxes, scores, iou_threshold, offset);
#else
AT_ERROR("nms is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(boxes);
CHECK_CPU_INPUT(scores);
return nms_cpu(boxes, scores, iou_threshold, offset);
}
}
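// A minimal usage sketch for the dispatcher above (CPU path; float32 boxes of
// shape (N, 4) in (x1, y1, x2, y2) order, scores of shape (N,)). The returned
// tensor holds the kept indices, ordered by descending score:
static void nms_usage_sketch() {
  Tensor boxes = at::zeros({2, 4});
  boxes[0][2].fill_(10.f);  // box 0: (0, 0, 10, 10)
  boxes[0][3].fill_(10.f);
  boxes[1][0].fill_(1.f);  // box 1: (1, 1, 11, 11)
  boxes[1][1].fill_(1.f);
  boxes[1][2].fill_(11.f);
  boxes[1][3].fill_(11.f);
  Tensor scores = at::ones({2});
  scores[1].fill_(0.5f);
  Tensor keep = nms(boxes, scores, /*iou_threshold=*/0.5f, /*offset=*/0);
  // The two boxes overlap with IoU ~0.68, so only index 0 is kept here.
}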
Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
auto scores_t = scores.clone();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto nboxes = boxes.size(0);
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto sc = scores_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
auto de = dets.data_ptr<float>();
int64_t pos = 0;
Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
auto inds = inds_t.data_ptr<int64_t>();
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < sc[pos]) {
max_score = sc[pos];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = x1[max_pos];
auto iy1 = de[i * 5 + 1] = y1[max_pos];
auto ix2 = de[i * 5 + 2] = x2[max_pos];
auto iy2 = de[i * 5 + 3] = y2[max_pos];
auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = areas[max_pos];
auto iind = inds[max_pos];
x1[max_pos] = x1[i];
y1[max_pos] = y1[i];
x2[max_pos] = x2[i];
y2[max_pos] = y2[i];
sc[max_pos] = sc[i];
areas[max_pos] = areas[i];
inds[max_pos] = inds[i];
x1[i] = ix1;
y1[i] = iy1;
x2[i] = ix2;
y2[i] = iy2;
sc[i] = iscore;
areas[i] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = std::max(ix1, x1[pos]);
auto yy1 = std::max(iy1, y1[pos]);
auto xx2 = std::min(ix2, x2[pos]);
auto yy2 = std::min(iy2, y2[pos]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[pos] - inter);
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = std::exp(-(ovr * ovr) / sigma);
}
sc[pos] *= weight;
// if the box score falls below the threshold, discard the box by swapping
// it with the last box and shrinking nboxes
if (sc[pos] < min_score) {
x1[pos] = x1[nboxes - 1];
y1[pos] = y1[nboxes - 1];
x2[pos] = x2[nboxes - 1];
y2[pos] = y2[nboxes - 1];
sc[pos] = sc[nboxes - 1];
areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
return inds_t.slice(0, 0, nboxes);
}
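// The score-decay rule used in the loop above, isolated for clarity
// (method 0: hard/naive NMS, 1: linear soft-NMS, 2: gaussian soft-NMS):
static inline float softnms_weight_sketch(float ovr, float iou_threshold,
                                          float sigma, int method) {
  if (method == 0) return ovr >= iou_threshold ? 0.f : 1.f;
  if (method == 1) return ovr >= iou_threshold ? 1.f - ovr : 1.f;
  return std::exp(-(ovr * ovr) / sigma);  // method == 2 (gaussian)
}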
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
float sigma, float min_score, int method, int offset) {
if (boxes.device().is_cuda()) {
AT_ERROR("softnms is not implemented on GPU");
} else {
return softnms_cpu(boxes, scores, dets, iou_threshold, sigma, min_score,
method, offset);
}
}
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
std::vector<int> keep;
std::vector<std::vector<int> > matched;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
keep.push_back(i);
std::vector<int> v_i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
v_i.push_back(j);
}
}
matched.push_back(v_i);
}
for (size_t i = 0; i < keep.size(); i++)
matched[i].insert(matched[i].begin(), keep[i]);
return matched;
}
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
if (dets.device().is_cuda()) {
AT_ERROR("nms_match is not implemented on GPU");
} else {
return nms_match_cpu(dets, iou_threshold);
}
}
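// A minimal usage sketch: dets rows are (x1, y1, x2, y2, score) and each inner
// vector of the result starts with a kept index followed by the indices it
// suppressed:
static void nms_match_usage_sketch() {
  Tensor dets = at::zeros({2, 5});
  dets[0][2].fill_(10.f);  // det 0: (0, 0, 10, 10), score 0.9
  dets[0][3].fill_(10.f);
  dets[0][4].fill_(0.9f);
  dets[1][2].fill_(9.f);  // det 1: (0, 0, 9, 9), score 0.8
  dets[1][3].fill_(9.f);
  dets[1][4].fill_(0.8f);
  auto groups = nms_match(dets, /*iou_threshold=*/0.5f);
  // Here groups == {{0, 1}}: det 0 is kept and det 1 is matched to it.
}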
#include "nms_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset) {
at::cuda::CUDAGuard device_guard(boxes.device());
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto order_t = std::get<1>(scores.sort(0, /*descending=*/true));
auto boxes_sorted = boxes.index_select(0, order_t);
int boxes_num = boxes.size(0);
const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
nms_cuda<<<blocks, threads, 0, stream>>>(
boxes_num, iou_threshold, offset, boxes_sorted.data_ptr<float>(),
(unsigned long long*)mask.data_ptr<int64_t>());
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host =
(unsigned long long*)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
at::Tensor keep_t =
at::zeros({boxes_num}, boxes.options().dtype(at::kBool).device(at::kCPU));
bool* keep = keep_t.data_ptr<bool>();
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
keep[i] = true;
// set every overlap box with bit 1 in remv
unsigned long long* p = mask_host + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
AT_CUDA_CHECK(cudaGetLastError());
return order_t.masked_select(keep_t.to(at::kCUDA));
}
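// Layout of the bitmask produced by the kernel above: one row per sorted box,
// col_blocks 64-bit words per row, and bit k of word j in row i is set when
// the pair (box i, box j * threadsPerBlock + k) overlaps beyond the threshold.
// A sketch of the membership test that the decoding loop performs:
static inline bool marked_as_overlapping_sketch(const unsigned long long* row,
                                                int j) {
  return (row[j / threadsPerBlock] >> (j % threadsPerBlock)) & 1ULL;
}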
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
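// Both loops above move values between two flattened 4-D buffers:
//   mask_data   : [num_, h_mask * w_mask, h_feature, w_feature]
//   buffer_data : [num_, h_feature * w_feature, h_feature, w_feature]
// 'collect' writes the mask value of cell (hidx, widx) into the output channel
// that addresses the shifted feature position, while 'distribute' writes it
// into the channel that addresses the current position (h, w). A sketch of the
// channel index used by 'collect', assuming row-major [N, C, H, W] layout:
static inline int collect_out_channel_sketch(int h, int w, int hidx, int widx,
                                             int w_feature, int half_h_mask,
                                             int half_w_mask) {
  return (hidx + h - half_h_mask) * w_feature + (widx + w - half_w_mask);
}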
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor buffer_diff,
Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w];
}
}
}
}
}
}
void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const Tensor buffer_diff, Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
}
}
}
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
}
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (psa_type == 0)
psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output, grad_input);
else
psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output,
grad_input);
}
#ifdef WITH_CUDA
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
Tensor output, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask);
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const Tensor grad_output, Tensor grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask);
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask,
half_w_mask);
}
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_,
h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask);
}
#endif
void psamask_forward(const Tensor input, Tensor output, const int psa_type,
const int num_, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(output);
psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
#else
AT_ERROR("PSAMask is not compiled with GPU support");
#endif
} else {
psamask_forward_cpu(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
}
void psamask_backward(Tensor grad_output, const Tensor grad_input,
const int psa_type, const int num_, const int h_feature,
const int w_feature, const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (grad_input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_output);
psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
#else
AT_ERROR("PSAMask is not compiled with GPU support");
#endif
} else {
psamask_backward_cpu(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
}
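// A minimal CPU usage sketch for the dispatcher above; the shapes follow the
// flattened indexing used in the CPU loops:
//   input  : [num_, h_mask * w_mask, h_feature, w_feature]
//   output : [num_, h_feature * w_feature, h_feature, w_feature]
static void psamask_forward_usage_sketch() {
  const int num_ = 1, h_feature = 4, w_feature = 4;
  const int h_mask = 3, w_mask = 3, half_h_mask = 1, half_w_mask = 1;
  Tensor input = at::rand({num_, h_mask * w_mask, h_feature, w_feature});
  Tensor output =
      at::zeros({num_, h_feature * w_feature, h_feature, w_feature});
  psamask_forward(input, output, /*psa_type=*/0, num_, h_feature, w_feature,
                  h_mask, w_mask, half_h_mask, half_w_mask);
}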
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include <THC/THC.h>
#include <torch/serialize/tensor.h>
#include <THC/THCDeviceUtils.cuh>
#include "psamask_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
Tensor output, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (psa_type == 0)
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "psamask_collect_forward_cuda", [&] {
psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
else
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "psamask_distribute_forward_cuda", [&] {
psamask_distribute_forward_cuda<scalar_t>
<<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
}
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const Tensor grad_output, Tensor grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (psa_type == 0)
AT_DISPATCH_FLOATING_TYPES(
grad_input.scalar_type(), "psamask_collect_backward_cuda", [&] {
psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.data_ptr<scalar_t>(),
grad_input.data_ptr<scalar_t>());
});
else
AT_DISPATCH_FLOATING_TYPES(
grad_input.scalar_type(), "psamask_distribute_backward_cuda", [&] {
psamask_distribute_backward_cuda<scalar_t>
<<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.data_ptr<scalar_t>(),
grad_input.data_ptr<scalar_t>());
});
}
#include "pytorch_cpp_helper.hpp"
std::string get_compiler_version();
std::string get_compiling_cuda_version();
int carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size, int scale_factor);
int carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad, int kernel_size,
int group_size, int scale_factor);
int carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor);
int carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
Tensor mask_grad, int kernel_size, int group_size,
int scale_factor);
int deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
Tensor output, Tensor columns, Tensor ones, int kW,
int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, int im2col_step);
int deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
Tensor gradInput, Tensor gradOffset,
Tensor weight, Tensor columns, int kW, int kH,
int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, int im2col_step);
int deform_conv_backward_parameters(Tensor input, Tensor offset,
Tensor gradOutput, Tensor gradWeight,
Tensor columns, Tensor ones, int kW, int kH,
int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, float scale,
int im2col_step);
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma);
void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
Tensor offset, Tensor grad_input,
Tensor grad_offset, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma);
int sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha);
int sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma, float alpha);
int softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha);
int softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input, float gamma,
float alpha);
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset);
int masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w);
int masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels);
void modulated_deform_conv_forward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias);
void modulated_deform_conv_backward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias);
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset);
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
float sigma, float min_score, int method, int offset);
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold);
void roi_align_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax_y,
Tensor argmax_x, int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned);
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
int pooled_height, int pooled_width, float spatial_scale);
void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height, int pooled_width,
float spatial_scale);
void sync_bn_forward_mean(const Tensor input, Tensor mean);
void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var);
void sync_bn_forward_output(const Tensor input, const Tensor mean,
const Tensor var, Tensor running_mean,
Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std,
Tensor output, float eps, float momentum,
int group_size);
void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
Tensor grad_weight, Tensor grad_bias);
void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
const Tensor grad_weight, const Tensor grad_bias,
const Tensor norm, const Tensor std,
Tensor grad_input);
void ca_forward(const Tensor t, const Tensor f, Tensor weight);
void ca_backward(const Tensor dw, const Tensor t, const Tensor f, Tensor dt,
Tensor df);
void ca_map_forward(const Tensor weight, const Tensor g, Tensor out);
void ca_map_backward(const Tensor dout, const Tensor weight, const Tensor g,
Tensor dw, Tensor dg);
void psamask_forward(const Tensor input, Tensor output, const int psa_type,
const int num_, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward(Tensor grad_output, const Tensor grad_input,
const int psa_type, const int num_, const int h_feature,
const int w_feature, const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
Tensor bottom_pool_forward(Tensor input);
Tensor bottom_pool_backward(Tensor input, Tensor grad_output);
Tensor left_pool_forward(Tensor input);
Tensor left_pool_backward(Tensor input, Tensor grad_output);
Tensor right_pool_forward(Tensor input);
Tensor right_pool_backward(Tensor input, Tensor grad_output);
Tensor top_pool_forward(Tensor input);
Tensor top_pool_backward(Tensor input, Tensor grad_output);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
m.def("get_compiling_cuda_version", &get_compiling_cuda_version,
"get_compiling_cuda_version");
m.def("carafe_naive_forward", &carafe_naive_forward, "carafe_naive_forward",
py::arg("features"), py::arg("masks"), py::arg("output"),
py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor"));
m.def("carafe_naive_backward", &carafe_naive_backward,
"carafe_naive_backward", py::arg("top_grad"), py::arg("features"),
py::arg("masks"), py::arg("bottom_grad"), py::arg("mask_grad"),
py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor"));
m.def("carafe_forward", &carafe_forward, "carafe_forward",
py::arg("features"), py::arg("masks"), py::arg("rfeatures"),
py::arg("routput"), py::arg("rmasks"), py::arg("output"),
py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor"));
m.def("carafe_backward", &carafe_backward, "carafe_backward",
py::arg("top_grad"), py::arg("rfeatures"), py::arg("masks"),
py::arg("rtop_grad"), py::arg("rbottom_grad_hs"),
py::arg("rbottom_grad"), py::arg("rmask_grad"), py::arg("bottom_grad"),
py::arg("mask_grad"), py::arg("kernel_size"), py::arg("group_size"),
py::arg("scale_factor"));
m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward",
py::arg("input"), py::arg("weight"), py::arg("offset"),
py::arg("output"), py::arg("columns"), py::arg("ones"), py::arg("kW"),
py::arg("kH"), py::arg("dW"), py::arg("dH"), py::arg("padH"),
py::arg("padW"), py::arg("dilationW"), py::arg("dilationH"),
py::arg("group"), py::arg("deformable_group"), py::arg("im2col_step"));
m.def("deform_conv_backward_input", &deform_conv_backward_input,
"deform_conv_backward_input", py::arg("input"), py::arg("offset"),
py::arg("gradOutput"), py::arg("gradInput"), py::arg("gradOffset"),
py::arg("weight"), py::arg("columns"), py::arg("kW"), py::arg("kH"),
py::arg("dW"), py::arg("dH"), py::arg("padH"), py::arg("padW"),
py::arg("dilationW"), py::arg("dilationH"), py::arg("group"),
py::arg("deformable_group"), py::arg("im2col_step"));
m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters,
"deform_conv_backward_parameters", py::arg("input"), py::arg("offset"),
py::arg("gradOutput"), py::arg("gradWeight"), py::arg("columns"),
py::arg("ones"), py::arg("kW"), py::arg("kH"), py::arg("dW"),
py::arg("dH"), py::arg("padH"), py::arg("padW"), py::arg("dilationW"),
py::arg("dilationH"), py::arg("group"), py::arg("deformable_group"),
py::arg("scale"), py::arg("im2col_step"));
m.def("deform_roi_pool_forward", &deform_roi_pool_forward,
"deform roi pool forward", py::arg("input"), py::arg("rois"),
py::arg("offset"), py::arg("output"), py::arg("pooled_height"),
py::arg("pooled_width"), py::arg("spatial_scale"),
py::arg("sampling_ratio"), py::arg("gamma"));
m.def("deform_roi_pool_backward", &deform_roi_pool_backward,
"deform roi pool backward", py::arg("grad_output"), py::arg("input"),
py::arg("rois"), py::arg("offset"), py::arg("grad_input"),
py::arg("grad_offset"), py::arg("pooled_height"),
py::arg("pooled_width"), py::arg("spatial_scale"),
py::arg("sampling_ratio"), py::arg("gamma"));
m.def("sigmoid_focal_loss_forward", &sigmoid_focal_loss_forward,
"sigmoid_focal_loss_forward ", py::arg("input"), py::arg("target"),
py::arg("weight"), py::arg("output"), py::arg("gamma"),
py::arg("alpha"));
m.def("sigmoid_focal_loss_backward", &sigmoid_focal_loss_backward,
"sigmoid_focal_loss_backward", py::arg("input"), py::arg("target"),
py::arg("weight"), py::arg("grad_input"), py::arg("gamma"),
py::arg("alpha"));
m.def("softmax_focal_loss_forward", &softmax_focal_loss_forward,
"softmax_focal_loss_forward", py::arg("input"), py::arg("target"),
py::arg("weight"), py::arg("output"), py::arg("gamma"),
py::arg("alpha"));
m.def("softmax_focal_loss_backward", &softmax_focal_loss_backward,
"softmax_focal_loss_backward", py::arg("input"), py::arg("target"),
py::arg("weight"), py::arg("buff"), py::arg("grad_input"),
py::arg("gamma"), py::arg("alpha"));
m.def("bbox_overlaps", &bbox_overlaps, "bbox_overlaps", py::arg("bboxes1"),
py::arg("bboxes2"), py::arg("ious"), py::arg("mode"),
py::arg("aligned"), py::arg("offset"));
m.def("masked_im2col_forward", &masked_im2col_forward,
"masked_im2col_forward", py::arg("im"), py::arg("mask_h_idx"),
py::arg("mask_w_idx"), py::arg("col"), py::arg("kernel_h"),
py::arg("kernel_w"), py::arg("pad_h"), py::arg("pad_w"));
m.def("masked_col2im_forward", &masked_col2im_forward,
"masked_col2im_forward", py::arg("col"), py::arg("mask_h_idx"),
py::arg("mask_w_idx"), py::arg("im"), py::arg("height"),
py::arg("width"), py::arg("channels"));
m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward,
"modulated deform conv forward", py::arg("input"), py::arg("weight"),
py::arg("bias"), py::arg("ones"), py::arg("offset"), py::arg("mask"),
py::arg("output"), py::arg("columns"), py::arg("kernel_h"),
py::arg("kernel_w"), py::arg("stride_h"), py::arg("stride_w"),
py::arg("pad_h"), py::arg("pad_w"), py::arg("dilation_h"),
py::arg("dilation_w"), py::arg("group"), py::arg("deformable_group"),
py::arg("with_bias"));
m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward,
"modulated deform conv backward", py::arg("input"), py::arg("weight"),
py::arg("bias"), py::arg("ones"), py::arg("offset"), py::arg("mask"),
py::arg("columns"), py::arg("grad_input"), py::arg("grad_weight"),
py::arg("grad_bias"), py::arg("grad_offset"), py::arg("grad_mask"),
py::arg("grad_output"), py::arg("kernel_h"), py::arg("kernel_w"),
py::arg("stride_h"), py::arg("stride_w"), py::arg("pad_h"),
py::arg("pad_w"), py::arg("dilation_h"), py::arg("dilation_w"),
py::arg("group"), py::arg("deformable_group"), py::arg("with_bias"));
m.def("nms", &nms, "nms (CPU/CUDA) ", py::arg("boxes"), py::arg("scores"),
py::arg("iou_threshold"), py::arg("offset"));
m.def("softnms", &softnms, "softnms (CPU) ", py::arg("boxes"),
py::arg("scores"), py::arg("dets"), py::arg("iou_threshold"),
py::arg("sigma"), py::arg("min_score"), py::arg("method"),
py::arg("offset"));
m.def("nms_match", &nms_match, "nms_match (CPU) ", py::arg("dets"),
py::arg("iou_threshold"));
m.def("roi_align_forward", &roi_align_forward, "roi_align forward",
py::arg("input"), py::arg("rois"), py::arg("output"),
py::arg("argmax_y"), py::arg("argmax_x"), py::arg("aligned_height"),
py::arg("aligned_width"), py::arg("spatial_scale"),
py::arg("sampling_ratio"), py::arg("pool_mode"), py::arg("aligned"));
m.def("roi_align_backward", &roi_align_backward, "roi_align backward",
py::arg("grad_output"), py::arg("rois"), py::arg("argmax_y"),
py::arg("argmax_x"), py::arg("grad_input"), py::arg("aligned_height"),
py::arg("aligned_width"), py::arg("spatial_scale"),
py::arg("sampling_ratio"), py::arg("pool_mode"), py::arg("aligned"));
m.def("roi_pool_forward", &roi_pool_forward, "roi_pool forward",
py::arg("input"), py::arg("rois"), py::arg("output"), py::arg("argmax"),
py::arg("pooled_height"), py::arg("pooled_width"),
py::arg("spatial_scale"));
m.def("roi_pool_backward", &roi_pool_backward, "roi_pool backward",
py::arg("grad_output"), py::arg("rois"), py::arg("argmax"),
py::arg("grad_input"), py::arg("pooled_height"),
py::arg("pooled_width"), py::arg("spatial_scale"));
m.def("sync_bn_forward_mean", &sync_bn_forward_mean, "sync_bn forward_mean",
py::arg("input"), py::arg("mean"));
m.def("sync_bn_forward_var", &sync_bn_forward_var, "sync_bn forward_var",
py::arg("input"), py::arg("mean"), py::arg("var"));
m.def("sync_bn_forward_output", &sync_bn_forward_output,
"sync_bn forward_output", py::arg("input"), py::arg("mean"),
py::arg("var"), py::arg("running_mean"), py::arg("running_var"),
py::arg("weight"), py::arg("bias"), py::arg("norm"), py::arg("std"),
py::arg("output"), py::arg("eps"), py::arg("momentum"),
py::arg("group_size"));
m.def("sync_bn_backward_param", &sync_bn_backward_param,
"sync_bn backward_param", py::arg("grad_output"), py::arg("norm"),
py::arg("grad_weight"), py::arg("grad_bias"));
m.def("sync_bn_backward_data", &sync_bn_backward_data,
"sync_bn backward_data", py::arg("grad_output"), py::arg("weight"),
py::arg("grad_weight"), py::arg("grad_bias"), py::arg("norm"),
py::arg("std"), py::arg("grad_input"));
m.def("ca_forward", &ca_forward, "ccattention forward", py::arg("t"),
py::arg("f"), py::arg("weight"));
m.def("ca_backward", &ca_backward, "ccattention backward", py::arg("dw"),
py::arg("t"), py::arg("f"), py::arg("dt"), py::arg("df"));
m.def("ca_map_forward", &ca_map_forward, "ccattention map forward",
py::arg("weight"), py::arg("g"), py::arg("out"));
m.def("ca_map_backward", &ca_map_backward, "ccattention map backward",
py::arg("dout"), py::arg("weight"), py::arg("g"), py::arg("dw"),
py::arg("dg"));
m.def("psamask_forward", &psamask_forward, "PSAMASK forward (CPU/CUDA)",
py::arg("input"), py::arg("output"), py::arg("psa_type"),
py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"),
py::arg("h_mask"), py::arg("w_mask"), py::arg("half_h_mask"),
py::arg("half_w_mask"));
m.def("psamask_backward", &psamask_backward, "PSAMASK backward (CPU/CUDA)",
py::arg("grad_output"), py::arg("grad_input"), py::arg("psa_type"),
py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"),
py::arg("h_mask"), py::arg("w_mask"), py::arg("half_h_mask"),
py::arg("half_w_mask"));
m.def("bottom_pool_forward", &bottom_pool_forward, "Bottom Pool Forward",
py::arg("input"), py::call_guard<py::gil_scoped_release>());
m.def("bottom_pool_backward", &bottom_pool_backward, "Bottom Pool Backward",
py::arg("input"), py::arg("grad_output"),
py::call_guard<py::gil_scoped_release>());
m.def("left_pool_forward", &left_pool_forward, "Left Pool Forward",
py::arg("input"), py::call_guard<py::gil_scoped_release>());
m.def("left_pool_backward", &left_pool_backward, "Left Pool Backward",
py::arg("input"), py::arg("grad_output"),
py::call_guard<py::gil_scoped_release>());
m.def("right_pool_forward", &right_pool_forward, "Right Pool Forward",
py::arg("input"), py::call_guard<py::gil_scoped_release>());
m.def("right_pool_backward", &right_pool_backward, "Right Pool Backward",
py::arg("input"), py::arg("grad_output"),
py::call_guard<py::gil_scoped_release>());
m.def("top_pool_forward", &top_pool_forward, "Top Pool Forward",
py::arg("input"), py::call_guard<py::gil_scoped_release>());
m.def("top_pool_backward", &top_pool_backward, "Top Pool Backward",
py::arg("input"), py::arg("grad_output"),
py::call_guard<py::gil_scoped_release>());
}
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCUDAKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}
#endif
void roi_align_forward(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
AT_ERROR("RoIAlign is not implemented on CPU");
}
}
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
if (grad_output.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
CHECK_CUDA_INPUT(grad_input);
roi_align_backward_cuda(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
AT_ERROR("RoIAlign is not implemented on CPU");
}
}
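// A shape sketch for the dispatchers above (the CUDA kernel itself lives in
// roi_align_kernel.cuh and is not shown here). Assuming the usual mmcv
// convention that rois has shape (num_rois, 5) with rows
// (batch_index, x1, y1, x2, y2), the output and both argmax buffers are
// allocated as (num_rois, channels, aligned_height, aligned_width):
static void roi_align_output_shapes_sketch(const Tensor& input,
                                           const Tensor& rois,
                                           int aligned_height,
                                           int aligned_width) {
  Tensor output = at::zeros(
      {rois.size(0), input.size(1), aligned_height, aligned_width},
      input.options());
  Tensor argmax_y = at::zeros_like(output);
  Tensor argmax_x = at::zeros_like(output);
  (void)argmax_y;  // presumably consulted only when pool_mode selects max pooling
  (void)argmax_x;
}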
#include "pytorch_cuda_helper.hpp"
#include "roi_align_kernel.cuh"
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "roi_align_forward_cuda_kernel", [&] {
roi_align_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
argmax_y.data_ptr<scalar_t>(), argmax_x.data_ptr<scalar_t>(),
aligned_height, aligned_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
aligned, channels, height, width);
});
AT_CUDA_CHECK(cudaGetLastError());
}
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "roi_align_backward_cuda_kernel", [&] {
roi_align_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
aligned_height, aligned_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
aligned, channels, height, width);
});
AT_CUDA_CHECK(cudaGetLastError());
}
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height,
int pooled_width, float spatial_scale);
void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale);
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale) {
ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
}
void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input,
pooled_height, pooled_width, spatial_scale);
}
#endif
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
int pooled_height, int pooled_width,
float spatial_scale) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax);
roi_pool_forward_cuda(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
#else
AT_ERROR("RoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("RoIPool is not implemented on CPU");
}
}
void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height, int pooled_width,
float spatial_scale) {
if (grad_output.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(argmax);
CHECK_CUDA_INPUT(grad_input);
roi_pool_backward_cuda(grad_output, rois, argmax, grad_input, pooled_height,
pooled_width, spatial_scale);
#else
AT_ERROR("RoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("RoIPool is not implemented on CPU");
}
}
#include "pytorch_cuda_helper.hpp"
#include "roi_pool_kernel.cuh"
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height,
int pooled_width, float spatial_scale) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "roi_pool_forward_cuda_kernel", [&] {
roi_pool_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
argmax.data_ptr<int>(), pooled_height, pooled_width,
static_cast<scalar_t>(spatial_scale), channels, height, width);
});
AT_CUDA_CHECK(cudaGetLastError());
}
void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "roi_pool_backward_cuda_kernel", [&] {
roi_pool_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), argmax.data_ptr<int>(),
grad_input.data_ptr<scalar_t>(), pooled_height, pooled_width,
channels, height, width);
});
AT_CUDA_CHECK(cudaGetLastError());
}
#include "pytorch_cpp_helper.hpp"
#ifdef WITH_CUDA
void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);
void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
Tensor var);
void SyncBNForwardOutputCUDAKernelLauncher(
const Tensor input, const Tensor mean, const Tensor var,
Tensor running_mean, Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
float momentum, int group_size);
void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
const Tensor norm,
Tensor grad_weight,
Tensor grad_bias);
void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
const Tensor weight,
const Tensor grad_weight,
const Tensor grad_bias,
const Tensor norm, const Tensor std,
Tensor grad_input);
void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) {
SyncBNForwardMeanCUDAKernelLauncher(input, mean);
}
void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,
Tensor var) {
SyncBNForwardVarCUDAKernelLauncher(input, mean, var);
}
void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,
const Tensor var, Tensor running_mean,
Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std,
Tensor output, float eps, float momentum,
int group_size) {
SyncBNForwardOutputCUDAKernelLauncher(input, mean, var, running_mean,
running_var, weight, bias, norm, std,
output, eps, momentum, group_size);
}
void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,
Tensor grad_weight, Tensor grad_bias) {
SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight,
grad_bias);
}
void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
const Tensor grad_weight,
const Tensor grad_bias, const Tensor norm,
const Tensor std, Tensor grad_input) {
SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight,
grad_bias, norm, std, grad_input);
}
#endif
void sync_bn_forward_mean(const Tensor input, Tensor mean) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(mean);
sync_bn_forward_mean_cuda(input, mean);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
}
void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
sync_bn_forward_var_cuda(input, mean, var);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
}
void sync_bn_forward_output(const Tensor input, const Tensor mean,
const Tensor var, Tensor running_mean,
Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std,
Tensor output, float eps, float momentum,
int group_size) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(running_mean);
CHECK_CUDA_INPUT(running_var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(std);
CHECK_CUDA_INPUT(output);
sync_bn_forward_output_cuda(input, mean, var, running_mean, running_var,
weight, bias, norm, std, output, eps, momentum,
group_size);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
}
void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
Tensor grad_weight, Tensor grad_bias) {
if (grad_output.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
sync_bn_backward_param_cuda(grad_output, norm, grad_weight, grad_bias);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
}
void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
const Tensor grad_weight, const Tensor grad_bias,
const Tensor norm, const Tensor std,
Tensor grad_input) {
if (grad_output.device().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(std);
CHECK_CUDA_INPUT(grad_input);
sync_bn_backward_data_cuda(grad_output, weight, grad_weight, grad_bias,
norm, std, grad_input);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
}
#include "pytorch_cuda_helper.hpp"
#include "sync_bn_cuda_kernel.cuh"
void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean) {
int num = input.size(0);
int channels = input.size(1);
int spatial = input.size(2);
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
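  // Input is expected as a 3-D view (num, channels, spatial). One block is
  // launched per channel, and the per-channel mean is accumulated in float
  // (the kernel reads mean as float*) regardless of the input dtype.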
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] {
sync_bn_forward_mean_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
input.data_ptr<scalar_t>(), mean.data_ptr<float>(), num,
channels, spatial);
});
AT_CUDA_CHECK(cudaGetLastError());
}
void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
Tensor var) {
int num = input.size(0);
int channels = input.size(1);
int spatial = input.size(2);
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] {
sync_bn_forward_var_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
input.data_ptr<scalar_t>(), mean.data_ptr<float>(),
var.data_ptr<float>(), num, channels, spatial);
});
AT_CUDA_CHECK(cudaGetLastError());
}
void SyncBNForwardOutputCUDAKernelLauncher(
const Tensor input, const Tensor mean, const Tensor var,
Tensor running_mean, Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
float momentum, int group_size) {
int num = input.size(0);
int channels = input.size(1);
int spatial = input.size(2);
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] {
sync_bn_forward_output_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
input.data_ptr<scalar_t>(), mean.data_ptr<float>(),
var.data_ptr<float>(), running_mean.data_ptr<float>(),
running_var.data_ptr<float>(), weight.data_ptr<float>(),
bias.data_ptr<float>(), norm.data_ptr<float>(),
std.data_ptr<float>(), output.data_ptr<scalar_t>(), num,
channels, spatial, eps, momentum, group_size);
});
AT_CUDA_CHECK(cudaGetLastError());
}
void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
const Tensor norm,
Tensor grad_weight,
Tensor grad_bias) {
int num = grad_output.size(0);
int channels = grad_output.size(1);
int spatial = grad_output.size(2);
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "sync_bn_backward_param_cuda_kernel", [&] {
sync_bn_backward_param_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
grad_output.data_ptr<scalar_t>(), norm.data_ptr<float>(),
grad_weight.data_ptr<float>(), grad_bias.data_ptr<float>(), num,
channels, spatial);
});
AT_CUDA_CHECK(cudaGetLastError());
}
void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
const Tensor weight,
const Tensor grad_weight,
const Tensor grad_bias,
const Tensor norm, const Tensor std,
Tensor grad_input) {
int output_size = grad_input.numel();
int num = grad_input.size(0);
int channels = grad_input.size(1);
int spatial = grad_input.size(2);
at::cuda::CUDAGuard device_guard(grad_input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "sync_bn_backward_data_cuda_kernel", [&] {
sync_bn_backward_data_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.data_ptr<scalar_t>(),
weight.data_ptr<float>(), grad_weight.data_ptr<float>(),
grad_bias.data_ptr<float>(), norm.data_ptr<float>(),
std.data_ptr<float>(), grad_input.data_ptr<scalar_t>(), num,
channels, spatial);
});
AT_CUDA_CHECK(cudaGetLastError());
}
#ifndef PYTORCH_CPP_HELPER
#define PYTORCH_CPP_HELPER
#include <torch/extension.h>
#include <vector>
using namespace at;
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CPU(x) \
TORCH_CHECK(!x.device().is_cuda(), #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) \
CHECK_CPU(x); \
CHECK_CONTIGUOUS(x)
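// CHECK_CUDA_INPUT / CHECK_CPU_INPUT verify device placement and contiguity;
// the op dispatch functions apply them to every tensor argument before a
// kernel is launched.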
#endif // PYTORCH_CPP_HELPER
#ifndef PYTORCH_CUDA_HELPER
#define PYTORCH_CUDA_HELPER
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <THC/THCAtomics.cuh>
#include "common_cuda_helper.hpp"
using at::Half;
using at::Tensor;
using phalf = at::Half;
#define __PHALF(x) (x)
#endif // PYTORCH_CUDA_HELPER
#ifndef ROI_ALIGN_KERNEL_CUH
#define ROI_ALIGN_KERNEL_CUH
/*** Forward ***/
template <typename T>
__global__ void roi_align_forward_cuda_kernel(
const int nthreads, const T* input, const T* rois, T* output, T* argmax_y,
T* argmax_x, const int pooled_height, const int pooled_width,
const T spatial_scale, const int sampling_ratio,
const int pool_mode, // 0 - max pool, 1 - avg pool
const bool aligned, const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
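    // Each RoI is stored as (batch_index, x1, y1, x2, y2); the coordinates
    // are mapped to feature-map scale via spatial_scale below.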
    // Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_rois[1] * spatial_scale - offset;
T roi_start_h = offset_rois[2] * spatial_scale - offset;
T roi_end_w = offset_rois[3] * spatial_scale - offset;
T roi_end_h = offset_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!aligned) { // for backward-compatibility only
roi_width = max(roi_width, (T)1.);
roi_height = max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_height / pooled_height));
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_width / pooled_width));
if (pool_mode == 0) {
// We do max pooling inside a bin
T maxval = -FLT_MAX;
T maxidx_y = -1.f, maxidx_x = -1.f;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val =
bilinear_interpolate(offset_input, height, width, y, x, index);
if (val > maxval) {
maxval = val;
maxidx_y = y;
maxidx_x = x;
}
}
}
output[index] = maxval;
argmax_y[index] = maxidx_y;
argmax_x[index] = maxidx_x;
} else if (pool_mode == 1) {
// We do average pooling inside a bin
const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val =
bilinear_interpolate(offset_input, height, width, y, x, index);
output_val += val;
}
}
output[index] = output_val / count;
}
}
}
/*** Backward ***/
template <typename T>
__global__ void roi_align_backward_cuda_kernel(
const int nthreads, const T* grad_output, const T* rois, const T* argmax_y,
const T* argmax_x, T* grad_input, const int pooled_height,
const int pooled_width, const T spatial_scale, const int sampling_ratio,
const int pool_mode, // 0 - max pool, 1 - avg pool
const bool aligned, const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T grad_output_this_bin = grad_output[index];
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
if (pool_mode == 0) {
T y = argmax_y[index], x = argmax_x[index];
if (y != -1.f) {
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high, index);
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(offset_grad_input + y_low * width + x_low,
grad_output_this_bin * w1);
atomicAdd(offset_grad_input + y_low * width + x_high,
grad_output_this_bin * w2);
atomicAdd(offset_grad_input + y_high * width + x_low,
grad_output_this_bin * w3);
atomicAdd(offset_grad_input + y_high * width + x_high,
grad_output_this_bin * w4);
}
}
} else if (pool_mode == 1) {
      // Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_rois[1] * spatial_scale - offset;
T roi_start_h = offset_rois[2] * spatial_scale - offset;
T roi_end_w = offset_rois[3] * spatial_scale - offset;
T roi_end_h = offset_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!aligned) { // for backward-compatibility only
roi_width = max(roi_width, (T)1.);
roi_height = max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_height / pooled_height));
int roi_bin_grid_w =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_width / pooled_width));
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high, index);
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(offset_grad_input + y_low * width + x_low,
grad_output_this_bin * w1 / count);
atomicAdd(offset_grad_input + y_low * width + x_high,
grad_output_this_bin * w2 / count);
atomicAdd(offset_grad_input + y_high * width + x_low,
grad_output_this_bin * w3 / count);
atomicAdd(offset_grad_input + y_high * width + x_high,
grad_output_this_bin * w4 / count);
}
}
}
}
}
}
#endif // ROI_ALIGN_KERNEL_CUH
#ifndef ROI_POOL_KERNEL_CUH
#define ROI_POOL_KERNEL_CUH
#include <cuda.h>
template <typename T>
__global__ void roi_pool_forward_cuda_kernel(
const int nthreads, const T* input, const T* rois, T* output, int* argmax,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
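    // RoIs are (batch_index, x1, y1, x2, y2); x2/y2 are treated as inclusive,
    // hence the +1 before scaling.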
// calculate the roi region on feature maps
T roi_x1 = offset_rois[1] * spatial_scale;
T roi_y1 = offset_rois[2] * spatial_scale;
T roi_x2 = (offset_rois[3] + 1) * spatial_scale;
T roi_y2 = (offset_rois[4] + 1) * spatial_scale;
    // skip malformed rois (non-positive width or height)
T roi_w = roi_x2 - roi_x1;
T roi_h = roi_y2 - roi_y1;
if (roi_w <= 0 || roi_h <= 0) continue;
T bin_size_w = roi_w / static_cast<T>(pooled_width);
T bin_size_h = roi_h / static_cast<T>(pooled_height);
// the corresponding bin region
int bin_x1 = floor(static_cast<T>(pw) * bin_size_w + roi_x1);
int bin_y1 = floor(static_cast<T>(ph) * bin_size_h + roi_y1);
int bin_x2 = ceil(static_cast<T>(pw + 1) * bin_size_w + roi_x1);
int bin_y2 = ceil(static_cast<T>(ph + 1) * bin_size_h + roi_y1);
// add roi offsets and clip to input boundaries
bin_x1 = min(max(bin_x1, 0), width);
bin_y1 = min(max(bin_y1, 0), height);
bin_x2 = min(max(bin_x2, 0), width);
bin_y2 = min(max(bin_y2, 0), height);
bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1);
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
// Define an empty pooling region to be zero
// If nothing is pooled, argmax = -1 causes nothing to be backprop'd
T max_val = is_empty ? 0 : -FLT_MAX;
int max_idx = -1;
for (int h = bin_y1; h < bin_y2; ++h) {
for (int w = bin_x1; w < bin_x2; ++w) {
int offset = h * width + w;
if (offset_input[offset] > max_val) {
max_val = offset_input[offset];
max_idx = offset;
}
}
}
output[index] = max_val;
if (argmax != NULL) argmax[index] = max_idx;
}
}
template <typename T>
__global__ void roi_pool_backward_cuda_kernel(
const int nthreads, const T* grad_output, const T* rois, const int* argmax,
T* grad_input, const int pooled_height, const int pooled_width,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c) is an element in the pooled output
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
int roi_batch_ind = rois[n * 5];
T* grad_input_offset =
grad_input + ((roi_batch_ind * channels + c) * height * width);
int argmax_index = argmax[index];
if (argmax_index != -1) {
atomicAdd(grad_input_offset + argmax_index, grad_output[index]);
}
}
}
#endif  // ROI_POOL_KERNEL_CUH
#ifndef SIGMOID_FOCAL_LOSS_KERNEL_CUH
#define SIGMOID_FOCAL_LOSS_KERNEL_CUH
template <typename T>
__global__ void sigmoid_focal_loss_forward_cuda_kernel(
const int nthreads, const T* input, const int64_t* target, const T* weight,
T* output, const T gamma, const T alpha, const int num_classes) {
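  // Per-element sigmoid focal loss with p = sigmoid(input):
  //   -alpha * (1 - p)^gamma * log(p)       if c == t (positive class)
  //   -(1 - alpha) * p^gamma * log(1 - p)   otherwise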
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index / num_classes;
int c = index % num_classes;
int64_t t = target[n];
T flag_p = (t == c);
T flag_n = (t != c);
    // p = sigmoid(x) = 1. / (1. + expf(-x))
T p = (T)1. / ((T)1. + expf(-input[index]));
// (1 - p)**gamma * log(p)
T term_p = pow(((T)1. - p), gamma) * log(max(p, (T)FLT_MIN));
// p**gamma * log(1 - p)
T term_n = pow(p, gamma) * log(max((T)1. - p, (T)FLT_MIN));
output[index] = (T)0.;
output[index] += -flag_p * alpha * term_p;
output[index] += -flag_n * ((T)1. - alpha) * term_n;
if (weight != NULL) {
output[index] *= weight[t];
}
}
}
template <typename T>
__global__ void sigmoid_focal_loss_backward_cuda_kernel(
const int nthreads, const T* input, const int64_t* target, const T* weight,
T* grad_input, const T gamma, const T alpha, const int num_classes) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index / num_classes;
int c = index % num_classes;
int64_t t = target[n];
T flag_p = (t == c);
T flag_n = (t != c);
    // p = sigmoid(x) = 1. / (1. + exp(-x))
T p = (T)1. / ((T)1. + exp(-input[index]));
// (1 - p)**gamma * (1 - p - gamma*p*log(p))
T term_p = pow(((T)1. - p), gamma) *
((T)1. - p - (gamma * p * log(max(p, (T)FLT_MIN))));
// p**gamma * (gamma * (1 - p) * log(1 - p) - p)
T term_n = pow(p, gamma) *
(gamma * ((T)1. - p) * log(max((T)1. - p, (T)FLT_MIN)) - p);
grad_input[index] = (T)0.;
grad_input[index] += -flag_p * alpha * term_p;
grad_input[index] += -flag_n * ((T)1. - alpha) * term_n;
if (weight != NULL) {
grad_input[index] *= weight[t];
}
}
}
#endif  // SIGMOID_FOCAL_LOSS_KERNEL_CUH
#ifndef SOFTMAX_FOCAL_LOSS_KERNEL_CUH
#define SOFTMAX_FOCAL_LOSS_KERNEL_CUH
template <typename T>
__global__ void softmax_focal_loss_forward_cuda_kernel(
const int nthreads, const T* softmax, const int64_t* target,
const T* weight, T* output, const T gamma, const T alpha,
const int num_classes) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int64_t label = target[index];
    if (label >= 0) {
      T pred = softmax[index * num_classes + label];
      output[index] =
          -alpha * pow((T)1. - pred, gamma) * log(max(pred, (T)FLT_MIN));
      if (weight != NULL) {
        output[index] *= weight[label];
      }
    } else {
      // ignored samples (negative labels) contribute no loss
      output[index] = 0;
    }
}
}
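// The backward pass is split into two kernels: the first computes a per-sample
// coefficient buff[n] = p_t * dL/dp_t, the second pushes it through the
// softmax Jacobian, i.e. dL/dz_c = buff[n] * ((c == t) - softmax_c).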
template <typename T>
__global__ void softmax_focal_loss_backward_cuda1_kernel(
const int nthreads, const T* softmax, const int64_t* target,
const T* weight, T* buff, const T gamma, const T alpha,
const int num_classes) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int64_t label = target[index];
    if (label >= 0) {
      T pred = softmax[index * num_classes + label];
      buff[index] = alpha * (-pow((T)1. - pred, gamma) +
                             gamma * pow((T)1. - pred, gamma - 1) * pred *
                                 log(max(pred, (T)FLT_MIN)));
      if (weight != NULL) {
        buff[index] *= weight[label];
      }
    } else {
      // ignored samples (negative labels) have a zero gradient coefficient
      buff[index] = 0;
    }
}
}
template <typename T>
__global__ void softmax_focal_loss_backward_cuda2_kernel(
const int nthreads, const T* softmax, const int64_t* target, const T* buff,
T* grad_input, const int num_classes) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index / num_classes;
int c = index % num_classes;
int64_t label = target[n];
T flag = (label == c) * (T)1.;
if (label >= 0) {
grad_input[index] = buff[n] * (flag - softmax[index]);
} else {
grad_input[index] = 0;
}
}
}
#endif  // SOFTMAX_FOCAL_LOSS_KERNEL_CUH
#ifndef SOFTNMS_KERNEL_CUH
#define SOFTNMS_KERNEL_CUH
#include <cuda.h>
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int const threadsPerBlock = sizeof(unsigned long long int) * 8;
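// threadsPerBlock = 64 (the number of bits in an unsigned long long), so each
// block's suppression state fits into a single 64-bit word of the `keep` mask.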
template <typename scalar_t>
__device__ inline scalar_t devIoU(scalar_t const *const a,
scalar_t const *const b) {
scalar_t left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
scalar_t top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
scalar_t width = fmaxf(right - left + 1, 0.f),
height = fmaxf(bottom - top + 1, 0.f);
scalar_t interS = width * height;
scalar_t Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
scalar_t Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
template <typename scalar_t>
__global__ void softnms_max_kernel(const int n_boxes,
const scalar_t overlap_thresh,
const scalar_t *dev_boxes, int *order,
float *max_value, int *max_index) {
__shared__ float maximum[threadsPerBlock];
__shared__ int max_id[threadsPerBlock];
unsigned int tid = threadIdx.x;
unsigned int idx = blockIdx.x * threadsPerBlock + threadIdx.x;
if (idx >= n_boxes) {
return;
}
const int block_size = fminf(n_boxes + tid - idx, threadsPerBlock);
int *l_order = order + (idx - tid);
if (l_order[tid] == 0 && dev_boxes[idx * 5 + 4] >= overlap_thresh) {
maximum[tid] = dev_boxes[idx * 5 + 4];
} else {
maximum[tid] = -1.0;
}
max_id[tid] = tid;
__syncthreads();
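  // Shared-memory tree reduction: each step halves the number of active
  // threads until maximum[0] / max_id[0] hold the block-wide best score and
  // its index; the final 32 lanes proceed warp-synchronously via volatile
  // pointers.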
if (block_size >= 1024 && tid < 512) {
if (maximum[tid] < maximum[tid + 512]) {
maximum[tid] = maximum[tid + 512];
max_id[tid] = max_id[tid + 512];
}
}
if (block_size >= 512 && tid < 256) {
if (maximum[tid] < maximum[tid + 256]) {
maximum[tid] = maximum[tid + 256];
max_id[tid] = max_id[tid + 256];
}
}
if (block_size >= 256 && tid < 128) {
if (maximum[tid] < maximum[tid + 128]) {
maximum[tid] = maximum[tid + 128];
max_id[tid] = max_id[tid + 128];
}
}
if (block_size >= 128 && tid < 64) {
if (maximum[tid] < maximum[tid + 64]) {
maximum[tid] = maximum[tid + 64];
max_id[tid] = max_id[tid + 64];
}
}
if (tid < 32) {
volatile float *vmaximum = maximum;
volatile int *vmax_id = max_id;
if (block_size >= 64 && vmaximum[tid] < vmaximum[tid + 32]) {
vmaximum[tid] = vmaximum[tid + 32];
vmax_id[tid] = vmax_id[tid + 32];
}
if (block_size >= 32 && tid < 16 && vmaximum[tid] < vmaximum[tid + 16]) {
vmaximum[tid] = vmaximum[tid + 16];
vmax_id[tid] = vmax_id[tid + 16];
}
if (block_size >= 16 && tid < 8 && vmaximum[tid] < vmaximum[tid + 8]) {
vmaximum[tid] = vmaximum[tid + 8];
vmax_id[tid] = vmax_id[tid + 8];
}
if (block_size >= 8 && tid < 4 && vmaximum[tid] < vmaximum[tid + 4]) {
vmaximum[tid] = vmaximum[tid + 4];
vmax_id[tid] = vmax_id[tid + 4];
}
if (block_size >= 4 && tid < 2 && vmaximum[tid] < vmaximum[tid + 2]) {
vmaximum[tid] = vmaximum[tid + 2];
vmax_id[tid] = vmax_id[tid + 2];
}
if (block_size >= 2 && tid < 1 && vmaximum[tid] < vmaximum[tid + 1]) {
vmaximum[tid] = vmaximum[tid + 1];
vmax_id[tid] = vmax_id[tid + 1];
}
}
if (tid == 0) {
max_value[blockIdx.x] = maximum[0];
max_index[blockIdx.x] = max_id[0];
}
}
template <typename scalar_t>
__global__ void softnms_update_kernel(const int n_boxes, const scalar_t sigma,
const scalar_t n_thresh,
const unsigned int method,
const scalar_t overlap_thresh,
scalar_t *dev_boxes, int *order,
unsigned long long *keep, int max_id) {
const int col_start = blockIdx.x;
const int col_size =
fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
const int cur_idx = threadsPerBlock * col_start + threadIdx.x;
const int tid = threadIdx.x;
if (cur_idx >= n_boxes) {
return;
}
__shared__ scalar_t cur_max_boxes[5];
cur_max_boxes[0] = dev_boxes[max_id * 5 + 0];
cur_max_boxes[1] = dev_boxes[max_id * 5 + 1];
cur_max_boxes[2] = dev_boxes[max_id * 5 + 2];
cur_max_boxes[3] = dev_boxes[max_id * 5 + 3];
cur_max_boxes[4] = dev_boxes[max_id * 5 + 4];
__syncthreads();
if (cur_idx != max_id && tid < col_size && order[cur_idx] == 0 &&
(!(keep[col_start] & (1ULL << tid)))) {
scalar_t block_boxes[5];
block_boxes[0] = dev_boxes[cur_idx * 5 + 0];
block_boxes[1] = dev_boxes[cur_idx * 5 + 1];
block_boxes[2] = dev_boxes[cur_idx * 5 + 2];
block_boxes[3] = dev_boxes[cur_idx * 5 + 3];
block_boxes[4] = dev_boxes[cur_idx * 5 + 4];
scalar_t ovr = devIoU(cur_max_boxes, block_boxes);
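    // Soft-NMS score decay: method 1 = linear (1 - IoU) when IoU > n_thresh,
    // method 2 = Gaussian exp(-IoU^2 / sigma), otherwise hard NMS
    // (weight = 0 when IoU >= n_thresh).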
scalar_t weight = 1.0;
if (method == 1) {
if (ovr > n_thresh) {
weight = 1.0 - ovr;
}
} else if (method == 2) {
weight = exp(-(ovr * ovr) / sigma);
} else if (ovr >= n_thresh) {
weight = 0.0;
}
block_boxes[4] *= weight;
dev_boxes[cur_idx * 5 + 4] = block_boxes[4];
if (block_boxes[4] < overlap_thresh) {
keep[col_start] |= 1ULL << tid;
}
}
}
#endif  // SOFTNMS_KERNEL_CUH