Unverified commit 48d99025 authored by z55250825, committed by GitHub

Add new parrots extension implementation for all ops (#794)

* delete all parrots files
add new parrots op impl for bbox_overlaps

* support the first new parrots op impl (bbox_overlaps), test succeeded

* add box_iou_rotated op, test succeeded

* add carafe and carafe_naive ops, test succeeded (one parrots bug needs fixing)

* add cc_attention op, test succeeded

* add corner_pool op, test succeeded

* add parrots op deform_conv, test succeeded

* add deform_roi_pool op, test succeeded (one open question remains)

* add focal loss op, test succeeded (gradcheck)

* add masked_conv2d op, test succeeded

* add modulated_deform_conv op, test succeeded

* add nms and nms_rotated ops, test succeeded

* add psamask op, test succeeded

* add roi_align op, test succeeded

* add roi_pool op, test succeeded

* add sync_bn op, test succeeded

* add tin_shift op, test succeeded

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
parent 72e4cc12
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "cc_attention_pytorch.h"
using namespace parrots;
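// Parrots bindings for the CC-Attention (criss-cross attention) CUDA ops:
// each wrapper converts the parrots DArrays in `ins`/`outs` to ATen tensors
// with buildATensor, then forwards to the shared implementation declared in
// cc_attention_pytorch.h.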
/*void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight);*/
void ca_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &t = buildATensor(ctx, ins[0]);
const auto &f = buildATensor(ctx, ins[1]);
auto weight = buildATensor(ctx, outs[0]);
ca_forward_cuda(t, f, weight);
}
/* void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
* Tensor dt, Tensor df)
*/
void ca_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dw = buildATensor(ctx, ins[0]);
const auto &t = buildATensor(ctx, ins[1]);
const auto &f = buildATensor(ctx, ins[2]);
auto dt = buildATensor(ctx, outs[0]);
auto df = buildATensor(ctx, outs[1]);
ca_backward_cuda(dw, t, f, dt, df);
}
/* void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out); */
void ca_map_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &weight = buildATensor(ctx, ins[0]);
const auto &g = buildATensor(ctx, ins[1]);
auto out = buildATensor(ctx, outs[0]);
ca_map_forward_cuda(weight, g, out);
}
/* void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
* const Tensor g, Tensor dw, Tensor dg);
*/
void ca_map_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dout = buildATensor(ctx, ins[0]);
const auto &weight = buildATensor(ctx, ins[1]);
const auto &g = buildATensor(ctx, ins[2]);
auto dw = buildATensor(ctx, outs[0]);
auto dg = buildATensor(ctx, outs[1]);
ca_map_backward_cuda(dout, weight, g, dw, dg);
}
PARROTS_EXTENSION_REGISTER(ca_forward)
.input(2)
.output(1)
.apply(ca_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(ca_backward)
.input(3)
.output(2)
.apply(ca_backward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(ca_map_forward)
.input(2)
.output(1)
.apply(ca_map_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(ca_map_backward)
.input(3)
.output(2)
.apply(ca_map_backward_cuda_parrots)
.done();
#ifndef CC_ATTENTION_PYTORCH_H
#define CC_ATTENTION_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight);
void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
Tensor dt, Tensor df);
void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out);
void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg);
#endif // CC_ATTENTION_PYTORCH_H
// Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "parrots_cpp_helper.hpp"
void bottom_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void bottom_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void top_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void top_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void left_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void left_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void right_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void right_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
.input(1)
.output(1)
.apply(bottom_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
.input(2)
.output(1)
.apply(bottom_pool_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_forward)
.input(1)
.output(1)
.apply(top_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_backward)
.input(2)
.output(1)
.apply(top_pool_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_forward)
.input(1)
.output(1)
.apply(left_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_backward)
.input(2)
.output(1)
.apply(left_pool_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_forward)
.input(1)
.output(1)
.apply(right_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_backward)
.input(2)
.output(1)
.apply(right_pool_backward_cuda)
.done();
#include "pytorch_cpp_helper.hpp"
Tensor bottom_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, ind, height);
Tensor cur_temp = at::slice(output, 2, ind, height).clone();
Tensor next_temp = at::slice(output, 2, 0, height - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
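All four forward passes use the same doubling-stride scan: each pass merges windows twice as wide as the last, so a full running maximum along the pooled axis takes about log2(n) tensor-wide passes instead of n. A minimal standalone sketch of the same recurrence on a 1-D array (illustration only, not part of this commit):
#include <algorithm>
#include <cstdio>

// Same recurrence as bottom_pool_forward, on a plain array: after the pass
// with stride `ind`, out[i] covers a window of 2*ind entries ending at i;
// after ceil(log2(n)) passes, out[i] == max(in[0..i]).
int main() {
  float out[] = {3, 1, 4, 1, 5, 9, 2, 6};  // starts as a copy of the input
  const int n = 8;
  for (int ind = 1; ind < n; ind <<= 1) {
    // Walk downward so out[i - ind] still holds the previous pass's value,
    // mirroring the .clone() calls in the tensor version.
    for (int i = n - 1; i >= ind; --i) out[i] = std::max(out[i], out[i - ind]);
  }
  for (int i = 0; i < n; ++i) printf("%g ", out[i]);  // prints: 3 3 4 4 5 9 9 9
  printf("\n");
  return 0;
}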
Tensor bottom_pool_backward(Tensor input, Tensor grad_output) {
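  // Backward re-runs the forward scan while tracking, per (batch, channel,
  // width) column, the running maximum (max_val) and the row index that
  // produced it (max_ind); each grad_output row is then scatter-added onto
  // the argmax row, so gradient flows only to the winning input elements.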
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(2, 0);
auto grad_output_temp = grad_output.select(2, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 0; ind < height - 1; ++ind) {
input_temp = input.select(2, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
Tensor left_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, 0, width - ind);
Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone();
Tensor next_temp = at::slice(output, 3, ind, width).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor left_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, width - 1);
max_val.copy_(input_temp);
max_ind.fill_(width - 1);
auto output_temp = output.select(3, width - 1);
auto grad_output_temp = grad_output.select(3, width - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < width; ++ind) {
input_temp = input.select(3, width - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, width - ind - 1);
grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor right_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, ind, width);
Tensor cur_temp = at::slice(output, 3, ind, width).clone();
Tensor next_temp = at::slice(output, 3, 0, width - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor right_pool_backward(Tensor input, Tensor grad_output) {
Tensor output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(3, 0);
auto grad_output_temp = grad_output.select(3, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 0; ind < width - 1; ++ind) {
input_temp = input.select(3, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor top_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, 0, height - ind);
Tensor cur_temp = at::slice(output, 2, 0, height - ind).clone();
Tensor next_temp = at::slice(output, 2, ind, height).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor top_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, height - 1);
max_val.copy_(input_temp);
max_ind.fill_(height - 1);
auto output_temp = output.select(2, height - 1);
auto grad_output_temp = grad_output.select(2, height - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < height; ++ind) {
input_temp = input.select(2, height - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, height - ind - 1);
grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "corner_pool_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void bottom_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = bottom_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void bottom_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = bottom_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void left_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = left_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void left_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = left_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void right_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = right_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void right_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = right_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void top_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = top_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void top_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = top_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
#endif
void bottom_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = bottom_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void bottom_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = bottom_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void left_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = left_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void left_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = left_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void right_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = right_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void right_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = right_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void top_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = top_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void top_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = top_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(bottom_pool_forward_parrots)
#endif
.apply(bottom_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(bottom_pool_backward_parrots)
#endif
.apply(bottom_pool_backward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(top_pool_forward_parrots)
#endif
.apply(top_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(top_pool_backward_parrots)
#endif
.apply(top_pool_backward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(left_pool_forward_parrots)
#endif
.apply(left_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(left_pool_backward_parrots)
#endif
.apply(left_pool_backward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(right_pool_forward_parrots)
#endif
.apply(right_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(right_pool_backward_parrots)
#endif
.apply(right_pool_backward_parrots_cpu)
.done();
#ifndef CORNER_POOL_PYTORCH_H
#define CORNER_POOL_PYTORCH_H
#include <torch/extension.h>
at::Tensor bottom_pool_forward(at::Tensor input);
at::Tensor bottom_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor left_pool_forward(at::Tensor input);
at::Tensor left_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor right_pool_forward(at::Tensor input);
at::Tensor right_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor top_pool_forward(at::Tensor input);
at::Tensor top_pool_backward(at::Tensor input, at::Tensor grad_output);
#endif // CORNER_POOL_PYTORCH_H
// Copyright (c) 2018, SenseTime.
#include "pytorch_cpp_helper.hpp"

#ifdef MMCV_WITH_CUDA
void DeformConvForwardCUDAKernelLauncher(Tensor input, Tensor weight,
                                         Tensor offset, Tensor output,
                                         Tensor columns, Tensor ones, int kW,
                                         int kH, int dW, int dH, int padW,
                                         int padH, int dilationW, int dilationH,
                                         int group, int deformable_group,
                                         int im2col_step);

void DeformConvBackwardInputCUDAKernelLauncher(
    Tensor input, Tensor offset, Tensor gradOutput, Tensor gradInput,
    Tensor gradOffset, Tensor weight, Tensor columns, int kW, int kH, int dW,
    int dH, int padW, int padH, int dilationW, int dilationH, int group,
    int deformable_group, int im2col_step);

void DeformConvBackwardParametersCUDAKernelLauncher(
    Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
    Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
    int padH, int dilationW, int dilationH, int group, int deformable_group,
    float scale, int im2col_step);

void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
                              Tensor output, Tensor columns, Tensor ones,
                              int kW, int kH, int dW, int dH, int padW,
                              int padH, int dilationW, int dilationH, int group,
                              int deformable_group, int im2col_step) {
  DeformConvForwardCUDAKernelLauncher(
      input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH,
      dilationW, dilationH, group, deformable_group, im2col_step);
}

void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
                                     Tensor gradOutput, Tensor gradInput,
                                     Tensor gradOffset, Tensor weight,
                                     Tensor columns, int kW, int kH, int dW,
                                     int dH, int padW, int padH, int dilationW,
                                     int dilationH, int group,
                                     int deformable_group, int im2col_step) {
  DeformConvBackwardInputCUDAKernelLauncher(
      input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW,
      kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
      im2col_step);
}

void deform_conv_backward_parameters_cuda(
    Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
    Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
    int padH, int dilationW, int dilationH, int group, int deformable_group,
    float scale, int im2col_step) {
  DeformConvBackwardParametersCUDAKernelLauncher(
      input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH,
      padW, padH, dilationW, dilationH, group, deformable_group, scale,
      im2col_step);
}
#endif
void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
Tensor output, Tensor columns, Tensor ones, int kW,
int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, int im2col_step) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(ones);
deform_conv_forward_cuda(input, weight, offset, output, columns, ones, kW,
kH, dW, dH, padW, padH, dilationW, dilationH,
group, deformable_group, im2col_step);
#else
AT_ERROR("DeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformConv is not implemented on CPU");
}
}
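For context, CHECK_CUDA_INPUT and AT_ERROR come from pytorch_cpp_helper.hpp and ATen respectively; the checking macros are conventionally defined roughly as below (a sketch for reference, not a quote of the header, whose exact messages may differ):
// Sketch of the input-checking macros used by the dispatchers in this file.
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) \
  CHECK_CUDA(x);            \
  CHECK_CONTIGUOUS(x)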
void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
Tensor gradInput, Tensor gradOffset,
Tensor weight, Tensor columns, int kW, int kH,
int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, int im2col_step) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(gradOutput);
CHECK_CUDA_INPUT(gradInput);
CHECK_CUDA_INPUT(gradOffset);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(columns);
deform_conv_backward_input_cuda(input, offset, gradOutput, gradInput,
gradOffset, weight, columns, kW, kH, dW, dH,
padW, padH, dilationW, dilationH, group,
deformable_group, im2col_step);
#else
AT_ERROR("DeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformConv is not implemented on CPU");
}
}
void deform_conv_backward_parameters(Tensor input, Tensor offset,
Tensor gradOutput, Tensor gradWeight,
Tensor columns, Tensor ones, int kW,
int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, float scale,
int im2col_step) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(gradOutput);
CHECK_CUDA_INPUT(gradWeight);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(ones);
deform_conv_backward_parameters_cuda(input, offset, gradOutput, gradWeight,
columns, ones, kW, kH, dW, dH, padW,
padH, dilationW, dilationH, group,
deformable_group, scale, im2col_step);
#else
AT_ERROR("DeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformConv is not implemented on CPU");
}
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "deform_conv_pytorch.h"
using namespace parrots;
/*void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
* Tensor output, Tensor columns, Tensor ones,
* int kW, int kH, int dW, int dH, int padW,
* int padH, int dilationW, int dilationH, int
* group, int deformable_group, int im2col_step);
*/
void deform_conv_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& weight = buildATensor(ctx, ins[1]);
const auto& offset = buildATensor(ctx, ins[2]);
auto output = buildATensor(ctx, outs[0]);
auto columns = buildATensor(ctx, outs[1]);
auto ones = buildATensor(ctx, outs[2]);
deform_conv_forward_cuda(input, weight, offset, output, columns, ones, kW, kH,
dW, dH, padW, padH, dilationW, dilationH, group,
deformable_group, im2col_step);
}
/*void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
* Tensor gradOutput, Tensor gradInput,
* Tensor gradOffset, Tensor weight,
* Tensor columns, int kW, int kH, int dW,
* int dH, int padW, int padH, int
* dilationW, int dilationH, int group, int deformable_group, int im2col_step);
*/
void deform_conv_backward_input_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& offset = buildATensor(ctx, ins[1]);
const auto& gradOutput = buildATensor(ctx, ins[2]);
auto gradInput = buildATensor(ctx, outs[0]);
auto gradOffset = buildATensor(ctx, outs[1]);
auto weight = buildATensor(ctx, outs[2]);
auto columns = buildATensor(ctx, outs[3]);
deform_conv_backward_input_cuda(input, offset, gradOutput, gradInput,
gradOffset, weight, columns, kW, kH, dW, dH,
padW, padH, dilationW, dilationH, group,
deformable_group, im2col_step);
}
/*void deform_conv_backward_parameters_cuda(
* Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
* Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
* int padH, int dilationW, int dilationH, int group, int deformable_group,
* float scale, int im2col_step);
*/
void deform_conv_backward_parameters_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
float scale;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<float>("scale", scale)
.get<int>("im2col_step", im2col_step)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& offset = buildATensor(ctx, ins[1]);
const auto& gradOutput = buildATensor(ctx, ins[2]);
auto gradWeight = buildATensor(ctx, outs[0]);
auto columns = buildATensor(ctx, outs[1]);
auto ones = buildATensor(ctx, outs[2]);
deform_conv_backward_parameters_cuda(input, offset, gradOutput, gradWeight,
columns, ones, kW, kH, dW, dH, padW,
padH, dilationW, dilationH, group,
deformable_group, scale, im2col_step);
}
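// Note: the attribute names read via SSAttrs in the wrappers above must match
// the .attr() declarations in the registrations below, and the ins/outs
// ordering must match the caller's convention; mismatches surface at runtime,
// not at compile time.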
PARROTS_EXTENSION_REGISTER(deform_conv_forward)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("im2col_step")
.input(3)
.output(3)
.apply(deform_conv_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(deform_conv_backward_input)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("im2col_step")
.input(3)
.output(4)
.apply(deform_conv_backward_input_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("scale")
.attr("im2col_step")
.input(3)
.output(3)
.apply(deform_conv_backward_parameters_cuda_parrots)
.done();
#ifndef DEFORM_CONV_PYTORCH_H
#define DEFORM_CONV_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
Tensor output, Tensor columns, Tensor ones,
int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step);
void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
Tensor gradOutput, Tensor gradInput,
Tensor gradOffset, Tensor weight,
Tensor columns, int kW, int kH, int dW,
int dH, int padW, int padH, int dilationW,
int dilationH, int group,
int deformable_group, int im2col_step);
void deform_conv_backward_parameters_cuda(
Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
float scale, int im2col_step);
#endif // DEFORM_CONV_PYTORCH_H
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite rois, const DArrayLite offset,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream);
#ifdef MMCV_WITH_CUDA
void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor offset, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma);
void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset,
int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream);
Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float gamma);
void deform_roi_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
const auto& offset = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolForwardCUDAKernelLauncher(
input, rois, offset, output, pooled_height, pooled_width, spatial_scale,
sampling_ratio, gamma, stream);
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) {
DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output,
pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
}
void deform_roi_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& grad_output = ins[0];
const auto& input = ins[1];
const auto& rois = ins[2];
const auto& offset = ins[3];
auto& grad_input = outs[0];
auto& grad_offset = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset,
int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
DeformRoIPoolBackwardCUDAKernelLauncher(
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma, stream);
pooled_width, spatial_scale, sampling_ratio, gamma);
}
#endif
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(output);
deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
pooled_width, spatial_scale, sampling_ratio,
gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
}
void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
Tensor offset, Tensor grad_input,
Tensor grad_offset, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) {
if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_offset);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
}
#include "deform_roi_pool_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite rois, const DArrayLite offset,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream) {
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor offset, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "deform_roi_pool_forward_cuda_kernel", [&] {
deform_roi_pool_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
offset.ptr<scalar_t>(), output.ptr<scalar_t>(), pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma, channels,
height, width);
}));
output_size, input.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), offset.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(), pooled_height, pooled_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio,
static_cast<scalar_t>(gamma), channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset,
int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream) {
int output_size = grad_output.size();
int channels = grad_input.dim(1);
int height = grad_input.dim(2);
int width = grad_input.dim(3);
Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float gamma) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "deform_roi_pool_backward_cuda_kernel", [&] {
deform_roi_pool_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), input.ptr<scalar_t>(),
rois.ptr<scalar_t>(), offset.ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), grad_offset.ptr<scalar_t>(),
pooled_height, pooled_width, spatial_scale, sampling_ratio,
gamma, channels, height, width);
}));
output_size, grad_output.data_ptr<scalar_t>(),
input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
offset.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
grad_offset.data_ptr<scalar_t>(), pooled_height, pooled_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio,
static_cast<scalar_t>(gamma), channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
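GET_BLOCKS and THREADS_PER_BLOCK come from the shared CUDA helper header; a sketch of the usual definitions (the constants here are typical assumptions, not quoted from mmcv's helper):
#include <algorithm>

// Sketch of the launch-configuration helpers. Kernels iterate with a
// grid-stride loop, so capping the grid size is safe.
#define THREADS_PER_BLOCK 512
inline int GET_BLOCKS(const int N) {
  int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  int max_block_num = 4096;
  return std::min(optimal_block_num, max_block_num);
}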
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "deform_roi_pool_pytorch.h"
using namespace parrots;
/*void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
* Tensor output, int pooled_height,
* int pooled_width, float spatial_scale,
* int sampling_ratio, float gamma);
*/
void deform_roi_pool_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& rois = buildATensor(ctx, ins[1]);
const auto& offset = buildATensor(ctx, ins[2]);
auto output = buildATensor(ctx, outs[0]);
deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
pooled_width, spatial_scale, sampling_ratio,
gamma);
}
/*void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
* Tensor rois, Tensor offset,
* Tensor grad_input, Tensor grad_offset,
* int pooled_height, int pooled_width,
* float spatial_scale, int sampling_ratio,
* float gamma);
*/
void deform_roi_pool_backward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& grad_output = buildATensor(ctx, ins[0]);
const auto& input = buildATensor(ctx, ins[1]);
const auto& rois = buildATensor(ctx, ins[2]);
const auto& offset = buildATensor(ctx, ins[3]);
auto grad_input = buildATensor(ctx, outs[0]);
auto grad_offset = buildATensor(ctx, outs[1]);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
}
PARROTS_EXTENSION_REGISTER(deform_roi_pool_forward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("gamma")
.input(3)
.output(1)
.apply(deform_roi_pool_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(deform_roi_pool_backward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("gamma")
.input(4)
.output(2)
.apply(deform_roi_pool_backward_cuda_parrots)
.done();
#ifndef DEFORM_ROI_POOL_PYTORCH_H
#define DEFORM_ROI_POOL_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma);
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset,
int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma);
#endif // DEFORM_ROI_POOL_PYTORCH_H
// Copyright (c) 2018, SenseTime.
#include "pytorch_cpp_helper.hpp"

#ifdef MMCV_WITH_CUDA
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha);

void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight, Tensor buff,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha);

void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target,
                                     Tensor weight, Tensor output, float gamma,
                                     float alpha) {
  SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha);
}

void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha) {
  SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
                                             gamma, alpha);
}

void softmax_focal_loss_forward_cuda(Tensor input, Tensor target,
                                     Tensor weight, Tensor output, float gamma,
                                     float alpha) {
  SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha);
}

void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
  SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
                                             grad_input, gamma, alpha);
}
#endif
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
}
void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_input);
sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
}
void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
softmax_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
}
void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input, float gamma,
float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(buff);
CHECK_CUDA_INPUT(grad_input);
softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
gamma, alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
}
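The sigmoid kernels evaluate the RetinaNet-style focal loss element-wise. A hypothetical scalar reference, ignoring the per-class weight tensor the kernels also multiply in (illustration only, not part of mmcv):
#include <cmath>

// FL(p) = -alpha * (1 - p)^gamma * log(p)        for the target class
// FL(p) = -(1 - alpha) * p^gamma * log(1 - p)    for all other classes
float sigmoid_focal_loss_scalar(float logit, bool is_target, float gamma,
                                float alpha) {
  float p = 1.0f / (1.0f + std::exp(-logit));  // sigmoid probability
  return is_target ? -alpha * std::pow(1.0f - p, gamma) * std::log(p)
                   : -(1.0f - alpha) * std::pow(p, gamma) * std::log(1.0f - p);
}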
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
#include "sigmoid_focal_loss_cuda_kernel.cuh"
#include "softmax_focal_loss_cuda_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha) {
  int output_size = output.numel();
  int num_classes = input.size(1);
  AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,
             "target label should be smaller or equal to num classes");
  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sigmoid_focal_loss_forward_cuda_kernel", [&] {
        sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, input.data_ptr<scalar_t>(),
                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
                output.data_ptr<scalar_t>(), gamma, alpha, num_classes);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}
void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha) {
  int output_size = grad_input.numel();
  int num_classes = input.size(1);

  at::cuda::CUDAGuard device_guard(grad_input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sigmoid_focal_loss_backward_cuda_kernel", [&] {
        sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, input.data_ptr<scalar_t>(),
                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
                grad_input.data_ptr<scalar_t>(), gamma, alpha, num_classes);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha) {
  int output_size = output.numel();
  int num_classes = softmax.size(1);
  AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,
             "target label should be smaller than or equal to num_classes");
  at::cuda::CUDAGuard device_guard(softmax.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      softmax.scalar_type(), "softmax_focal_loss_forward_cuda_kernel", [&] {
        softmax_focal_loss_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, softmax.data_ptr<scalar_t>(),
                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
                output.data_ptr<scalar_t>(), gamma, alpha, num_classes);
      });
  AT_CUDA_CHECK(cudaGetLastError());
}
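As a plain-C++ reference of the math the forward kernel is expected to compute per sample (a sketch based on the focal loss definition, not the actual .cuh kernel):
#include <cmath>
#include <cstdint>
// Sketch: softmax focal loss for one sample, given already-softmaxed
// probabilities over the classes and the target label.
// FL = -alpha * (1 - p_t)^gamma * log(p_t), optionally class-weighted.
inline float softmax_focal_loss_ref(const float* softmax, int64_t label,
                                    const float* weight, float gamma,
                                    float alpha) {
  float pt = softmax[label];               // probability of the target class
  float w = weight ? weight[label] : 1.f;  // optional per-class weight
  return -alpha * std::pow(1.f - pt, gamma) * std::log(pt) * w;
}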
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,
                                                Tensor weight, Tensor buff,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha) {
  int num_classes = softmax.size(1);

  int output_size = buff.numel();
  at::cuda::CUDAGuard device_guard(grad_input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_input.scalar_type(), "softmax_focal_loss_backward_cuda1_kernel",
      [&] {
        softmax_focal_loss_backward_cuda1_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, softmax.data_ptr<scalar_t>(),
                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
                buff.data_ptr<scalar_t>(), gamma, alpha, num_classes);
      });
  AT_CUDA_CHECK(cudaGetLastError());

  output_size = grad_input.numel();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_input.scalar_type(), "softmax_focal_loss_backward_cuda2_kernel",
      [&] {
        softmax_focal_loss_backward_cuda2_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, softmax.data_ptr<scalar_t>(),
                target.data_ptr<int64_t>(), buff.data_ptr<scalar_t>(),
                grad_input.data_ptr<scalar_t>(), num_classes);
      });
  AT_CUDA_CHECK(cudaGetLastError());
}
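For completeness, the sigmoid variant treats each class as an independent binary problem; a reference sketch of the per-logit term (from the focal loss definition of Lin et al., not the actual .cuh kernel):
#include <cmath>
// Sketch: sigmoid focal loss for a single logit x with binary label y.
// FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t).
inline float sigmoid_focal_loss_ref(float x, int y, float gamma, float alpha) {
  float p = 1.f / (1.f + std::exp(-x));       // sigmoid probability
  float pt = (y == 1) ? p : 1.f - p;          // probability assigned to label
  float at = (y == 1) ? alpha : 1.f - alpha;  // class-balancing factor
  return -at * std::pow(1.f - pt, gamma) * std::log(pt);
}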
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "focal_loss_pytorch.h"
using namespace parrots;
void sigmoid_focal_loss_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = buildATensor(ctx, ins[0]);
const auto& target = buildATensor(ctx, ins[1]);
const auto& weight = buildATensor(ctx, ins[2]);
auto output = buildATensor(ctx, outs[0]);
sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma, alpha);
}
void sigmoid_focal_loss_backward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = buildATensor(ctx, ins[0]);
const auto& target = buildATensor(ctx, ins[1]);
const auto& weight = buildATensor(ctx, ins[2]);
auto grad_input = buildATensor(ctx, outs[0]);
sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
alpha);
}
void softmax_focal_loss_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = buildATensor(ctx, ins[0]);
const auto& target = buildATensor(ctx, ins[1]);
const auto& weight = buildATensor(ctx, ins[2]);
auto output = buildATensor(ctx, outs[0]);
softmax_focal_loss_forward_cuda(input, target, weight, output, gamma, alpha);
}
void softmax_focal_loss_backward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = buildATensor(ctx, ins[0]);
const auto& target = buildATensor(ctx, ins[1]);
const auto& weight = buildATensor(ctx, ins[2]);
auto buff = buildATensor(ctx, outs[0]);
auto grad_input = buildATensor(ctx, outs[1]);
softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
gamma, alpha);
}
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_backward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(softmax_focal_loss_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(2)
.apply(softmax_focal_loss_backward_cuda_parrots)
.done();
#ifndef FOCAL_LOSS_PYTORCH_H
#define FOCAL_LOSS_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha);
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
Tensor weight, Tensor grad_input,
float gamma, float alpha);
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha);
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input, float gamma,
float alpha);
#endif // FOCAL_LOSS_PYTORCH_H
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int kernel_h,
                                           const int kernel_w, const int pad_h,
                                           const int pad_w);
void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int height,
                                           const int width, const int channels);
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w) {
  // im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
                                        kernel_h, kernel_w, pad_h, pad_w);
}
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  // im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
                                        width, channels);
}
#endif
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
if (im.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(im);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(col);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
}
void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) {
if (col.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(col);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(im);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
}
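A note on shapes for the two ops above: masked im2col gathers only the masked output positions, so the col buffer holds one column per masked location rather than one per dense output pixel. A small sketch (helper name and layout are assumptions):
// Hypothetical helper: expected col shape for masked_im2col_forward with
// ic input channels, a (kh, kw) kernel, and mask_cnt masked positions.
struct MaskedColShape {
  int rows;  // ic * kh * kw entries per sampled patch
  int cols;  // mask_cnt, one column per masked output location
};
inline MaskedColShape masked_col_shape(int ic, int kh, int kw, int mask_cnt) {
  return {ic * kh * kw, mask_cnt};
}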
#include "masked_conv2d_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
                                           const Tensor mask_h_idx,
                                           const Tensor mask_w_idx,
                                           Tensor top_data, const int kernel_h,
                                           const int kernel_w, const int pad_h,
                                           const int pad_w) {
  int channels = bottom_data.size(1);
  int height = bottom_data.size(2);
  int width = bottom_data.size(3);
  int mask_cnt = mask_h_idx.size(0);
  int output_size = mask_cnt * channels;

  at::cuda::CUDAGuard device_guard(bottom_data.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      bottom_data.scalar_type(), "MaskedIm2colLauncherForward", ([&] {
        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
        MaskedIm2colForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, bottom_data_, height, width, kernel_h, kernel_w,
                pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}
void MaskedCol2imForwardCUDAKernelLauncher(
    const Tensor bottom_data, const Tensor mask_h_idx, const Tensor mask_w_idx,
    Tensor top_data, const int height, const int width, const int channels) {
  int mask_cnt = mask_h_idx.size(0);
  int output_size = mask_cnt * channels;

  at::cuda::CUDAGuard device_guard(bottom_data.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      bottom_data.scalar_type(), "MaskedCol2imLauncherForward", ([&] {
        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
        MaskedCol2imForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, bottom_data_, height, width, channels,
                mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}
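For intuition, a CPU sketch of the gather that MaskedIm2colForward is expected to perform (the layout mirrors output_size = mask_cnt * channels above; otherwise illustrative, not the actual .cuh kernel):
#include <cstdint>
// CPU reference sketch: copy each (kh x kw) patch around a masked position
// into its column, zero-filling taps that fall outside the padded input.
template <typename T>
void masked_im2col_cpu_ref(const T* im, int channels, int height, int width,
                           int kernel_h, int kernel_w, int pad_h, int pad_w,
                           const int64_t* mask_h_idx, const int64_t* mask_w_idx,
                           int mask_cnt, T* col) {
  for (int c = 0; c < channels; ++c)
    for (int m = 0; m < mask_cnt; ++m)
      for (int i = 0; i < kernel_h; ++i)
        for (int j = 0; j < kernel_w; ++j) {
          int h = static_cast<int>(mask_h_idx[m]) + i - pad_h;
          int w = static_cast<int>(mask_w_idx[m]) + j - pad_w;
          int row = (c * kernel_h + i) * kernel_w + j;
          col[row * mask_cnt + m] =
              (h >= 0 && h < height && w >= 0 && w < width)
                  ? im[(c * height + h) * width + w]
                  : T(0);
        }
}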
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "masked_conv2d_pytorch.h"
using namespace parrots;
void masked_im2col_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = buildATensor(ctx, ins[0]);
const auto& mask_h_idx = buildATensor(ctx, ins[1]);
const auto& mask_w_idx = buildATensor(ctx, ins[2]);
auto col = buildATensor(ctx, outs[0]);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
}
void masked_col2im_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int height, width, channels;
SSAttrs(attr)
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = buildATensor(ctx, ins[0]);
const auto& mask_h_idx = buildATensor(ctx, ins[1]);
const auto& mask_w_idx = buildATensor(ctx, ins[2]);
auto im = buildATensor(ctx, outs[0]);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
}
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("pad_h")
.attr("pad_w")
.input(3)
.output(1)
.apply(masked_im2col_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
.attr("height")
.attr("width")
.attr("channels")
.input(3)
.output(1)
.apply(masked_col2im_forward_cuda_parrots)
.done();