Unverified commit 48d99025, authored by z55250825 and committed by GitHub

Add new parrots extension implementation for all ops (#794)

* delete all parrots files
add new parrots op impl for bbox_overlaps

* support first new parrots op impl (bbox_overlaps), test succeeded

* add box_iou_rotated op, test succeeded

* add carafe and carafe_naive ops, test succeeded (one parrots bug needs fixing)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test success (one open question remains)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
Parent commit: 72e4cc12
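Every file in this commit follows the same wiring: a thin parrots entry point unpacks the attribute dictionary and the input/output tensor lists, converts each DArray to an ATen tensor with buildATensor, and forwards to the existing PyTorch implementation in csrc. A minimal sketch of that pattern, using a hypothetical single-input op my_op (the op name and arity here are illustrative, not part of the commit):

#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

using namespace parrots;

// Existing PyTorch-side implementation, assumed declared in a csrc header.
void my_op_forward_cuda(const at::Tensor in, at::Tensor out);

void my_op_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                const OperatorBase::in_list_t &ins,
                                OperatorBase::out_list_t &outs) {
  // Wrap the parrots DArrays as ATen tensors and reuse the PyTorch op.
  const auto &in = buildATensor(ctx, ins[0]);
  auto out = buildATensor(ctx, outs[0]);
  my_op_forward_cuda(in, out);
}

PARROTS_EXTENSION_REGISTER(my_op_forward)
    .input(1)   // must match the number of tensors the caller passes in
    .output(1)  // outputs are pre-allocated by the caller
    .apply(my_op_forward_cuda_parrots)
    .done();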
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "cc_attention_pytorch.h"
using namespace parrots;
/*void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight);*/
void ca_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &t = buildATensor(ctx, ins[0]);
const auto &f = buildATensor(ctx, ins[1]);
auto weight = buildATensor(ctx, outs[0]);
ca_forward_cuda(t, f, weight);
}
/* void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
* Tensor dt, Tensor df)
*/
void ca_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dw = buildATensor(ctx, ins[0]);
const auto &t = buildATensor(ctx, ins[1]);
const auto &f = buildATensor(ctx, ins[2]);
auto dt = buildATensor(ctx, outs[0]);
auto df = buildATensor(ctx, outs[1]);
ca_backward_cuda(dw, t, f, dt, df);
}
/* void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out); */
void ca_map_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &weight = buildATensor(ctx, ins[0]);
const auto &g = buildATensor(ctx, ins[1]);
auto out = buildATensor(ctx, outs[0]);
ca_map_forward_cuda(weight, g, out);
}
/* void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
* const Tensor g, Tensor dw, Tensor dg);
*/
void ca_map_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dout = buildATensor(ctx, ins[0]);
const auto &weight = buildATensor(ctx, ins[1]);
const auto &g = buildATensor(ctx, ins[2]);
auto dw = buildATensor(ctx, outs[0]);
auto dg = buildATensor(ctx, outs[1]);
ca_map_backward_cuda(dout, weight, g, dw, dg);
}
PARROTS_EXTENSION_REGISTER(ca_forward)
.input(2)
.output(1)
.apply(ca_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(ca_backward)
.input(3)
.output(2)
.apply(ca_backward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(ca_map_forward)
.input(2)
.output(1)
.apply(ca_map_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(ca_map_backward)
.input(3)
.output(2)
.apply(ca_map_backward_cuda_parrots)
.done();
#ifndef CC_ATTENTION_PYTORCH_H
#define CC_ATTENTION_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight);
void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
Tensor dt, Tensor df);
void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out);
void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg);
#endif // CC_ATTENTION_PYTORCH_H
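A hypothetical call-site sketch for ca_forward_cuda, assuming the CCNet convention that each position attends along its criss-cross path, so the attention map carries H + W - 1 entries per position; all names and shapes below are illustrative, not taken from this commit:

#include <torch/torch.h>

void run_ca_forward() {
  const int64_t N = 2, C = 8, H = 16, W = 16;
  auto opts = torch::dtype(torch::kFloat32).device(torch::kCUDA);
  auto t = torch::randn({N, C, H, W}, opts);  // query projection
  auto f = torch::randn({N, C, H, W}, opts);  // key projection
  // Pre-allocated output, mirroring the .output(1) registration above.
  auto weight = torch::zeros({N, H + W - 1, H, W}, opts);
  ca_forward_cuda(t, f, weight);  // declared in cc_attention_pytorch.h
}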
// Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "parrots_cpp_helper.hpp"

void bottom_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
                              const OperatorBase::in_list_t& ins,
                              OperatorBase::out_list_t& outs) {}

void bottom_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
                               const OperatorBase::in_list_t& ins,
                               OperatorBase::out_list_t& outs) {}

void top_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
                           const OperatorBase::in_list_t& ins,
                           OperatorBase::out_list_t& outs) {}

void top_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
                            const OperatorBase::in_list_t& ins,
                            OperatorBase::out_list_t& outs) {}

void left_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
                            const OperatorBase::in_list_t& ins,
                            OperatorBase::out_list_t& outs) {}

void left_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
                             const OperatorBase::in_list_t& ins,
                             OperatorBase::out_list_t& outs) {}

void right_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
                             const OperatorBase::in_list_t& ins,
                             OperatorBase::out_list_t& outs) {}

void right_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
                              const OperatorBase::in_list_t& ins,
                              OperatorBase::out_list_t& outs) {}

PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
    .input(1)
    .output(1)
    .apply(bottom_pool_forward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
    .input(2)
    .output(1)
    .apply(bottom_pool_backward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(top_pool_forward)
    .input(1)
    .output(1)
    .apply(top_pool_forward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(top_pool_backward)
    .input(2)
    .output(1)
    .apply(top_pool_backward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(left_pool_forward)
    .input(1)
    .output(1)
    .apply(left_pool_forward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(left_pool_backward)
    .input(2)
    .output(1)
    .apply(left_pool_backward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(right_pool_forward)
    .input(1)
    .output(1)
    .apply(right_pool_forward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(right_pool_backward)
    .input(2)
    .output(1)
    .apply(right_pool_backward_cuda)
    .done();

// Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "pytorch_cpp_helper.hpp"

Tensor bottom_pool_forward(Tensor input) {
  // Initialize output
  Tensor output = at::zeros_like(input);
  // Get height
  int64_t height = input.size(2);
  output.copy_(input);

  for (int64_t ind = 1; ind < height; ind <<= 1) {
    Tensor max_temp = at::slice(output, 2, ind, height);
    Tensor cur_temp = at::slice(output, 2, ind, height).clone();
    Tensor next_temp = at::slice(output, 2, 0, height - ind).clone();
    at::max_out(max_temp, cur_temp, next_temp);
  }

  return output;
}

Tensor bottom_pool_backward(Tensor input, Tensor grad_output) {
  auto output = at::zeros_like(input);
  int32_t batch = input.size(0);
  int32_t channel = input.size(1);
  int32_t height = input.size(2);
  int32_t width = input.size(3);

  auto max_val = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kFloat));
  auto max_ind = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kLong));

  auto input_temp = input.select(2, 0);
  max_val.copy_(input_temp);

  max_ind.fill_(0);

  auto output_temp = output.select(2, 0);
  auto grad_output_temp = grad_output.select(2, 0);
  output_temp.copy_(grad_output_temp);

  auto un_max_ind = max_ind.unsqueeze(2);
  auto gt_mask = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kBool));
  auto max_temp = torch::zeros({batch, channel, width},
                               at::device(at::kCUDA).dtype(at::kFloat));
  for (int32_t ind = 0; ind < height - 1; ++ind) {
    input_temp = input.select(2, ind + 1);
    at::gt_out(gt_mask, input_temp, max_val);

    at::masked_select_out(max_temp, input_temp, gt_mask);
    max_val.masked_scatter_(gt_mask, max_temp);
    max_ind.masked_fill_(gt_mask, ind + 1);

    grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2);
    output.scatter_add_(2, un_max_ind, grad_output_temp);
  }

  return output;
}

Tensor left_pool_forward(Tensor input) {
  // Initialize output
  Tensor output = at::zeros_like(input);
  // Get width
  int64_t width = input.size(3);
  output.copy_(input);

  for (int64_t ind = 1; ind < width; ind <<= 1) {
    Tensor max_temp = at::slice(output, 3, 0, width - ind);
    Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone();
    Tensor next_temp = at::slice(output, 3, ind, width).clone();
    at::max_out(max_temp, cur_temp, next_temp);
  }

  return output;
}

Tensor left_pool_backward(Tensor input, Tensor grad_output) {
  auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, width - 1);
max_val.copy_(input_temp);
max_ind.fill_(width - 1);
auto output_temp = output.select(3, width - 1);
auto grad_output_temp = grad_output.select(3, width - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < width; ++ind) {
input_temp = input.select(3, width - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, width - ind - 1);
grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor right_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, ind, width);
Tensor cur_temp = at::slice(output, 3, ind, width).clone();
Tensor next_temp = at::slice(output, 3, 0, width - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor right_pool_backward(Tensor input, Tensor grad_output) {
Tensor output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(3, 0);
auto grad_output_temp = grad_output.select(3, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 0; ind < width - 1; ++ind) {
input_temp = input.select(3, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor top_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, 0, height - ind);
Tensor cur_temp = at::slice(output, 2, 0, height - ind).clone();
Tensor next_temp = at::slice(output, 2, ind, height).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor top_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, height - 1);
max_val.copy_(input_temp);
max_ind.fill_(height - 1);
auto output_temp = output.select(2, height - 1);
auto grad_output_temp = grad_output.select(2, height - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < height; ++ind) {
input_temp = input.select(2, height - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, height - ind - 1);
grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
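The four forward passes above are directional running maxima computed in O(log n) sweeps: each doubling step (ind <<= 1) folds the pooled axis onto itself shifted by ind, so the result is a cumulative max along one spatial axis, optionally scanned in reverse. A quick equivalence check, assuming a libtorch recent enough to provide at::cummax (corner_pool_ref is a hypothetical reference helper, not part of the commit):

#include <torch/torch.h>

// Each corner pool is a cumulative max along one spatial axis, optionally
// scanned in reverse; this helper mirrors the four forwards above.
at::Tensor corner_pool_ref(const at::Tensor &input, int64_t dim, bool reverse) {
  auto x = reverse ? at::flip(input, {dim}) : input;
  auto y = std::get<0>(at::cummax(x, dim));  // running max along `dim`
  return reverse ? at::flip(y, {dim}) : y;
}

void check_corner_pools() {
  auto x = torch::randn({2, 3, 8, 8});
  TORCH_CHECK(at::allclose(bottom_pool_forward(x), corner_pool_ref(x, 2, false)));
  TORCH_CHECK(at::allclose(top_pool_forward(x), corner_pool_ref(x, 2, true)));
  TORCH_CHECK(at::allclose(right_pool_forward(x), corner_pool_ref(x, 3, false)));
  TORCH_CHECK(at::allclose(left_pool_forward(x), corner_pool_ref(x, 3, true)));
}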
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "corner_pool_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void bottom_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = bottom_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void bottom_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = bottom_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void left_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = left_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void left_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = left_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void right_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = right_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void right_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = right_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void top_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = top_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void top_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = top_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
#endif
void bottom_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = bottom_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void bottom_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = bottom_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void left_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = left_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void left_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = left_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void right_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = right_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void right_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = right_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void top_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = top_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void top_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = top_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(bottom_pool_forward_parrots)
#endif
.apply(bottom_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(bottom_pool_backward_parrots)
#endif
.apply(bottom_pool_backward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(top_pool_forward_parrots)
#endif
.apply(top_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(top_pool_backward_parrots)
#endif
.apply(top_pool_backward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(left_pool_forward_parrots)
#endif
.apply(left_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(left_pool_backward_parrots)
#endif
.apply(left_pool_backward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(right_pool_forward_parrots)
#endif
.apply(right_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(right_pool_backward_parrots)
#endif
.apply(right_pool_backward_parrots_cpu)
.done();
#ifndef CORNER_POOL_PYTORCH_H
#define CORNER_POOL_PYTORCH_H
#include <torch/extension.h>
at::Tensor bottom_pool_forward(at::Tensor input);
at::Tensor bottom_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor left_pool_forward(at::Tensor input);
at::Tensor left_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor right_pool_forward(at::Tensor input);
at::Tensor right_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor top_pool_forward(at::Tensor input);
at::Tensor top_pool_backward(at::Tensor input, at::Tensor grad_output);
#endif // CORNER_POOL_PYTORCH_H
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"

void DeformConvForwardCUDAKernelLauncher(
    const DArrayLite input, const DArrayLite weight, const DArrayLite offset,
    DArrayLite output, DArrayLite columns, DArrayLite ones, int kW, int kH,
    int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group,
    int deformable_group, int im2col_step, CudaContext& ctx,
    cudaStream_t stream);

void DeformConvBackwardInputCUDAKernelLauncher(
    const DArrayLite input, const DArrayLite offset,
    const DArrayLite gradOutput, DArrayLite gradInput, DArrayLite gradOffset,
    DArrayLite weight, DArrayLite columns, int kW, int kH, int dW, int dH,
    int padW, int padH, int dilationW, int dilationH, int group,
    int deformable_group, int im2col_step, CudaContext& ctx,
    cudaStream_t stream);

void DeformConvBackwardParametersCUDAKernelLauncher(
    const DArrayLite input, const DArrayLite offset,
    const DArrayLite gradOutput, DArrayLite gradWeight, DArrayLite columns,
    DArrayLite ones, int kW, int kH, int dW, int dH, int padW, int padH,
    int dilationW, int dilationH, int group, int deformable_group, float scale,
    int im2col_step, CudaContext& ctx, cudaStream_t stream);

void deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr,
                              const OperatorBase::in_list_t& ins,
                              OperatorBase::out_list_t& outs) {
  int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
      im2col_step;
  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("im2col_step", im2col_step)
      .done();

  const auto input = ins[0];
  const auto weight = ins[1];
  const auto offset = ins[2];
  auto output = outs[0];
  auto columns = outs[1];
  auto ones = outs[2];
  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  DeformConvForwardCUDAKernelLauncher(
      input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH,
      dilationW, dilationH, group, deformable_group, im2col_step, ctx, stream);
}

void deform_conv_backward_input_cuda(CudaContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
      im2col_step;
  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("im2col_step", im2col_step)
      .done();

  auto input = ins[0];
  auto offset = ins[1];
  auto gradOutput = ins[2];
  auto gradInput = outs[0];
  auto gradOffset = outs[1];
  auto weight = outs[2];
  auto columns = outs[3];
  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  DeformConvBackwardInputCUDAKernelLauncher(
      input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH,
      dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
      im2col_step, ctx, stream);
}

void deform_conv_backward_parameters_cuda(CudaContext& ctx,
                                          const SSElement& attr,
                                          const OperatorBase::in_list_t& ins,
                                          OperatorBase::out_list_t& outs) {
  int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
      im2col_step;
  float scale;
  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<float>("scale", scale)
      .get<int>("im2col_step", im2col_step)
      .done();

  auto input = ins[0];
  auto offset = ins[1];
  auto gradOutput = ins[2];
  auto gradWeight = outs[0];
  auto columns = outs[1];
  auto ones = outs[2];
  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  DeformConvBackwardParametersCUDAKernelLauncher(
      input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH,
      padW, padH, dilationW, dilationH, group, deformable_group, scale,
      im2col_step, ctx, stream);
}

PARROTS_EXTENSION_REGISTER(deform_conv_forward)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("im2col_step")
    .input(3)
    .output(3)
    .apply(deform_conv_forward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(deform_conv_backward_input)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("im2col_step")
    .input(3)
    .output(4)
    .apply(deform_conv_backward_input_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("scale")
    .attr("im2col_step")
    .input(3)
    .output(3)
    .apply(deform_conv_backward_parameters_cuda)
    .done();

#include "pytorch_cpp_helper.hpp"

#ifdef MMCV_WITH_CUDA
void DeformConvForwardCUDAKernelLauncher(Tensor input, Tensor weight,
                                         Tensor offset, Tensor output,
                                         Tensor columns, Tensor ones, int kW,
                                         int kH, int dW, int dH, int padW,
                                         int padH, int dilationW, int dilationH,
                                         int group, int deformable_group,
                                         int im2col_step);

void DeformConvBackwardInputCUDAKernelLauncher(
    Tensor input, Tensor offset, Tensor gradOutput, Tensor gradInput,
    Tensor gradOffset, Tensor weight, Tensor columns, int kW, int kH, int dW,
    int dH, int padW, int padH, int dilationW, int dilationH, int group,
    int deformable_group, int im2col_step);

void DeformConvBackwardParametersCUDAKernelLauncher(
    Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
    Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
    int padH, int dilationW, int dilationH, int group, int deformable_group,
    float scale, int im2col_step);

void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
                              Tensor output, Tensor columns, Tensor ones,
                              int kW, int kH, int dW, int dH, int padW,
                              int padH, int dilationW, int dilationH, int group,
                              int deformable_group, int im2col_step) {
  DeformConvForwardCUDAKernelLauncher(
      input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH,
      dilationW, dilationH, group, deformable_group, im2col_step);
}

void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
                                     Tensor gradOutput, Tensor gradInput,
                                     Tensor gradOffset, Tensor weight,
                                     Tensor columns, int kW, int kH, int dW,
                                     int dH, int padW, int padH, int dilationW,
                                     int dilationH, int group,
                                     int deformable_group, int im2col_step) {
  DeformConvBackwardInputCUDAKernelLauncher(
      input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH,
      dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
      im2col_step);
}

void deform_conv_backward_parameters_cuda(
    Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
    Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
    int padH, int dilationW, int dilationH, int group, int deformable_group,
    float scale, int im2col_step) {
  DeformConvBackwardParametersCUDAKernelLauncher(
      input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH,
      padW, padH, dilationW, dilationH, group, deformable_group, scale,
      im2col_step);
}
#endif

void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
                         Tensor output, Tensor columns, Tensor ones, int kW,
                         int kH, int dW, int dH, int padW, int padH,
                         int dilationW, int dilationH, int group,
                         int deformable_group, int im2col_step) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(output);
    CHECK_CUDA_INPUT(columns);
    CHECK_CUDA_INPUT(ones);

    deform_conv_forward_cuda(input, weight, offset, output, columns, ones, kW,
                             kH, dW, dH, padW, padH, dilationW, dilationH,
                             group, deformable_group, im2col_step);
#else
    AT_ERROR("DeformConv is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("DeformConv is not implemented on CPU");
  }
}

void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
                                Tensor gradInput, Tensor gradOffset,
                                Tensor weight, Tensor columns, int kW, int kH,
                                int dW, int dH, int padW, int padH,
                                int dilationW, int dilationH, int group,
                                int deformable_group, int im2col_step) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(gradOutput);
    CHECK_CUDA_INPUT(gradInput);
    CHECK_CUDA_INPUT(gradOffset);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(columns);

    deform_conv_backward_input_cuda(input, offset, gradOutput, gradInput,
                                    gradOffset, weight, columns, kW, kH, dW, dH,
                                    padW, padH, dilationW, dilationH, group,
                                    deformable_group, im2col_step);
#else
    AT_ERROR("DeformConv is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("DeformConv is not implemented on CPU");
  }
}

void deform_conv_backward_parameters(Tensor input, Tensor offset,
                                     Tensor gradOutput, Tensor gradWeight,
                                     Tensor columns, Tensor ones, int kW,
                                     int kH, int dW, int dH, int padW, int padH,
                                     int dilationW, int dilationH, int group,
                                     int deformable_group, float scale,
                                     int im2col_step) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(gradOutput);
    CHECK_CUDA_INPUT(gradWeight);
    CHECK_CUDA_INPUT(columns);
    CHECK_CUDA_INPUT(ones);

    deform_conv_backward_parameters_cuda(input, offset, gradOutput, gradWeight,
                                         columns, ones, kW, kH, dW, dH, padW,
                                         padH, dilationW, dilationH, group,
                                         deformable_group, scale, im2col_step);
#else
    AT_ERROR("DeformConv is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("DeformConv is not implemented on CPU");
  }
}
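The entry points above take pre-allocated output, columns and ones buffers instead of allocating them internally, so callers need the output geometry up front. That geometry is standard convolution arithmetic; a sketch (the exact columns/ones buffer shapes are set inside the kernel launchers, which this page does not show):

#include <cstdint>

// Spatial extent of a convolution output along one axis.
int64_t conv_out_size(int64_t in, int64_t k, int64_t pad, int64_t stride,
                      int64_t dilation) {
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
}
// e.g. in = 32, k = 3, pad = 1, stride = 1, dilation = 1  ->  32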
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "deform_conv_pytorch.h"
using namespace parrots;
/*void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
* Tensor output, Tensor columns, Tensor ones,
* int kW, int kH, int dW, int dH, int padW,
* int padH, int dilationW, int dilationH, int
* group, int deformable_group, int im2col_step);
*/
void deform_conv_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& weight = buildATensor(ctx, ins[1]);
const auto& offset = buildATensor(ctx, ins[2]);
auto output = buildATensor(ctx, outs[0]);
auto columns = buildATensor(ctx, outs[1]);
auto ones = buildATensor(ctx, outs[2]);
deform_conv_forward_cuda(input, weight, offset, output, columns, ones, kW, kH,
dW, dH, padW, padH, dilationW, dilationH, group,
deformable_group, im2col_step);
}
/*void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
* Tensor gradOutput, Tensor gradInput,
* Tensor gradOffset, Tensor weight,
* Tensor columns, int kW, int kH, int dW,
* int dH, int padW, int padH, int
* dilationW, int dilationH, int group, int deformable_group, int im2col_step);
*/
void deform_conv_backward_input_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& offset = buildATensor(ctx, ins[1]);
const auto& gradOutput = buildATensor(ctx, ins[2]);
auto gradInput = buildATensor(ctx, outs[0]);
auto gradOffset = buildATensor(ctx, outs[1]);
auto weight = buildATensor(ctx, outs[2]);
auto columns = buildATensor(ctx, outs[3]);
deform_conv_backward_input_cuda(input, offset, gradOutput, gradInput,
gradOffset, weight, columns, kW, kH, dW, dH,
padW, padH, dilationW, dilationH, group,
deformable_group, im2col_step);
}
/*void deform_conv_backward_parameters_cuda(
* Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
* Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
* int padH, int dilationW, int dilationH, int group, int deformable_group,
* float scale, int im2col_step);
*/
void deform_conv_backward_parameters_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
float scale;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<float>("scale", scale)
.get<int>("im2col_step", im2col_step)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& offset = buildATensor(ctx, ins[1]);
const auto& gradOutput = buildATensor(ctx, ins[2]);
auto gradWeight = buildATensor(ctx, outs[0]);
auto columns = buildATensor(ctx, outs[1]);
auto ones = buildATensor(ctx, outs[2]);
deform_conv_backward_parameters_cuda(input, offset, gradOutput, gradWeight,
columns, ones, kW, kH, dW, dH, padW,
padH, dilationW, dilationH, group,
deformable_group, scale, im2col_step);
}
PARROTS_EXTENSION_REGISTER(deform_conv_forward)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("im2col_step")
.input(3)
.output(3)
.apply(deform_conv_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(deform_conv_backward_input)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("im2col_step")
.input(3)
.output(4)
.apply(deform_conv_backward_input_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("scale")
.attr("im2col_step")
.input(3)
.output(3)
.apply(deform_conv_backward_parameters_cuda_parrots)
.done();
#ifndef DEFORM_CONV_PYTORCH_H
#define DEFORM_CONV_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
Tensor output, Tensor columns, Tensor ones,
int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step);
void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
Tensor gradOutput, Tensor gradInput,
Tensor gradOffset, Tensor weight,
Tensor columns, int kW, int kH, int dW,
int dH, int padW, int padH, int dilationW,
int dilationH, int group,
int deformable_group, int im2col_step);
void deform_conv_backward_parameters_cuda(
Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
float scale, int im2col_step);
#endif // DEFORM_CONV_PYTORCH_H
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher( #ifdef MMCV_WITH_CUDA
const DArrayLite input, const DArrayLite rois, const DArrayLite offset, void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale, Tensor offset, Tensor output,
int sampling_ratio, float gamma, cudaStream_t stream); int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma);
void DeformRoIPoolBackwardCUDAKernelLauncher( void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois, Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset, Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
int pooled_height, int pooled_width, float spatial_scale, float spatial_scale, int sampling_ratio, float gamma);
int sampling_ratio, float gamma, cudaStream_t stream);
void deform_roi_pool_forward_cuda(CudaContext& ctx, const SSElement& attr, void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
const OperatorBase::in_list_t& ins, Tensor output, int pooled_height,
OperatorBase::out_list_t& outs) { int pooled_width, float spatial_scale,
int pooled_height; int sampling_ratio, float gamma) {
int pooled_width; DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output,
float spatial_scale; pooled_height, pooled_width,
int sampling_ratio; spatial_scale, sampling_ratio, gamma);
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
const auto& offset = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolForwardCUDAKernelLauncher(
input, rois, offset, output, pooled_height, pooled_width, spatial_scale,
sampling_ratio, gamma, stream);
} }
void deform_roi_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
const OperatorBase::in_list_t& ins, Tensor rois, Tensor offset,
OperatorBase::out_list_t& outs) { Tensor grad_input, Tensor grad_offset,
int pooled_height; int pooled_height, int pooled_width,
int pooled_width; float spatial_scale, int sampling_ratio,
float spatial_scale; float gamma) {
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& grad_output = ins[0];
const auto& input = ins[1];
const auto& rois = ins[2];
const auto& offset = ins[3];
auto& grad_input = outs[0];
auto& grad_offset = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolBackwardCUDAKernelLauncher( DeformRoIPoolBackwardCUDAKernelLauncher(
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height, grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma, stream); pooled_width, spatial_scale, sampling_ratio, gamma);
} }
#endif
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(output);
PARROTS_EXTENSION_REGISTER(deform_roi_pool_forward) deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
.attr("pooled_height") pooled_width, spatial_scale, sampling_ratio,
.attr("pooled_width") gamma);
.attr("spatial_scale") #else
.attr("sampling_ratio") AT_ERROR("DeformRoIPool is not compiled with GPU support");
.attr("gamma") #endif
.input(3) } else {
.output(1) AT_ERROR("DeformRoIPool is not implemented on CPU");
.apply(deform_roi_pool_forward_cuda) }
.done(); }
PARROTS_EXTENSION_REGISTER(deform_roi_pool_backward) void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
.attr("pooled_height") Tensor offset, Tensor grad_input,
.attr("pooled_width") Tensor grad_offset, int pooled_height,
.attr("spatial_scale") int pooled_width, float spatial_scale,
.attr("sampling_ratio") int sampling_ratio, float gamma) {
.attr("gamma") if (grad_output.device().is_cuda()) {
.input(4) #ifdef MMCV_WITH_CUDA
.output(2) CHECK_CUDA_INPUT(grad_output);
.apply(deform_roi_pool_backward_cuda) CHECK_CUDA_INPUT(input);
.done(); CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_offset);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
}
#include "deform_roi_pool_cuda_kernel.cuh" #include "deform_roi_pool_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher( void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
const DArrayLite input, const DArrayLite rois, const DArrayLite offset, Tensor offset, Tensor output,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale, int pooled_height, int pooled_width,
int sampling_ratio, float gamma, cudaStream_t stream) { float spatial_scale,
int output_size = output.size(); int sampling_ratio, float gamma) {
int channels = input.dim(1); int output_size = output.numel();
int height = input.dim(2); int channels = input.size(1);
int width = input.dim(3); int height = input.size(2);
int width = input.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "deform_roi_pool_forward_cuda_kernel", [&] {
deform_roi_pool_forward_cuda_kernel<scalar_t> deform_roi_pool_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(), output_size, input.data_ptr<scalar_t>(),
offset.ptr<scalar_t>(), output.ptr<scalar_t>(), pooled_height, rois.data_ptr<scalar_t>(), offset.data_ptr<scalar_t>(),
pooled_width, spatial_scale, sampling_ratio, gamma, channels, output.data_ptr<scalar_t>(), pooled_height, pooled_width,
height, width); static_cast<scalar_t>(spatial_scale), sampling_ratio,
})); static_cast<scalar_t>(gamma), channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void DeformRoIPoolBackwardCUDAKernelLauncher( void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois, Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset, Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
int pooled_height, int pooled_width, float spatial_scale, float spatial_scale, int sampling_ratio, float gamma) {
int sampling_ratio, float gamma, cudaStream_t stream) { int output_size = grad_output.numel();
int output_size = grad_output.size(); int channels = grad_input.size(1);
int channels = grad_input.dim(1); int height = grad_input.size(2);
int height = grad_input.dim(2); int width = grad_input.size(3);
int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(grad_output.device());
grad_output.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "deform_roi_pool_backward_cuda_kernel", [&] {
deform_roi_pool_backward_cuda_kernel<scalar_t> deform_roi_pool_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), input.ptr<scalar_t>(), output_size, grad_output.data_ptr<scalar_t>(),
rois.ptr<scalar_t>(), offset.ptr<scalar_t>(), input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), grad_offset.ptr<scalar_t>(), offset.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
pooled_height, pooled_width, spatial_scale, sampling_ratio, grad_offset.data_ptr<scalar_t>(), pooled_height, pooled_width,
gamma, channels, height, width); static_cast<scalar_t>(spatial_scale), sampling_ratio,
})); static_cast<scalar_t>(gamma), channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
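Both launchers size the grid as GET_BLOCKS(output_size) blocks of THREADS_PER_BLOCK threads, one thread per output element. The helper itself lives in the shared CUDA helper headers, which this page does not show; it is ordinarily a plain ceiling division, roughly:

// Sketch of the usual grid-sizing helper; the constant is an assumed value.
constexpr int THREADS_PER_BLOCK = 512;

inline int GET_BLOCKS(const int N) {
  return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;  // ceil(N / threads)
}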
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "deform_roi_pool_pytorch.h"
using namespace parrots;
/*void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
* Tensor output, int pooled_height,
* int pooled_width, float spatial_scale,
* int sampling_ratio, float gamma);
*/
void deform_roi_pool_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& rois = buildATensor(ctx, ins[1]);
const auto& offset = buildATensor(ctx, ins[2]);
auto output = buildATensor(ctx, outs[0]);
deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
pooled_width, spatial_scale, sampling_ratio,
gamma);
}
/*void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
* Tensor rois, Tensor offset,
* Tensor grad_input, Tensor grad_offset,
* int pooled_height, int pooled_width,
* float spatial_scale, int sampling_ratio,
* float gamma);
*/
void deform_roi_pool_backward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& grad_output = buildATensor(ctx, ins[0]);
const auto& input = buildATensor(ctx, ins[1]);
const auto& rois = buildATensor(ctx, ins[2]);
const auto& offset = buildATensor(ctx, ins[3]);
auto grad_input = buildATensor(ctx, outs[0]);
auto grad_offset = buildATensor(ctx, outs[1]);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
}
PARROTS_EXTENSION_REGISTER(deform_roi_pool_forward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("gamma")
.input(3)
.output(1)
.apply(deform_roi_pool_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(deform_roi_pool_backward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("gamma")
.input(4)
.output(2)
.apply(deform_roi_pool_backward_cuda_parrots)
.done();
#ifndef DEFORM_ROI_POOL_PYTORCH_H
#define DEFORM_ROI_POOL_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma);
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset,
int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma);
#endif // DEFORM_ROI_POOL_PYTORCH_H
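A hypothetical call-site sketch for the forward declaration above, assuming the usual mmcv conventions (rois given as (num_rois, 5) rows of (batch_idx, x1, y1, x2, y2), output pre-allocated per ROI); the parameter values are illustrative:

#include <torch/torch.h>

void run_deform_roi_pool(const at::Tensor &input, const at::Tensor &rois,
                         const at::Tensor &offset) {
  const int64_t num_rois = rois.size(0);
  const int64_t channels = input.size(1);
  const int pooled_h = 7, pooled_w = 7;
  // Output holds one pooled_h x pooled_w grid per ROI and channel.
  auto output =
      at::zeros({num_rois, channels, pooled_h, pooled_w}, input.options());
  deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_h, pooled_w,
                               /*spatial_scale=*/0.0625f,
                               /*sampling_ratio=*/2, /*gamma=*/0.1f);
}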
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"

void SigmoidFocalLossForwardCUDAKernelLauncher(
    const DArrayLite input, const DArrayLite target, const DArrayLite weight,
    DArrayLite output, float gamma, float alpha, cudaStream_t stream);

void SigmoidFocalLossBackwardCUDAKernelLauncher(
    const DArrayLite input, const DArrayLite target, const DArrayLite weight,
    DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream);

void SoftmaxFocalLossForwardCUDAKernelLauncher(
    const DArrayLite input, const DArrayLite target, const DArrayLite weight,
    DArrayLite output, float gamma, float alpha, cudaStream_t stream);

void SoftmaxFocalLossBackwardCUDAKernelLauncher(
    const DArrayLite input, const DArrayLite target, const DArrayLite weight,
    DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
    cudaStream_t stream);

void sigmoid_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  float gamma;
  float alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();

  // get inputs and outputs
  const auto& input = ins[0];
  const auto& target = ins[1];
  const auto& weight = ins[2];

  auto& output = outs[0];

  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha, stream);
}

void sigmoid_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  float gamma;
  float alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();

  // get inputs and outputs
  const auto& input = ins[0];
  const auto& target = ins[1];
  const auto& weight = ins[2];

  auto& grad_input = outs[0];

  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
                                             gamma, alpha, stream);
}

void softmax_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  float gamma;
  float alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();

  // get inputs and outputs
  const auto& input = ins[0];
  const auto& target = ins[1];
  const auto& weight = ins[2];

  auto& grad_input = outs[0];

  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, grad_input,
                                            gamma, alpha, stream);
}

void softmax_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  float gamma;
  float alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();

  // get inputs and outputs
  const auto& input = ins[0];
  const auto& target = ins[1];
  const auto& weight = ins[2];

  auto& buff = outs[0];
  auto& grad_input = outs[1];

  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
                                             grad_input, gamma, alpha, stream);
}

PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(sigmoid_focal_loss_forward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(sigmoid_focal_loss_backward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(softmax_focal_loss_forward_cuda)
    .done();

PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(2)
    .apply(softmax_focal_loss_backward_cuda)
    .done();

#include "pytorch_cpp_helper.hpp"

#ifdef MMCV_WITH_CUDA
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha);

void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight, Tensor buff,
                                                Tensor grad_input,
                                                const float gamma,
                                                const float alpha);

void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha);
}

void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha) {
  SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
                                             gamma, alpha);
}

void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha);
}

void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
  SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
                                             grad_input, gamma, alpha);
}
#endif

void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(target);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(output);

    sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma,
                                    alpha);
#else
    AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
  }
}

void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor grad_input, float gamma, float alpha) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(target);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(grad_input);

    sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
                                     alpha);
#else
    AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
  }
}

void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(target);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(output);

    softmax_focal_loss_forward_cuda(input, target, weight, output, gamma,
                                    alpha);
#else
    AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
  }
}

void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor buff, Tensor grad_input, float gamma,
                                 float alpha) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(target);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(buff);
    CHECK_CUDA_INPUT(grad_input);

    softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
                                     gamma, alpha);
#else
    AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
  }
}
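For reference, the quantity the kernels below implement is the focal loss of Lin et al. (2017), with the alpha and gamma attrs registered above as the balancing weight and focusing parameter:

\mathrm{FL}(p_t) = -\,\alpha\,(1 - p_t)^{\gamma}\,\log(p_t)

where p_t is the predicted probability of the ground-truth class; gamma = 0 recovers the ordinary alpha-weighted cross-entropy.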
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#include "sigmoid_focal_loss_cuda_kernel.cuh" #include "sigmoid_focal_loss_cuda_kernel.cuh"
#include "softmax_focal_loss_cuda_kernel.cuh" #include "softmax_focal_loss_cuda_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher( void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor output,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) { const float gamma,
int output_size = output.size(); const float alpha) {
int num_classes = input.dim(1); int output_size = output.numel();
int num_classes = input.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,
input.elemType().prim(), ([&] { "target label should smaller or equal than num classes");
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sigmoid_focal_loss_forward_cuda_kernel", [&] {
sigmoid_focal_loss_forward_cuda_kernel<scalar_t> sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, input.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
num_classes); output.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void SigmoidFocalLossBackwardCUDAKernelLauncher( void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, Tensor weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream) { Tensor grad_input,
int output_size = grad_input.size(); const float gamma,
int num_classes = input.dim(1); const float alpha) {
int output_size = grad_input.numel();
int num_classes = input.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(grad_input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sigmoid_focal_loss_backward_cuda_kernel", [&] {
sigmoid_focal_loss_backward_cuda_kernel<scalar_t> sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, input.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), gamma, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
alpha, num_classes); grad_input.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void SoftmaxFocalLossForwardCUDAKernelLauncher( void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor output,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) { const float gamma,
int output_size = output.size(); const float alpha) {
int num_classes = softmax.dim(1); int output_size = output.numel();
int num_classes = softmax.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,
softmax.elemType().prim(), ([&] { "target label should smaller or equal than num classes");
at::cuda::CUDAGuard device_guard(softmax.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
softmax.scalar_type(), "softmax_focal_loss_forward_cuda_kernel", [&] {
softmax_focal_loss_forward_cuda_kernel<scalar_t> softmax_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, softmax.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
num_classes); output.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void SoftmaxFocalLossBackwardCUDAKernelLauncher( void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor buff,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha, Tensor grad_input,
cudaStream_t stream) { const float gamma,
int output_size = buff.size(); const float alpha) {
int num_classes = softmax.dim(1); int num_classes = softmax.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( int output_size = buff.numel();
grad_input.elemType().prim(), ([&] { at::cuda::CUDAGuard device_guard(grad_input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.scalar_type(),
"softmax_focal_loss_backward_cuda1_"
"kernel",
[&] {
softmax_focal_loss_backward_cuda1_kernel<scalar_t> softmax_focal_loss_backward_cuda1_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, softmax.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), buff.ptr<scalar_t>(), gamma, alpha, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
num_classes); buff.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError());
output_size = grad_input.size(); AT_CUDA_CHECK(cudaGetLastError());
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( output_size = grad_input.numel();
grad_input.elemType().prim(), ([&] { AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.scalar_type(),
"softmax_focal_loss_backward_cuda2_"
"kernel",
[&] {
softmax_focal_loss_backward_cuda2_kernel<scalar_t> softmax_focal_loss_backward_cuda2_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, softmax.data_ptr<scalar_t>(),
buff.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), num_classes); target.data_ptr<int64_t>(), buff.data_ptr<scalar_t>(),
})); grad_input.data_ptr<scalar_t>(), num_classes);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
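A note on the launch configuration used by the launchers above: GET_BLOCKS and THREADS_PER_BLOCK come from the shared CUDA helper headers, which this diff does not show. The following is a minimal sketch of the conventional ceiling-division form, assuming a 512-thread block; the exact value and any cap on the grid size live in the helper header, not here.

// Sketch only: the real definitions live in the shared CUDA helper header.
// This is the standard ceiling-division grid size for a 1-D elementwise
// kernel: enough blocks so that GET_BLOCKS(N) * THREADS_PER_BLOCK >= N.
#define THREADS_PER_BLOCK 512  // assumed value; check the helper header

inline int GET_BLOCKS(const int N) {
  return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
}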
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "focal_loss_pytorch.h"
using namespace parrots;
void sigmoid_focal_loss_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = buildATensor(ctx, ins[0]);
const auto& target = buildATensor(ctx, ins[1]);
const auto& weight = buildATensor(ctx, ins[2]);
auto output = buildATensor(ctx, outs[0]);
sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma, alpha);
}
void sigmoid_focal_loss_backward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = buildATensor(ctx, ins[0]);
const auto& target = buildATensor(ctx, ins[1]);
const auto& weight = buildATensor(ctx, ins[2]);
auto grad_input = buildATensor(ctx, outs[0]);
sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
alpha);
}
void softmax_focal_loss_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = buildATensor(ctx, ins[0]);
const auto& target = buildATensor(ctx, ins[1]);
const auto& weight = buildATensor(ctx, ins[2]);
auto output = buildATensor(ctx, outs[0]);
softmax_focal_loss_forward_cuda(input, target, weight, output, gamma, alpha);
}
void softmax_focal_loss_backward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = buildATensor(ctx, ins[0]);
const auto& target = buildATensor(ctx, ins[1]);
const auto& weight = buildATensor(ctx, ins[2]);
auto buff = buildATensor(ctx, outs[0]);
auto grad_input = buildATensor(ctx, outs[1]);
softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
gamma, alpha);
}
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_backward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(softmax_focal_loss_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(2)
.apply(softmax_focal_loss_backward_cuda_parrots)
.done();
#ifndef FOCAL_LOSS_PYTORCH_H
#define FOCAL_LOSS_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha);
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
Tensor weight, Tensor grad_input,
float gamma, float alpha);
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha);
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input, float gamma,
float alpha);
#endif // FOCAL_LOSS_PYTORCH_H
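For orientation, a minimal usage sketch of the declarations above (not part of the diff). Shapes are inferred from the kernel launchers: input and output are (N, num_classes) float tensors, target is an int64 class-index tensor of shape (N,), and weight holds one float per class; the gamma/alpha values are illustrative. It assumes a CUDA build of the extension.

// Hypothetical caller, assuming focal_loss_pytorch.h and a CUDA device.
#include <torch/torch.h>

#include "focal_loss_pytorch.h"

int main() {
  const int64_t N = 8, C = 4;  // batch size, number of classes
  auto opts = torch::dtype(torch::kFloat32).device(torch::kCUDA);
  auto input = torch::randn({N, C}, opts);
  auto target = torch::randint(C, {N}, opts.dtype(torch::kInt64));
  auto weight = torch::ones({C}, opts);   // per-class loss weights
  auto output = torch::zeros({N, C}, opts);
  // gamma down-weights easy examples; alpha balances positive/negative terms.
  sigmoid_focal_loss_forward_cuda(input, target, weight, output,
                                  /*gamma=*/2.0f, /*alpha=*/0.25f);
  return 0;
}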
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher( #ifdef MMCV_WITH_CUDA
const DArrayLite bottom_data, const DArrayLite mask_h_idx, void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h, const Tensor mask_h_idx,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream); const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w);
void MaskedCol2imForwardCUDAKernelLaucher(const DArrayLite bottom_data, void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
const DArrayLite mask_h_idx, const Tensor mask_h_idx,
const DArrayLite mask_w_idx, const Tensor mask_w_idx,
DArrayLite top_data, const int height, Tensor top_data, const int height,
const int width, const int channels, const int width, const int channels);
cudaStream_t stream);
void masked_im2col_forward_cuda(CudaContext& ctx, const SSElement& attr, void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
const OperatorBase::in_list_t& ins, const Tensor mask_w_idx, Tensor col,
OperatorBase::out_list_t& outs) { const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
// im: (n, ic, h, w), kernel size (kh, kw) // im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh) // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& col = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col, MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w, kernel_h, kernel_w, pad_h, pad_w);
stream);
} }
void masked_col2im_forward_cuda(CudaContext& ctx, const SSElement& attr, void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
const OperatorBase::in_list_t& ins, const Tensor mask_w_idx, Tensor im, int height,
OperatorBase::out_list_t& outs) { int width, int channels) {
// im: (n, ic, h, w), kernel size (kh, kw) // im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh) // kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh)
int height, width, channels; MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
SSAttrs(attr) width, channels);
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& im = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedCol2imForwardCUDAKernelLaucher(col, mask_h_idx, mask_w_idx, im, height,
width, channels, stream);
} }
#endif
PARROTS_EXTENSION_REGISTER(masked_im2col_forward) void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
.attr("kernel_h") const Tensor mask_w_idx, Tensor col,
.attr("kernel_w") const int kernel_h, const int kernel_w,
.attr("pad_h") const int pad_h, const int pad_w) {
.attr("pad_w") if (im.device().is_cuda()) {
.input(3) #ifdef MMCV_WITH_CUDA
.output(1) CHECK_CUDA_INPUT(im);
.apply(masked_im2col_forward_cuda) CHECK_CUDA_INPUT(mask_h_idx);
.done(); CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(col);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(masked_col2im_forward) void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
.attr("height") const Tensor mask_w_idx, Tensor im, int height,
.attr("width") int width, int channels) {
.attr("channels") if (col.device().is_cuda()) {
.input(3) #ifdef MMCV_WITH_CUDA
.output(1) CHECK_CUDA_INPUT(col);
.apply(masked_col2im_forward_cuda) CHECK_CUDA_INPUT(mask_h_idx);
.done(); CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(im);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
}
#include "masked_conv2d_cuda_kernel.cuh" #include "masked_conv2d_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher( void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const DArrayLite bottom_data, const DArrayLite mask_h_idx, const Tensor mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h, const Tensor mask_w_idx,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream) { Tensor top_data, const int kernel_h,
int channels = bottom_data.dim(1); const int kernel_w, const int pad_h,
int height = bottom_data.dim(2); const int pad_w) {
int width = bottom_data.dim(3); int channels = bottom_data.size(1);
int mask_cnt = mask_h_idx.dim(0); int height = bottom_data.size(2);
int width = bottom_data.size(3);
int mask_cnt = mask_h_idx.size(0);
int output_size = mask_cnt * channels; int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(bottom_data.device());
bottom_data.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] {
const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
MaskedIm2colForward<scalar_t> MaskedIm2colForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width, output_size, bottom_data_, height, width, kernel_h, kernel_w,
kernel_h, kernel_w, pad_h, pad_w, mask_h_idx.ptr<int64_t>(), pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);
mask_w_idx.ptr<int64_t>(), mask_cnt, top_data.ptr<scalar_t>());
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void MaskedCol2imForwardCUDAKernelLaucher(const DArrayLite bottom_data, void MaskedCol2imForwardCUDAKernelLauncher(
const DArrayLite mask_h_idx, const Tensor bottom_data, const Tensor mask_h_idx, const Tensor mask_w_idx,
const DArrayLite mask_w_idx, Tensor top_data, const int height, const int width, const int channels) {
DArrayLite top_data, const int height, int mask_cnt = mask_h_idx.size(0);
const int width, const int channels,
cudaStream_t stream) {
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels; int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(bottom_data.device());
bottom_data.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] {
const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
MaskedCol2imForward<scalar_t> MaskedCol2imForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width, output_size, bottom_data_, height, width, channels, mask_h_idx_,
channels, mask_h_idx.ptr<int64_t>(), mask_w_idx.ptr<int64_t>(), mask_w_idx_, mask_cnt, top_data_);
mask_cnt, top_data.ptr<scalar_t>());
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "masked_conv2d_pytorch.h"
using namespace parrots;
void masked_im2col_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = buildATensor(ctx, ins[0]);
const auto& mask_h_idx = buildATensor(ctx, ins[1]);
const auto& mask_w_idx = buildATensor(ctx, ins[2]);
auto col = buildATensor(ctx, outs[0]);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
}
void masked_col2im_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh)
int height, width, channels;
SSAttrs(attr)
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = buildATensor(ctx, ins[0]);
const auto& mask_h_idx = buildATensor(ctx, ins[1]);
const auto& mask_w_idx = buildATensor(ctx, ins[2]);
auto im = buildATensor(ctx, outs[0]);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
}
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("pad_h")
.attr("pad_w")
.input(3)
.output(1)
.apply(masked_im2col_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
.attr("height")
.attr("width")
.attr("channels")
.input(3)
.output(1)
.apply(masked_col2im_forward_cuda_parrots)
.done();
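A minimal usage sketch of the masked_conv2d entry points (not part of the diff), assuming the shapes implied by the launchers above: im is (1, ic, h, w); mask_h_idx and mask_w_idx are int64 tensors of shape (mask_cnt,) listing the sampled output positions; col is (ic * kh * kw, mask_cnt). The index values and padding below are illustrative, and a CUDA build is assumed.

// Hypothetical caller, assuming masked_conv2d_pytorch.h and a CUDA device.
#include <torch/torch.h>

#include "masked_conv2d_pytorch.h"

int main() {
  const int64_t ic = 3, h = 16, w = 16;
  const int kh = 3, kw = 3;
  auto opts = torch::dtype(torch::kFloat32).device(torch::kCUDA);
  auto im = torch::randn({1, ic, h, w}, opts);
  // a handful of (y, x) output locations covered by the mask
  auto mask_h_idx = torch::tensor({2, 5, 9}, opts.dtype(torch::kInt64));
  auto mask_w_idx = torch::tensor({3, 7, 4}, opts.dtype(torch::kInt64));
  const int64_t mask_cnt = mask_h_idx.size(0);
  auto col = torch::zeros({ic * kh * kw, mask_cnt}, opts);
  // pad of 1 keeps a 3x3 window in bounds at the sampled locations
  masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col,
                             /*kernel_h=*/kh, /*kernel_w=*/kw,
                             /*pad_h=*/1, /*pad_w=*/1);
  return 0;
}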