"composable_kernel/include/utility/array.hpp" did not exist on "0a2657312ec62a65e92a36cebd7d3b2a3c0712e1"
Unverified Commit 48d99025 authored by z55250825's avatar z55250825 Committed by GitHub
Browse files

Add new parrots extension implementation for all ops (#794)

* delete all parrots file
add bbox_overlaps new parrots op impl

* support first new parrots op impl (bbox_overlaps) (test succeeded)

* add box_iou_rotated op, test succeed

* add carafe and carafe_naive op, test succeed (one parrots bug need fix)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test success (but has question)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
parent 72e4cc12
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "cc_attention_pytorch.h"
using namespace parrots;
/*void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight);*/
void ca_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                             const OperatorBase::in_list_t &ins,
                             OperatorBase::out_list_t &outs) {
  // Bridge: wrap the parrots arrays as ATen tensors and invoke the
  // Criss-Cross attention forward launcher.
  // ins: (t, f); outs: (weight).
  auto t = buildATensor(ctx, ins[0]);
  auto f = buildATensor(ctx, ins[1]);
  auto weight = buildATensor(ctx, outs[0]);
  ca_forward_cuda(t, f, weight);
}
/* void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
* Tensor dt, Tensor df)
*/
void ca_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                              const OperatorBase::in_list_t &ins,
                              OperatorBase::out_list_t &outs) {
  // Bridge to the Criss-Cross attention backward launcher.
  // ins: (dw, t, f); outs: (dt, df).
  auto dw = buildATensor(ctx, ins[0]);
  auto t = buildATensor(ctx, ins[1]);
  auto f = buildATensor(ctx, ins[2]);
  auto dt = buildATensor(ctx, outs[0]);
  auto df = buildATensor(ctx, outs[1]);
  ca_backward_cuda(dw, t, f, dt, df);
}
/* void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out); */
void ca_map_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                 const OperatorBase::in_list_t &ins,
                                 OperatorBase::out_list_t &outs) {
  // Bridge to the attention-map forward launcher.
  // ins: (weight, g); outs: (out).
  auto weight = buildATensor(ctx, ins[0]);
  auto g = buildATensor(ctx, ins[1]);
  auto out = buildATensor(ctx, outs[0]);
  ca_map_forward_cuda(weight, g, out);
}
/* void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
* const Tensor g, Tensor dw, Tensor dg);
*/
void ca_map_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                  const OperatorBase::in_list_t &ins,
                                  OperatorBase::out_list_t &outs) {
  // Bridge to the attention-map backward launcher.
  // ins: (dout, weight, g); outs: (dw, dg).
  auto dout = buildATensor(ctx, ins[0]);
  auto weight = buildATensor(ctx, ins[1]);
  auto g = buildATensor(ctx, ins[2]);
  auto dw = buildATensor(ctx, outs[0]);
  auto dg = buildATensor(ctx, outs[1]);
  ca_map_backward_cuda(dout, weight, g, dw, dg);
}
// Register the Criss-Cross attention ops with the parrots runtime.
// The input/output arities below must match the ins/outs indexing used by
// the corresponding *_parrots wrapper above.

// ca_forward: ins (t, f) -> outs (weight)
PARROTS_EXTENSION_REGISTER(ca_forward)
    .input(2)
    .output(1)
    .apply(ca_forward_cuda_parrots)
    .done();

// ca_backward: ins (dw, t, f) -> outs (dt, df)
PARROTS_EXTENSION_REGISTER(ca_backward)
    .input(3)
    .output(2)
    .apply(ca_backward_cuda_parrots)
    .done();

// ca_map_forward: ins (weight, g) -> outs (out)
PARROTS_EXTENSION_REGISTER(ca_map_forward)
    .input(2)
    .output(1)
    .apply(ca_map_forward_cuda_parrots)
    .done();

// ca_map_backward: ins (dout, weight, g) -> outs (dw, dg)
PARROTS_EXTENSION_REGISTER(ca_map_backward)
    .input(3)
    .output(2)
    .apply(ca_map_backward_cuda_parrots)
    .done();
#ifndef CC_ATTENTION_PYTORCH_H
#define CC_ATTENTION_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Criss-Cross attention CUDA launchers (implemented elsewhere; only the
// declarations are visible here). Tensors passed by value are ATen handles,
// so the callee still writes into the caller's storage for the non-const
// output arguments.

// Forward: consumes t and f, writes the attention weights into `weight`.
void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight);
// Backward of ca_forward: consumes dw, t, f; writes gradients dt and df.
void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
                      Tensor dt, Tensor df);
// Forward of the map step: consumes weight and g, writes `out`.
void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out);
// Backward of the map step: consumes dout, weight, g; writes dw and dg.
void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
                          const Tensor g, Tensor dw, Tensor dg);
#endif  // CC_ATTENTION_PYTORCH_H
// Modified from // Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src // https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void bottom_pool_forward_cuda(CudaContext& ctx, const SSElement& attr, Tensor bottom_pool_forward(Tensor input) {
const OperatorBase::in_list_t& ins, // Initialize output
OperatorBase::out_list_t& outs) {} Tensor output = at::zeros_like(input);
// Get height
void bottom_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, int64_t height = input.size(2);
const OperatorBase::in_list_t& ins, output.copy_(input);
OperatorBase::out_list_t& outs) {}
for (int64_t ind = 1; ind < height; ind <<= 1) {
void top_pool_forward_cuda(CudaContext& ctx, const SSElement& attr, Tensor max_temp = at::slice(output, 2, ind, height);
const OperatorBase::in_list_t& ins, Tensor cur_temp = at::slice(output, 2, ind, height).clone();
OperatorBase::out_list_t& outs) {} Tensor next_temp = at::slice(output, 2, 0, height - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
void top_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, }
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {} return output;
}
void left_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins, Tensor bottom_pool_backward(Tensor input, Tensor grad_output) {
OperatorBase::out_list_t& outs) {} auto output = at::zeros_like(input);
void left_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, int32_t batch = input.size(0);
const OperatorBase::in_list_t& ins, int32_t channel = input.size(1);
OperatorBase::out_list_t& outs) {} int32_t height = input.size(2);
int32_t width = input.size(3);
void right_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins, auto max_val = torch::zeros({batch, channel, width},
OperatorBase::out_list_t& outs) {} at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
void right_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, at::device(at::kCUDA).dtype(at::kLong));
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {} auto input_temp = input.select(2, 0);
max_val.copy_(input_temp);
PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
.input(1) max_ind.fill_(0);
.output(1)
.apply(bottom_pool_forward_cuda) auto output_temp = output.select(2, 0);
.done(); auto grad_output_temp = grad_output.select(2, 0);
output_temp.copy_(grad_output_temp);
PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
.input(2) auto un_max_ind = max_ind.unsqueeze(2);
.output(1) auto gt_mask = torch::zeros({batch, channel, width},
.apply(bottom_pool_backward_cuda) at::device(at::kCUDA).dtype(at::kBool));
.done(); auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
PARROTS_EXTENSION_REGISTER(top_pool_forward) for (int32_t ind = 0; ind < height - 1; ++ind) {
.input(1) input_temp = input.select(2, ind + 1);
.output(1) at::gt_out(gt_mask, input_temp, max_val);
.apply(top_pool_forward_cuda)
.done(); at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
PARROTS_EXTENSION_REGISTER(top_pool_backward) max_ind.masked_fill_(gt_mask, ind + 1);
.input(2)
.output(1) grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2);
.apply(top_pool_backward_cuda) output.scatter_add_(2, un_max_ind, grad_output_temp);
.done(); }
PARROTS_EXTENSION_REGISTER(left_pool_forward) return output;
.input(1) }
.output(1)
.apply(left_pool_forward_cuda) Tensor left_pool_forward(Tensor input) {
.done(); // Initialize output
Tensor output = at::zeros_like(input);
PARROTS_EXTENSION_REGISTER(left_pool_backward) // Get width
.input(2) int64_t width = input.size(3);
.output(1) output.copy_(input);
.apply(left_pool_backward_cuda)
.done(); for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, 0, width - ind);
PARROTS_EXTENSION_REGISTER(right_pool_forward) Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone();
.input(1) Tensor next_temp = at::slice(output, 3, ind, width).clone();
.output(1) at::max_out(max_temp, cur_temp, next_temp);
.apply(right_pool_forward_cuda) }
.done();
return output;
PARROTS_EXTENSION_REGISTER(right_pool_backward) }
.input(2)
.output(1) Tensor left_pool_backward(Tensor input, Tensor grad_output) {
.apply(right_pool_backward_cuda) auto output = at::zeros_like(input);
.done();
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, width - 1);
max_val.copy_(input_temp);
max_ind.fill_(width - 1);
auto output_temp = output.select(3, width - 1);
auto grad_output_temp = grad_output.select(3, width - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < width; ++ind) {
input_temp = input.select(3, width - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, width - ind - 1);
grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor right_pool_forward(Tensor input) {
  // Running-max scan along the width axis (dim 3), left to right, done with
  // O(log W) doubling steps: after step s every column holds the max over a
  // window of the preceding columns twice as wide as before.
  Tensor output = at::zeros_like(input);
  const int64_t width = input.size(3);
  output.copy_(input);

  for (int64_t step = 1; step < width; step <<= 1) {
    // Destination view: columns [step, width) of the running result.
    Tensor dst = at::slice(output, 3, step, width);
    // Clone both operands so max_out never reads storage it is writing.
    Tensor lhs = dst.clone();
    Tensor rhs = at::slice(output, 3, 0, width - step).clone();
    at::max_out(dst, lhs, rhs);
  }

  return output;
}
// Backward pass of right_pool: each grad_output element is routed to the
// input column that produced the running maximum in the forward scan.
// NOTE(review): the temporaries are allocated as kFloat on kCUDA regardless
// of input dtype/device — this assumes float CUDA tensors; confirm callers.
Tensor right_pool_backward(Tensor input, Tensor grad_output) {
  Tensor output = at::zeros_like(input);

  int32_t batch = input.size(0);
  int32_t channel = input.size(1);
  int32_t height = input.size(2);
  int32_t width = input.size(3);

  // Per-(b, c, h) running maximum value and the column index holding it.
  auto max_val = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kFloat));
  auto max_ind = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kLong));

  // Seed the scan with column 0; its gradient passes straight through.
  auto input_temp = input.select(3, 0);
  max_val.copy_(input_temp);
  max_ind.fill_(0);

  auto output_temp = output.select(3, 0);
  auto grad_output_temp = grad_output.select(3, 0);
  output_temp.copy_(grad_output_temp);

  // un_max_ind is a view of max_ind, so scatter_add_ below always sees the
  // current argmax after masked_fill_ updates it.
  auto un_max_ind = max_ind.unsqueeze(3);
  auto gt_mask = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kBool));
  auto max_temp = torch::zeros({batch, channel, height},
                               at::device(at::kCUDA).dtype(at::kFloat));
  // Sweep left to right: where column ind+1 strictly exceeds the running
  // max, update the max value/index, then add that column's grad_output at
  // the (possibly updated) argmax position.
  for (int32_t ind = 0; ind < width - 1; ++ind) {
    input_temp = input.select(3, ind + 1);
    at::gt_out(gt_mask, input_temp, max_val);

    at::masked_select_out(max_temp, input_temp, gt_mask);
    max_val.masked_scatter_(gt_mask, max_temp);
    max_ind.masked_fill_(gt_mask, ind + 1);

    grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3);
    output.scatter_add_(3, un_max_ind, grad_output_temp);
  }

  return output;
}
Tensor top_pool_forward(Tensor input) {
  // Running-max scan along the height axis (dim 2), bottom to top, using
  // O(log H) doubling steps over ever-wider windows.
  Tensor output = at::zeros_like(input);
  const int64_t height = input.size(2);
  output.copy_(input);

  for (int64_t step = 1; step < height; step <<= 1) {
    // Destination view: rows [0, height - step) of the running result.
    Tensor dst = at::slice(output, 2, 0, height - step);
    // Clone both operands so max_out never reads storage it is writing.
    Tensor lhs = dst.clone();
    Tensor rhs = at::slice(output, 2, step, height).clone();
    at::max_out(dst, lhs, rhs);
  }

  return output;
}
// Backward pass of top_pool: each grad_output element is routed to the input
// row that produced the running maximum in the forward scan (which proceeds
// from the bottom row upward).
// NOTE(review): the temporaries are allocated as kFloat on kCUDA regardless
// of input dtype/device — this assumes float CUDA tensors; confirm callers.
Tensor top_pool_backward(Tensor input, Tensor grad_output) {
  auto output = at::zeros_like(input);

  int32_t batch = input.size(0);
  int32_t channel = input.size(1);
  int32_t height = input.size(2);
  int32_t width = input.size(3);

  // Per-(b, c, w) running maximum value and the row index holding it.
  auto max_val = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kFloat));
  auto max_ind = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kLong));

  // Seed the scan with the bottom row; its gradient passes straight through.
  auto input_temp = input.select(2, height - 1);
  max_val.copy_(input_temp);
  max_ind.fill_(height - 1);

  auto output_temp = output.select(2, height - 1);
  auto grad_output_temp = grad_output.select(2, height - 1);
  output_temp.copy_(grad_output_temp);

  // un_max_ind is a view of max_ind, so scatter_add_ below always sees the
  // current argmax after masked_fill_ updates it.
  auto un_max_ind = max_ind.unsqueeze(2);
  auto gt_mask = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kBool));
  auto max_temp = torch::zeros({batch, channel, width},
                               at::device(at::kCUDA).dtype(at::kFloat));
  // Sweep upward: where row (height - ind - 1) strictly exceeds the running
  // max, update the max value/index, then add that row's grad_output at the
  // (possibly updated) argmax position.
  for (int32_t ind = 1; ind < height; ++ind) {
    input_temp = input.select(2, height - ind - 1);
    at::gt_out(gt_mask, input_temp, max_val);

    at::masked_select_out(max_temp, input_temp, gt_mask);
    max_val.masked_scatter_(gt_mask, max_temp);
    max_ind.masked_fill_(gt_mask, height - ind - 1);

    grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2);
    output.scatter_add_(2, un_max_ind, grad_output_temp);
  }

  return output;
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "corner_pool_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void bottom_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
                                 const OperatorBase::in_list_t& ins,
                                 OperatorBase::out_list_t& outs) {
  // ins: (input); outs: (output). Bridge DArray -> ATen, run, write back.
  updateDArray(ctx, bottom_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void bottom_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
                                  const OperatorBase::in_list_t& ins,
                                  OperatorBase::out_list_t& outs) {
  // ins: (input, grad_output); outs: (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, bottom_pool_backward(input, grad_output), outs[0]);
}
void left_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
                               const OperatorBase::in_list_t& ins,
                               OperatorBase::out_list_t& outs) {
  // ins: (input); outs: (output). Bridge DArray -> ATen, run, write back.
  updateDArray(ctx, left_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void left_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
                                const OperatorBase::in_list_t& ins,
                                OperatorBase::out_list_t& outs) {
  // ins: (input, grad_output); outs: (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, left_pool_backward(input, grad_output), outs[0]);
}
void right_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
                                const OperatorBase::in_list_t& ins,
                                OperatorBase::out_list_t& outs) {
  // ins: (input); outs: (output). Bridge DArray -> ATen, run, write back.
  updateDArray(ctx, right_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void right_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
                                 const OperatorBase::in_list_t& ins,
                                 OperatorBase::out_list_t& outs) {
  // ins: (input, grad_output); outs: (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, right_pool_backward(input, grad_output), outs[0]);
}
void top_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
                              const OperatorBase::in_list_t& ins,
                              OperatorBase::out_list_t& outs) {
  // ins: (input); outs: (output). Bridge DArray -> ATen, run, write back.
  updateDArray(ctx, top_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void top_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
                               const OperatorBase::in_list_t& ins,
                               OperatorBase::out_list_t& outs) {
  // ins: (input, grad_output); outs: (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, top_pool_backward(input, grad_output), outs[0]);
}
#endif
void bottom_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input) -> outs (output).
  updateDArray(ctx, bottom_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void bottom_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input, grad_output) -> outs (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, bottom_pool_backward(input, grad_output), outs[0]);
}
void left_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                   const OperatorBase::in_list_t& ins,
                                   OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input) -> outs (output).
  updateDArray(ctx, left_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void left_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input, grad_output) -> outs (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, left_pool_backward(input, grad_output), outs[0]);
}
void right_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input) -> outs (output).
  updateDArray(ctx, right_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void right_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input, grad_output) -> outs (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, right_pool_backward(input, grad_output), outs[0]);
}
void top_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                  const OperatorBase::in_list_t& ins,
                                  OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input) -> outs (output).
  updateDArray(ctx, top_pool_forward(buildATensor(ctx, ins[0])), outs[0]);
}
void top_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
                                   const OperatorBase::in_list_t& ins,
                                   OperatorBase::out_list_t& outs) {
  // CPU variant: ins (input, grad_output) -> outs (grad_input).
  auto input = buildATensor(ctx, ins[0]);
  auto grad_output = buildATensor(ctx, ins[1]);
  updateDArray(ctx, top_pool_backward(input, grad_output), outs[0]);
}
// Register the corner-pool ops with the parrots runtime.
// Forward ops take ins (input) -> outs (output); backward ops take
// ins (input, grad_output) -> outs (grad_input).
// When built with CUDA, the CUDA apply is registered in addition to the
// CPU apply; otherwise only the CPU apply is available.
PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
    .input(1)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(bottom_pool_forward_parrots)
#endif
    .apply(bottom_pool_forward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
    .input(2)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(bottom_pool_backward_parrots)
#endif
    .apply(bottom_pool_backward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(top_pool_forward)
    .input(1)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(top_pool_forward_parrots)
#endif
    .apply(top_pool_forward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(top_pool_backward)
    .input(2)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(top_pool_backward_parrots)
#endif
    .apply(top_pool_backward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(left_pool_forward)
    .input(1)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(left_pool_forward_parrots)
#endif
    .apply(left_pool_forward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(left_pool_backward)
    .input(2)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(left_pool_backward_parrots)
#endif
    .apply(left_pool_backward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(right_pool_forward)
    .input(1)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(right_pool_forward_parrots)
#endif
    .apply(right_pool_forward_parrots_cpu)
    .done();

PARROTS_EXTENSION_REGISTER(right_pool_backward)
    .input(2)
    .output(1)
#ifdef MMCV_WITH_CUDA
    .apply(right_pool_backward_parrots)
#endif
    .apply(right_pool_backward_parrots_cpu)
    .done();
#ifndef CORNER_POOL_PYTORCH_H
#define CORNER_POOL_PYTORCH_H
#include <torch/extension.h>

// Corner-pool ops (CornerNet): each forward computes a directional running
// max over height (top/bottom) or width (left/right) of an NCHW tensor and
// returns a new tensor; each backward routes grad_output to the argmax
// positions and returns the gradient w.r.t. the input.
at::Tensor bottom_pool_forward(at::Tensor input);
at::Tensor bottom_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor left_pool_forward(at::Tensor input);
at::Tensor left_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor right_pool_forward(at::Tensor input);
at::Tensor right_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor top_pool_forward(at::Tensor input);
at::Tensor top_pool_backward(at::Tensor input, at::Tensor grad_output);
#endif  // CORNER_POOL_PYTORCH_H
// Copyright (c) 2018, SenseTime. #include "pytorch_cpp_helper.hpp"
#include "parrots_cpp_helper.hpp"
void DeformConvForwardCUDAKernelLauncher( #ifdef MMCV_WITH_CUDA
const DArrayLite input, const DArrayLite weight, const DArrayLite offset, void DeformConvForwardCUDAKernelLauncher(Tensor input, Tensor weight,
DArrayLite output, DArrayLite columns, DArrayLite ones, int kW, int kH, Tensor offset, Tensor output,
int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, Tensor columns, Tensor ones, int kW,
int deformable_group, int im2col_step, CudaContext& ctx, int kH, int dW, int dH, int padW,
cudaStream_t stream); int padH, int dilationW, int dilationH,
int group, int deformable_group,
int im2col_step);
void DeformConvBackwardInputCUDAKernelLauncher( void DeformConvBackwardInputCUDAKernelLauncher(
const DArrayLite input, const DArrayLite offset, Tensor input, Tensor offset, Tensor gradOutput, Tensor gradInput,
const DArrayLite gradOutput, DArrayLite gradInput, DArrayLite gradOffset, Tensor gradOffset, Tensor weight, Tensor columns, int kW, int kH, int dW,
DArrayLite weight, DArrayLite columns, int kW, int kH, int dW, int dH, int dH, int padW, int padH, int dilationW, int dilationH, int group,
int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step);
int deformable_group, int im2col_step, CudaContext& ctx,
cudaStream_t stream);
void DeformConvBackwardParametersCUDAKernelLauncher( void DeformConvBackwardParametersCUDAKernelLauncher(
const DArrayLite input, const DArrayLite offset, Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
const DArrayLite gradOutput, DArrayLite gradWeight, DArrayLite columns, Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
DArrayLite ones, int kW, int kH, int dW, int dH, int padW, int padH, int padH, int dilationW, int dilationH, int group, int deformable_group,
int dilationW, int dilationH, int group, int deformable_group, float scale, float scale, int im2col_step);
int im2col_step, CudaContext& ctx, cudaStream_t stream);
void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
void deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr, Tensor output, Tensor columns, Tensor ones,
const OperatorBase::in_list_t& ins, int kW, int kH, int dW, int dH, int padW,
OperatorBase::out_list_t& outs) { int padH, int dilationW, int dilationH, int group,
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, int deformable_group, int im2col_step) {
im2col_step;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
const auto input = ins[0];
const auto weight = ins[1];
const auto offset = ins[2];
auto output = outs[0];
auto columns = outs[1];
auto ones = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvForwardCUDAKernelLauncher( DeformConvForwardCUDAKernelLauncher(
input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH, input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH,
dilationW, dilationH, group, deformable_group, im2col_step, ctx, stream); dilationW, dilationH, group, deformable_group, im2col_step);
} }
void deform_conv_backward_input_cuda(CudaContext& ctx, const SSElement& attr, void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
const OperatorBase::in_list_t& ins, Tensor gradOutput, Tensor gradInput,
OperatorBase::out_list_t& outs) { Tensor gradOffset, Tensor weight,
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, Tensor columns, int kW, int kH, int dW,
im2col_step; int dH, int padW, int padH, int dilationW,
SSAttrs(attr) int dilationH, int group,
.get<int>("kW", kW) int deformable_group, int im2col_step) {
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
auto input = ins[0];
auto offset = ins[1];
auto gradOutput = ins[2];
auto gradInput = outs[0];
auto gradOffset = outs[1];
auto weight = outs[2];
auto columns = outs[3];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvBackwardInputCUDAKernelLauncher( DeformConvBackwardInputCUDAKernelLauncher(
input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH, input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH,
dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step, ctx, stream); im2col_step);
} }
void deform_conv_backward_parameters_cuda(CudaContext& ctx, void deform_conv_backward_parameters_cuda(
const SSElement& attr, Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
const OperatorBase::in_list_t& ins, Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
OperatorBase::out_list_t& outs) { int padH, int dilationW, int dilationH, int group, int deformable_group,
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, float scale, int im2col_step) {
im2col_step;
float scale;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<float>("scale", scale)
.get<int>("im2col_step", im2col_step)
.done();
auto input = ins[0];
auto offset = ins[1];
auto gradOutput = ins[2];
auto gradWeight = outs[0];
auto columns = outs[1];
auto ones = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvBackwardParametersCUDAKernelLauncher( DeformConvBackwardParametersCUDAKernelLauncher(
input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH, input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH,
padW, padH, dilationW, dilationH, group, deformable_group, scale, padW, padH, dilationW, dilationH, group, deformable_group, scale,
im2col_step, ctx, stream); im2col_step);
}
#endif
void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
Tensor output, Tensor columns, Tensor ones, int kW,
int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, int im2col_step) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(ones);
deform_conv_forward_cuda(input, weight, offset, output, columns, ones, kW,
kH, dW, dH, padW, padH, dilationW, dilationH,
group, deformable_group, im2col_step);
#else
AT_ERROR("DeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformConv is not implemented on CPU");
}
} }
PARROTS_EXTENSION_REGISTER(deform_conv_forward) void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
.attr("kW") Tensor gradInput, Tensor gradOffset,
.attr("kH") Tensor weight, Tensor columns, int kW, int kH,
.attr("dW") int dW, int dH, int padW, int padH,
.attr("dH") int dilationW, int dilationH, int group,
.attr("padW") int deformable_group, int im2col_step) {
.attr("padH") if (input.device().is_cuda()) {
.attr("dilationW") #ifdef MMCV_WITH_CUDA
.attr("dilationH") CHECK_CUDA_INPUT(input);
.attr("group") CHECK_CUDA_INPUT(offset);
.attr("deformable_group") CHECK_CUDA_INPUT(gradOutput);
.attr("im2col_step") CHECK_CUDA_INPUT(gradInput);
.input(3) CHECK_CUDA_INPUT(gradOffset);
.output(3) CHECK_CUDA_INPUT(weight);
.apply(deform_conv_forward_cuda) CHECK_CUDA_INPUT(columns);
.done();
deform_conv_backward_input_cuda(input, offset, gradOutput, gradInput,
PARROTS_EXTENSION_REGISTER(deform_conv_backward_input) gradOffset, weight, columns, kW, kH, dW, dH,
.attr("kW") padW, padH, dilationW, dilationH, group,
.attr("kH") deformable_group, im2col_step);
.attr("dW") #else
.attr("dH") AT_ERROR("DeformConv is not compiled with GPU support");
.attr("padW") #endif
.attr("padH") } else {
.attr("dilationW") AT_ERROR("DeformConv is not implemented on CPU");
.attr("dilationH") }
.attr("group") }
.attr("deformable_group")
.attr("im2col_step")
.input(3)
.output(4)
.apply(deform_conv_backward_input_cuda)
.done();
PARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters) void deform_conv_backward_parameters(Tensor input, Tensor offset,
.attr("kW") Tensor gradOutput, Tensor gradWeight,
.attr("kH") Tensor columns, Tensor ones, int kW,
.attr("dW") int kH, int dW, int dH, int padW, int padH,
.attr("dH") int dilationW, int dilationH, int group,
.attr("padW") int deformable_group, float scale,
.attr("padH") int im2col_step) {
.attr("dilationW") if (input.device().is_cuda()) {
.attr("dilationH") #ifdef MMCV_WITH_CUDA
.attr("group") CHECK_CUDA_INPUT(input);
.attr("deformable_group") CHECK_CUDA_INPUT(offset);
.attr("scale") CHECK_CUDA_INPUT(gradOutput);
.attr("im2col_step") CHECK_CUDA_INPUT(gradWeight);
.input(3) CHECK_CUDA_INPUT(columns);
.output(3) CHECK_CUDA_INPUT(ones);
.apply(deform_conv_backward_parameters_cuda)
.done(); deform_conv_backward_parameters_cuda(input, offset, gradOutput, gradWeight,
columns, ones, kW, kH, dW, dH, padW,
padH, dilationW, dilationH, group,
deformable_group, scale, im2col_step);
#else
AT_ERROR("DeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformConv is not implemented on CPU");
}
}
#include "deform_conv_cuda_kernel.cuh" #include "deform_conv_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void deformable_im2col(DArrayLite data_im, DArrayLite data_offset, void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels,
const int channels, const int height, const int width, const int height, const int width, const int ksize_h,
const int ksize_h, const int ksize_w, const int pad_h, const int ksize_w, const int pad_h, const int pad_w,
const int pad_w, const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, const int parallel_imgs, const int deformable_group,
DArrayLite data_col, cudaStream_t stream) { Tensor data_col) {
// num_axes should be smaller than block size // num_axes should be smaller than block size
// todo: check parallel_imgs is correctly passed in // todo: check parallel_imgs is correctly passed in
int height_col = int height_col =
...@@ -17,28 +17,31 @@ void deformable_im2col(DArrayLite data_im, DArrayLite data_offset, ...@@ -17,28 +17,31 @@ void deformable_im2col(DArrayLite data_im, DArrayLite data_offset,
int num_kernels = channels * height_col * width_col * parallel_imgs; int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group; int channel_per_deformable_group = channels / deformable_group;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.elemType().prim(), ([&] { data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
deformable_im2col_gpu_kernel<scalar_t> const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>( const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
num_kernels, data_im.ptr<scalar_t>(), scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
data_offset.ptr<scalar_t>(), height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col,
data_col.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError()); deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels),
THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_im_, data_offset_, height, width, ksize_h,
ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col, data_col_);
}));
AT_CUDA_CHECK(cudaGetLastError());
} }
void deformable_col2im(DArrayLite data_col, DArrayLite data_offset, void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels,
const int channels, const int height, const int width, const int height, const int width, const int ksize_h,
const int ksize_h, const int ksize_w, const int pad_h, const int ksize_w, const int pad_h, const int pad_w,
const int pad_w, const int stride_h, const int stride_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, const int parallel_imgs, const int deformable_group,
DArrayLite grad_im, cudaStream_t stream) { Tensor grad_im) {
// todo: make sure parallel_imgs is passed in correctly
int height_col = int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = int width_col =
...@@ -47,27 +50,29 @@ void deformable_col2im(DArrayLite data_col, DArrayLite data_offset, ...@@ -47,27 +50,29 @@ void deformable_col2im(DArrayLite data_col, DArrayLite data_offset,
channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group; int channel_per_deformable_group = channels / deformable_group;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.elemType().prim(), ([&] { data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels),
THREADS_PER_BLOCK, 0, stream>>>( THREADS_PER_BLOCK, 0,
num_kernels, data_col.ptr<scalar_t>(), data_offset.ptr<scalar_t>(), at::cuda::getCurrentCUDAStream()>>>(
channels, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, num_kernels, data_col_, data_offset_, channels, height, width,
stride_w, dilation_h, dilation_w, channel_per_deformable_group, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
parallel_imgs, deformable_group, height_col, width_col, dilation_w, channel_per_deformable_group, parallel_imgs,
grad_im.ptr<scalar_t>()); deformable_group, height_col, width_col, grad_im_);
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void deformable_col2im_coord( void deformable_col2im_coord(
DArrayLite data_col, DArrayLite data_im, DArrayLite data_offset, Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int channels, const int height, const int width, const int ksize_h, const int height, const int width, const int ksize_h, const int ksize_w,
const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int stride_w, const int dilation_h, const int dilation_w, const int dilation_h, const int dilation_w, const int parallel_imgs,
const int parallel_imgs, const int deformable_group, DArrayLite grad_offset, const int deformable_group, Tensor grad_offset) {
cudaStream_t stream) {
int height_col = int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = int width_col =
...@@ -77,51 +82,55 @@ void deformable_col2im_coord( ...@@ -77,51 +82,55 @@ void deformable_col2im_coord(
int channel_per_deformable_group = int channel_per_deformable_group =
channels * ksize_h * ksize_w / deformable_group; channels * ksize_h * ksize_w / deformable_group;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.elemType().prim(), ([&] { data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
THREADS_PER_BLOCK, 0, stream>>>( const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
num_kernels, data_col.ptr<scalar_t>(), data_im.ptr<scalar_t>(), const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
data_offset.ptr<scalar_t>(), channels, height, width, ksize_h, scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_kernels, data_col_, data_im_, data_offset_, channels, height,
width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,
2 * ksize_h * ksize_w * deformable_group, deformable_group, 2 * ksize_h * ksize_w * deformable_group, deformable_group,
height_col, width_col, grad_offset.ptr<scalar_t>()); height_col, width_col, grad_offset_);
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void deform_conv_shape_check(DArrayLite input, DArrayLite offset, void deform_conv_shape_check(Tensor input, Tensor offset, Tensor *gradOutput,
DArrayLite* gradOutput, DArrayLite weight, int kH, Tensor weight, int kH, int kW, int dH, int dW,
int kW, int dH, int dW, int padH, int padW, int padH, int padW, int dilationH, int dilationW,
int dilationH, int dilationW, int group, int group, int deformable_group) {
int deformable_group) { TORCH_CHECK(
PARROTS_CHECKARGS(weight.ndims() == 4) weight.ndimension() == 4,
<< "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: " "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: %s",
<< weight.ndims(); weight.ndimension());
PARROTS_CHECKARGS(weight.isContiguous()) TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
<< "weight tensor has to be contiguous";
PARROTS_CHECKARGS(kW > 0 && kH > 0) TORCH_CHECK(kW > 0 && kH > 0,
<< "kernel size should be greater than zero, but got kH: " << kH "kernel size should be greater than zero, but got kH: %d kW: %d",
<< " kW: " << kW; kH, kW);
PARROTS_CHECKARGS(weight.dim(2) == kH && weight.dim(3) == kW) TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
<< "kernel size should be consistent with weight, but got kH: " << kH "kernel size should be consistent with weight, ",
<< " kW: " << kW << " weight.dim(2): " << weight.dim(2) "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d",
<< ", weight.dim(3): " << weight.dim(3); kH, kW, weight.size(2), weight.size(3));
PARROTS_CHECKARGS(dW > 0 && dH > 0) TORCH_CHECK(dW > 0 && dH > 0,
<< "stride should be greater than zero, but got dH: " << dH "stride should be greater than zero, but got dH: %d dW: %d", dH,
<< " dW: " << dW; dW);
PARROTS_CHECKARGS(dilationW > 0 && dilationH > 0) TORCH_CHECK(
<< "dilation should be greater than 0, but got dilationH: " << dilationH dilationW > 0 && dilationH > 0,
<< " dilationW: " << dilationW; "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
dilationH, dilationW);
int ndim = input.ndims(); int ndim = input.ndimension();
int dimf = 0; int dimf = 0;
int dimh = 1; int dimh = 1;
int dimw = 2; int dimw = 2;
...@@ -132,62 +141,67 @@ void deform_conv_shape_check(DArrayLite input, DArrayLite offset, ...@@ -132,62 +141,67 @@ void deform_conv_shape_check(DArrayLite input, DArrayLite offset,
dimw++; dimw++;
} }
PARROTS_CHECKARGS(ndim == 3 || ndim == 4) TORCH_CHECK(ndim == 3 || ndim == 4,
<< "3D or 4D input tensor expected but got: " << ndim; "3D or 4D input tensor expected but got: %s", ndim);
size_t nInputPlane = weight.dim(1) * group; long nInputPlane = weight.size(1) * group;
size_t inputHeight = input.dim(dimh); long inputHeight = input.size(dimh);
size_t inputWidth = input.dim(dimw); long inputWidth = input.size(dimw);
size_t nOutputPlane = weight.dim(0); long nOutputPlane = weight.size(0);
size_t outputHeight = long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
size_t outputWidth = long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
PARROTS_CHECKARGS(nInputPlane % deformable_group == 0) TORCH_CHECK(nInputPlane % deformable_group == 0,
<< "input channels must divide deformable group size"; "input channels must divide deformable group size");
PARROTS_CHECKARGS(outputWidth >= 1 || outputHeight >= 1) if (outputWidth < 1 || outputHeight < 1)
<< "Given input size: (" << nInputPlane << " x " << inputHeight << " x " AT_ERROR(
<< inputWidth << "). Calculated output size: (" << nOutputPlane << " x " "Given input size: (%ld x %ld x %ld). "
<< outputHeight << " x " << outputWidth << "). Output size is too small"; "Calculated output size: (%ld x %ld x %ld). Output size is too small",
nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
outputWidth);
PARROTS_CHECKARGS(input.dim(1) == nInputPlane) TORCH_CHECK(input.size(1) == nInputPlane,
<< "invalid number of input planes, expected: " << nInputPlane "invalid number of input planes, expected: %d, but got: %d",
<< ", but got: " << input.dim(1); nInputPlane, input.size(1));
PARROTS_CHECKARGS(inputHeight >= kH && inputWidth >= kW) TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
<< "input image is smaller than kernel"; "input image is smaller than kernel");
PARROTS_CHECKARGS(offset.dim(2) == outputHeight && TORCH_CHECK(
offset.dim(3) == outputWidth) (offset.size(2) == outputHeight && offset.size(3) == outputWidth),
<< "invalid spatial dim of offset, expected height: " << outputHeight "invalid spatial size of offset, expected height: %d width: %d, but "
<< " width: " << outputWidth << ", but got height: " << offset.dim(2) "got height: %d width: %d",
<< " width: " << offset.dim(3); outputHeight, outputWidth, offset.size(2), offset.size(3));
PARROTS_CHECKARGS(offset.dim(1) == deformable_group * 2 * kH * kW) TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
<< "invalid number of channels of offset"; "invalid number of channels of offset");
if (gradOutput != NULL) { if (gradOutput != NULL) {
PARROTS_CHECKARGS(gradOutput->dim(dimf) == nOutputPlane) TORCH_CHECK(
<< "invalid number of gradOutput planes, expected: " << nOutputPlane gradOutput->size(dimf) == nOutputPlane,
<< ", but got: " << gradOutput->dim(dimf); "invalid number of gradOutput planes, expected: %d, but got: %d",
nOutputPlane, gradOutput->size(dimf));
PARROTS_CHECKARGS(gradOutput->dim(dimh) == outputHeight &&
gradOutput->dim(dimw) == outputWidth) TORCH_CHECK(
<< "invalid dim of gradOutput, expected height: " << outputHeight (gradOutput->size(dimh) == outputHeight &&
<< " width: " << outputWidth gradOutput->size(dimw) == outputWidth),
<< " , but got height: " << gradOutput->dim(dimh) "invalid size of gradOutput, expected height: %d width: %d , but "
<< " width: " << gradOutput->dim(dimw); "got height: %d width: %d",
outputHeight, outputWidth, gradOutput->size(dimh),
gradOutput->size(dimw));
} }
} }
void DeformConvForwardCUDAKernelLauncher( void DeformConvForwardCUDAKernelLauncher(Tensor input, Tensor weight,
DArrayLite input, DArrayLite weight, DArrayLite offset, DArrayLite output, Tensor offset, Tensor output,
DArrayLite columns, DArrayLite ones, int kW, int kH, int dW, int dH, Tensor columns, Tensor ones, int kW,
int padW, int padH, int dilationW, int dilationH, int group, int kH, int dW, int dH, int padW,
int deformable_group, int im2col_step, CudaContext& ctx, int padH, int dilationW, int dilationH,
cudaStream_t stream) { int group, int deformable_group,
int im2col_step) {
// todo: resize columns to include im2col: done // todo: resize columns to include im2col: done
// todo: add im2col_step as input // todo: add im2col_step as input
// todo: add new output buffer and transpose it to output (or directly // todo: add new output buffer and transpose it to output (or directly
...@@ -196,45 +210,41 @@ void DeformConvForwardCUDAKernelLauncher( ...@@ -196,45 +210,41 @@ void DeformConvForwardCUDAKernelLauncher(
deform_conv_shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, deform_conv_shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH,
padW, dilationH, dilationW, group, deformable_group); padW, dilationH, dilationW, group, deformable_group);
at::DeviceGuard guard(input.device());
int batch = 1; int batch = 1;
if (input.ndims() == 3) { if (input.ndimension() == 3) {
// Force batch // Force batch
batch = 0; batch = 0;
input = input.view({1, input.dim(0), input.dim(1), input.dim(2)}); input.unsqueeze_(0);
offset = offset.view({1, offset.dim(0), offset.dim(1), offset.dim(2)}); offset.unsqueeze_(0);
} }
// todo: assert batchsize dividable by im2col_step // todo: assert batchsize dividable by im2col_step
size_t batchSize = input.dim(0); long batchSize = input.size(0);
size_t nInputPlane = input.dim(1); long nInputPlane = input.size(1);
size_t inputHeight = input.dim(2); long inputHeight = input.size(2);
size_t inputWidth = input.dim(3); long inputWidth = input.size(3);
size_t nOutputPlane = weight.dim(0); long nOutputPlane = weight.size(0);
size_t outputWidth = long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
size_t outputHeight = long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
PARROTS_CHECKARGS(offset.dim(0) == batchSize) TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
<< "invalid batch size of offset";
output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
outputHeight, outputWidth}); outputHeight, outputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
columns = ctx.createDArrayLite( if (ones.ndimension() != 2 ||
input.elemType(), DArrayShape(nInputPlane * kW * kH, ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
im2col_step * outputHeight * outputWidth)); ones = at::ones({outputHeight, outputWidth}, input.options());
columns.setZeros(ctx.getStream());
if (ones.ndims() != 2 ||
ones.dim(0) * ones.dim(1) < outputHeight * outputWidth) {
ones = ctx.createDArrayLite(input.elemType(),
DArrayShape(outputHeight, outputWidth));
fill(ctx, ones, *toScalar(1));
} }
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
...@@ -243,45 +253,41 @@ void DeformConvForwardCUDAKernelLauncher( ...@@ -243,45 +253,41 @@ void DeformConvForwardCUDAKernelLauncher(
offset.view({batchSize / im2col_step, im2col_step, offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth}); deformable_group * 2 * kH * kW, outputHeight, outputWidth});
auto output_buffer = ctx.createDArrayLite( Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane,
input.elemType(), DArrayShape(batchSize / im2col_step, nOutputPlane, im2col_step * outputHeight, outputWidth},
im2col_step * outputHeight, outputWidth)); output.options());
output_buffer.setZeros(ctx.getStream());
output_buffer = output_buffer.view( output_buffer = output_buffer.view(
{output_buffer.dim(0), group, output_buffer.dim(1) / group, {output_buffer.size(0), group, output_buffer.size(1) / group,
output_buffer.dim(2) * output_buffer.dim(3)}); output_buffer.size(2), output_buffer.size(3)});
for (size_t elt = 0; elt < batchSize / im2col_step; elt++) { for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns, dilationW, im2col_step, deformable_group, columns);
stream);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)}); weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight = weight.view( weight.size(2), weight.size(3)});
{group, nOutputPlane / group, nInputPlane / group * kH * kW});
for (int g = 0; g < group; g++) {
for (size_t g = 0; g < group; g++) { output_buffer[elt][g] = output_buffer[elt][g]
auto output_g = output_buffer[elt][g]; .flatten(1)
auto weight_g = weight[g]; .addmm_(weight[g].flatten(1), columns[g])
auto columns_g = columns[g]; .view_as(output_buffer[elt][g]);
gemm(ctx, 1, false, weight_g, false, columns_g, 1, output_g);
} }
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)}); columns =
weight = weight.view({nOutputPlane, nInputPlane, kH, kW}); columns.view({columns.size(0) * columns.size(1), columns.size(2)});
} }
output_buffer = output_buffer.view( output_buffer = output_buffer.view(
{output_buffer.dim(0), output_buffer.dim(1) * output_buffer.dim(2), {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
output_buffer.dim(3)}); output_buffer.size(3), output_buffer.size(4)});
output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
im2col_step, outputHeight, outputWidth}); im2col_step, outputHeight, outputWidth});
output_buffer = transpose(ctx, output_buffer, 1, 2); output_buffer.transpose_(1, 2);
if (!output_buffer.isContiguous()) { output.copy_(output_buffer);
output_buffer = ctx.cloneDArrayLite(output_buffer);
}
copy(ctx, output, output_buffer);
output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
...@@ -291,58 +297,53 @@ void DeformConvForwardCUDAKernelLauncher( ...@@ -291,58 +297,53 @@ void DeformConvForwardCUDAKernelLauncher(
if (batch == 0) { if (batch == 0) {
output = output.view({nOutputPlane, outputHeight, outputWidth}); output = output.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.dim(1), offset.dim(2), offset.dim(3)}); offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
} }
} }
void DeformConvBackwardInputCUDAKernelLauncher( void DeformConvBackwardInputCUDAKernelLauncher(
DArrayLite input, DArrayLite offset, DArrayLite gradOutput, Tensor input, Tensor offset, Tensor gradOutput, Tensor gradInput,
DArrayLite gradInput, DArrayLite gradOffset, DArrayLite weight, Tensor gradOffset, Tensor weight, Tensor columns, int kW, int kH, int dW,
DArrayLite columns, int kW, int kH, int dW, int dH, int padW, int padH, int dH, int padW, int padH, int dilationW, int dilationH, int group,
int dilationW, int dilationH, int group, int deformable_group, int deformable_group, int im2col_step) {
int im2col_step, CudaContext& ctx, cudaStream_t stream) {
deform_conv_shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, deform_conv_shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW,
padH, padW, dilationH, dilationW, group, padH, padW, dilationH, dilationW, group,
deformable_group); deformable_group);
at::DeviceGuard guard(input.device());
int batch = 1; int batch = 1;
if (input.ndims() == 3) { if (input.ndimension() == 3) {
// Force batch // Force batch
batch = 0; batch = 0;
input = input.view({1, input.dim(0), input.dim(1), input.dim(2)}); input = input.view({1, input.size(0), input.size(1), input.size(2)});
offset = offset.view({1, offset.dim(0), offset.dim(1), offset.dim(2)}); offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
gradOutput = gradOutput.view( gradOutput = gradOutput.view(
{1, gradOutput.dim(0), gradOutput.dim(1), gradOutput.dim(2)}); {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
} }
size_t batchSize = input.dim(0); long batchSize = input.size(0);
size_t nInputPlane = input.dim(1); long nInputPlane = input.size(1);
size_t inputHeight = input.dim(2); long inputHeight = input.size(2);
size_t inputWidth = input.dim(3); long inputWidth = input.size(3);
size_t nOutputPlane = weight.dim(0); long nOutputPlane = weight.size(0);
size_t outputWidth = long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
size_t outputHeight = long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
PARROTS_CHECKARGS(offset.dim(0) == batchSize) TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
<< "invalid batch size of offset";
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
columns = ctx.createDArrayLite( columns = at::zeros(
input.elemType(), DArrayShape(nInputPlane * kW * kH, {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
im2col_step * outputHeight * outputWidth)); input.options());
columns.setZeros(ctx.getStream());
// change order of grad output // change order of grad output
gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
nOutputPlane, outputHeight, outputWidth}); nOutputPlane, outputHeight, outputWidth});
gradOutput = transpose(ctx, gradOutput, 1, 2); gradOutput.transpose_(1, 2);
if (!gradOutput.isContiguous()) {
gradOutput = ctx.cloneDArrayLite(gradOutput);
}
gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
inputHeight, inputWidth}); inputHeight, inputWidth});
...@@ -355,41 +356,37 @@ void DeformConvBackwardInputCUDAKernelLauncher( ...@@ -355,41 +356,37 @@ void DeformConvBackwardInputCUDAKernelLauncher(
offset.view({batchSize / im2col_step, im2col_step, offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth}); deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (size_t elt = 0; elt < batchSize / im2col_step; elt++) { for (int elt = 0; elt < batchSize / im2col_step; elt++) {
// divide into groups // divide into groups
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.dim(0) / group, weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.dim(1) * weight.dim(2) * weight.dim(3)}); weight.size(2), weight.size(3)});
gradOutput = gradOutput.view( gradOutput = gradOutput.view(
{gradOutput.dim(0), group, gradOutput.dim(1) / group, {gradOutput.size(0), group, gradOutput.size(1) / group,
gradOutput.dim(2) * gradOutput.dim(3) * gradOutput.dim(4)}); gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
for (size_t g = 0; g < group; g++) { for (int g = 0; g < group; g++) {
auto columns_g = columns[g]; columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
gemm(ctx, 1, true, weight[g], false, gradOutput[elt][g], 0, columns_g); gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
} }
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)}); columns =
gradOutput = gradOutput.view({gradOutput.dim(0), columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradOutput.dim(1) * gradOutput.dim(2), gradOutput = gradOutput.view(
im2col_step, outputHeight, outputWidth}); {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
weight = weight.view({nOutputPlane, nInputPlane, kH, kW}); gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
dilationH, dilationW, im2col_step, deformable_group, dilationH, dilationW, im2col_step, deformable_group,
gradOffset[elt], stream); gradOffset[elt]);
deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, gradInput[elt], dilationW, im2col_step, deformable_group, gradInput[elt]);
stream);
} }
gradOutput = transpose(ctx, gradOutput, 1, 2); gradOutput.transpose_(1, 2);
if (!gradOutput.isContiguous()) {
gradOutput = ctx.cloneDArrayLite(gradOutput);
}
gradOutput = gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
...@@ -404,17 +401,17 @@ void DeformConvBackwardInputCUDAKernelLauncher( ...@@ -404,17 +401,17 @@ void DeformConvBackwardInputCUDAKernelLauncher(
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth});
gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.dim(1), offset.dim(2), offset.dim(3)}); offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
gradOffset = gradOffset.view({offset.dim(1), offset.dim(2), offset.dim(3)}); gradOffset =
gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
} }
} }
void DeformConvBackwardParametersCUDAKernelLauncher( void DeformConvBackwardParametersCUDAKernelLauncher(
DArrayLite input, DArrayLite offset, DArrayLite gradOutput, Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
DArrayLite gradWeight, DArrayLite columns, DArrayLite ones, int kW, int kH, Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int padH, int dilationW, int dilationH, int group, int deformable_group,
int deformable_group, float scale, int im2col_step, CudaContext& ctx, float scale, int im2col_step) {
cudaStream_t stream) {
// todo: transpose and reshape outGrad // todo: transpose and reshape outGrad
// todo: reshape columns // todo: reshape columns
// todo: add im2col_step as input // todo: add im2col_step as input
...@@ -422,53 +419,52 @@ void DeformConvBackwardParametersCUDAKernelLauncher( ...@@ -422,53 +419,52 @@ void DeformConvBackwardParametersCUDAKernelLauncher(
deform_conv_shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, deform_conv_shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH,
dW, padH, padW, dilationH, dilationW, group, dW, padH, padW, dilationH, dilationW, group,
deformable_group); deformable_group);
at::DeviceGuard guard(input.device());
int batch = 1; int batch = 1;
if (input.ndims() == 3) { if (input.ndimension() == 3) {
// Force batch // Force batch
batch = 0; batch = 0;
input = input.view({1, input.dim(0), input.dim(1), input.dim(2)}); input = input.view(
at::IntList({1, input.size(0), input.size(1), input.size(2)}));
gradOutput = gradOutput.view( gradOutput = gradOutput.view(
{1, gradOutput.dim(0), gradOutput.dim(1), gradOutput.dim(2)}); {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
} }
size_t batchSize = input.dim(0); long batchSize = input.size(0);
size_t nInputPlane = input.dim(1); long nInputPlane = input.size(1);
size_t inputHeight = input.dim(2); long inputHeight = input.size(2);
size_t inputWidth = input.dim(3); long inputWidth = input.size(3);
size_t nOutputPlane = gradWeight.dim(0); long nOutputPlane = gradWeight.size(0);
size_t outputWidth = long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
size_t outputHeight = long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
PARROTS_CHECKARGS(offset.dim(0) == batchSize) TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
<< "invalid batch size of offset";
columns = ctx.createDArrayLite( columns = at::zeros(
input.elemType(), DArrayShape(nInputPlane * kW * kH, {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
im2col_step * outputHeight * outputWidth)); input.options());
columns.setZeros(ctx.getStream());
gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
nOutputPlane, outputHeight, outputWidth}); nOutputPlane, outputHeight, outputWidth});
gradOutput = transpose(ctx, gradOutput, 1, 2); gradOutput.transpose_(1, 2);
if (!gradOutput.isContiguous()) {
gradOutput = ctx.cloneDArrayLite(gradOutput);
}
auto gradOutputBuffer = ctx.cloneDArrayLite(gradOutput); Tensor gradOutputBuffer = at::zeros_like(gradOutput);
gradOutputBuffer =
gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
outputHeight, outputWidth});
gradOutputBuffer = gradOutputBuffer.contiguous();
gradOutputBuffer.copy_(gradOutput);
gradOutputBuffer = gradOutputBuffer =
gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
im2col_step * outputHeight, outputWidth}); im2col_step * outputHeight, outputWidth});
gradOutput = transpose(ctx, gradOutput, 1, 2); gradOutput.transpose_(1, 2);
if (!gradOutput.isContiguous()) {
gradOutput = ctx.cloneDArrayLite(gradOutput);
}
gradOutput = gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
...@@ -478,33 +474,36 @@ void DeformConvBackwardParametersCUDAKernelLauncher( ...@@ -478,33 +474,36 @@ void DeformConvBackwardParametersCUDAKernelLauncher(
offset.view({batchSize / im2col_step, im2col_step, offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth}); deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (size_t elt = 0; elt < batchSize / im2col_step; elt++) { for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns, dilationW, im2col_step, deformable_group, columns);
stream);
// divide into group // divide into group
gradOutputBuffer = gradOutputBuffer.view( gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.dim(0), group, gradOutputBuffer.dim(1) / group, {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
gradOutputBuffer.dim(2) * gradOutputBuffer.dim(3)}); gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)});
gradWeight = gradWeight.view( gradWeight =
{group, gradWeight.dim(0) / group, gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
gradWeight.dim(1) * gradWeight.dim(2) * gradWeight.dim(3)}); gradWeight.size(2), gradWeight.size(3)});
for (int g = 0; g < group; g++) { for (int g = 0; g < group; g++) {
auto gradWeight_g = gradWeight[g]; gradWeight[g] = gradWeight[g]
gemm(ctx, scale, false, gradOutputBuffer[elt][g], true, columns[g], 1, .flatten(1)
gradWeight_g); .addmm_(gradOutputBuffer[elt][g].flatten(1),
columns[g].transpose(1, 0), 1.0, scale)
.view_as(gradWeight[g]);
} }
gradOutputBuffer = gradOutputBuffer.view( gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.dim(0), {gradOutputBuffer.size(0),
gradOutputBuffer.dim(1) * gradOutputBuffer.dim(2), gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
im2col_step * outputHeight, outputWidth}); gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)}); columns =
gradWeight = gradWeight.view( columns.view({columns.size(0) * columns.size(1), columns.size(2)});
{gradWeight.dim(0) * gradWeight.dim(1), nInputPlane / group, kH, kW}); gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
gradWeight.size(2), gradWeight.size(3),
gradWeight.size(4)});
} }
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
......
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "deform_conv_pytorch.h"
using namespace parrots;
/*void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
* Tensor output, Tensor columns, Tensor ones,
* int kW, int kH, int dW, int dH, int padW,
* int padH, int dilationW, int dilationH, int
* group, int deformable_group, int im2col_step);
*/
// Parrots entry point for the deformable-convolution forward pass.
// Decodes the scalar attributes from `attr`, bridges the parrots arrays in
// `ins`/`outs` to ATen tensors via buildATensor, and dispatches to the shared
// CUDA implementation.
// ins:  input, weight, offset;  outs: output, columns, ones.
void deform_conv_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  // Convolution geometry and deformable-conv hyper-parameters.
  int kW = 0, kH = 0, dW = 0, dH = 0;
  int padW = 0, padH = 0;
  int dilationW = 0, dilationH = 0;
  int group = 0, deformable_group = 0, im2col_step = 0;
  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("im2col_step", im2col_step)
      .done();

  // Bridge parrots DArrays to ATen tensors.
  auto input = buildATensor(ctx, ins[0]);
  auto weight = buildATensor(ctx, ins[1]);
  auto offset = buildATensor(ctx, ins[2]);
  auto output = buildATensor(ctx, outs[0]);
  auto columns = buildATensor(ctx, outs[1]);
  auto ones = buildATensor(ctx, outs[2]);

  deform_conv_forward_cuda(input, weight, offset, output, columns, ones, kW, kH,
                           dW, dH, padW, padH, dilationW, dilationH, group,
                           deformable_group, im2col_step);
}
/*void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
* Tensor gradOutput, Tensor gradInput,
* Tensor gradOffset, Tensor weight,
* Tensor columns, int kW, int kH, int dW,
* int dH, int padW, int padH, int
* dilationW, int dilationH, int group, int deformable_group, int im2col_step);
*/
// Parrots entry point for the deformable-convolution input/offset backward
// pass.  Decodes scalar attributes, bridges parrots arrays to ATen tensors,
// and dispatches to the shared CUDA implementation.
// ins:  input, offset, gradOutput;
// outs: gradInput, gradOffset, weight, columns.
void deform_conv_backward_input_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  // Convolution geometry and deformable-conv hyper-parameters.
  int kW = 0, kH = 0, dW = 0, dH = 0;
  int padW = 0, padH = 0;
  int dilationW = 0, dilationH = 0;
  int group = 0, deformable_group = 0, im2col_step = 0;
  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<int>("im2col_step", im2col_step)
      .done();

  // Bridge parrots DArrays to ATen tensors.
  auto input = buildATensor(ctx, ins[0]);
  auto offset = buildATensor(ctx, ins[1]);
  auto gradOutput = buildATensor(ctx, ins[2]);
  auto gradInput = buildATensor(ctx, outs[0]);
  auto gradOffset = buildATensor(ctx, outs[1]);
  // NOTE(review): weight and columns are wired as outputs here, mirroring the
  // op registration below; the kernel reads weight and scribbles in columns.
  auto weight = buildATensor(ctx, outs[2]);
  auto columns = buildATensor(ctx, outs[3]);

  deform_conv_backward_input_cuda(input, offset, gradOutput, gradInput,
                                  gradOffset, weight, columns, kW, kH, dW, dH,
                                  padW, padH, dilationW, dilationH, group,
                                  deformable_group, im2col_step);
}
/*void deform_conv_backward_parameters_cuda(
* Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
* Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
* int padH, int dilationW, int dilationH, int group, int deformable_group,
* float scale, int im2col_step);
*/
// Parrots entry point for the deformable-convolution weight-gradient backward
// pass.  Decodes scalar attributes (including the float `scale` applied to the
// accumulated gradient), bridges parrots arrays to ATen tensors, and
// dispatches to the shared CUDA implementation.
// ins:  input, offset, gradOutput;  outs: gradWeight, columns, ones.
void deform_conv_backward_parameters_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  // Convolution geometry and deformable-conv hyper-parameters.
  int kW = 0, kH = 0, dW = 0, dH = 0;
  int padW = 0, padH = 0;
  int dilationW = 0, dilationH = 0;
  int group = 0, deformable_group = 0, im2col_step = 0;
  float scale = 0.f;
  SSAttrs(attr)
      .get<int>("kW", kW)
      .get<int>("kH", kH)
      .get<int>("dW", dW)
      .get<int>("dH", dH)
      .get<int>("padW", padW)
      .get<int>("padH", padH)
      .get<int>("dilationW", dilationW)
      .get<int>("dilationH", dilationH)
      .get<int>("group", group)
      .get<int>("deformable_group", deformable_group)
      .get<float>("scale", scale)
      .get<int>("im2col_step", im2col_step)
      .done();

  // Bridge parrots DArrays to ATen tensors.
  auto input = buildATensor(ctx, ins[0]);
  auto offset = buildATensor(ctx, ins[1]);
  auto gradOutput = buildATensor(ctx, ins[2]);
  auto gradWeight = buildATensor(ctx, outs[0]);
  auto columns = buildATensor(ctx, outs[1]);
  auto ones = buildATensor(ctx, outs[2]);

  deform_conv_backward_parameters_cuda(input, offset, gradOutput, gradWeight,
                                       columns, ones, kW, kH, dW, dH, padW,
                                       padH, dilationW, dilationH, group,
                                       deformable_group, scale, im2col_step);
}
// Register the forward op: 11 integer attributes, 3 inputs
// (input, weight, offset) and 3 outputs (output, columns, ones),
// handled by deform_conv_forward_cuda_parrots above.
PARROTS_EXTENSION_REGISTER(deform_conv_forward)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("im2col_step")
    .input(3)
    .output(3)
    .apply(deform_conv_forward_cuda_parrots)
    .done();
// Register the input/offset backward op: 11 integer attributes, 3 inputs
// (input, offset, gradOutput) and 4 outputs (gradInput, gradOffset, weight,
// columns), handled by deform_conv_backward_input_cuda_parrots above.
PARROTS_EXTENSION_REGISTER(deform_conv_backward_input)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("im2col_step")
    .input(3)
    .output(4)
    .apply(deform_conv_backward_input_cuda_parrots)
    .done();
// Register the weight-gradient backward op: 11 integer attributes plus the
// float "scale", 3 inputs (input, offset, gradOutput) and 3 outputs
// (gradWeight, columns, ones), handled by
// deform_conv_backward_parameters_cuda_parrots above.
PARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters)
    .attr("kW")
    .attr("kH")
    .attr("dW")
    .attr("dH")
    .attr("padW")
    .attr("padH")
    .attr("dilationW")
    .attr("dilationH")
    .attr("group")
    .attr("deformable_group")
    .attr("scale")
    .attr("im2col_step")
    .input(3)
    .output(3)
    .apply(deform_conv_backward_parameters_cuda_parrots)
    .done();
#ifndef DEFORM_CONV_PYTORCH_H
#define DEFORM_CONV_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Deformable-convolution forward pass (CUDA).  `columns` and `ones` are
// caller-provided scratch buffers reused across im2col steps.
void deform_conv_forward_cuda(Tensor input, Tensor weight, Tensor offset,
                              Tensor output, Tensor columns, Tensor ones,
                              int kW, int kH, int dW, int dH, int padW,
                              int padH, int dilationW, int dilationH, int group,
                              int deformable_group, int im2col_step);

// Backward pass w.r.t. the input feature map and the sampling offsets (CUDA).
void deform_conv_backward_input_cuda(Tensor input, Tensor offset,
                                     Tensor gradOutput, Tensor gradInput,
                                     Tensor gradOffset, Tensor weight,
                                     Tensor columns, int kW, int kH, int dW,
                                     int dH, int padW, int padH, int dilationW,
                                     int dilationH, int group,
                                     int deformable_group, int im2col_step);

// Backward pass w.r.t. the convolution weights (CUDA); the accumulated
// gradient is multiplied by `scale`.
void deform_conv_backward_parameters_cuda(
    Tensor input, Tensor offset, Tensor gradOutput, Tensor gradWeight,
    Tensor columns, Tensor ones, int kW, int kH, int dW, int dH, int padW,
    int padH, int dilationW, int dilationH, int group, int deformable_group,
    float scale, int im2col_step);

#endif  // DEFORM_CONV_PYTORCH_H
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher( #ifdef MMCV_WITH_CUDA
const DArrayLite input, const DArrayLite rois, const DArrayLite offset, void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale, Tensor offset, Tensor output,
int sampling_ratio, float gamma, cudaStream_t stream); int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma);
void DeformRoIPoolBackwardCUDAKernelLauncher( void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois, Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset, Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
int pooled_height, int pooled_width, float spatial_scale, float spatial_scale, int sampling_ratio, float gamma);
int sampling_ratio, float gamma, cudaStream_t stream);
void deform_roi_pool_forward_cuda(CudaContext& ctx, const SSElement& attr, void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
const OperatorBase::in_list_t& ins, Tensor output, int pooled_height,
OperatorBase::out_list_t& outs) { int pooled_width, float spatial_scale,
int pooled_height; int sampling_ratio, float gamma) {
int pooled_width; DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output,
float spatial_scale; pooled_height, pooled_width,
int sampling_ratio; spatial_scale, sampling_ratio, gamma);
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
const auto& offset = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolForwardCUDAKernelLauncher(
input, rois, offset, output, pooled_height, pooled_width, spatial_scale,
sampling_ratio, gamma, stream);
} }
void deform_roi_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
const OperatorBase::in_list_t& ins, Tensor rois, Tensor offset,
OperatorBase::out_list_t& outs) { Tensor grad_input, Tensor grad_offset,
int pooled_height; int pooled_height, int pooled_width,
int pooled_width; float spatial_scale, int sampling_ratio,
float spatial_scale; float gamma) {
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& grad_output = ins[0];
const auto& input = ins[1];
const auto& rois = ins[2];
const auto& offset = ins[3];
auto& grad_input = outs[0];
auto& grad_offset = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolBackwardCUDAKernelLauncher( DeformRoIPoolBackwardCUDAKernelLauncher(
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height, grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma, stream); pooled_width, spatial_scale, sampling_ratio, gamma);
} }
#endif
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(output);
PARROTS_EXTENSION_REGISTER(deform_roi_pool_forward) deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
.attr("pooled_height") pooled_width, spatial_scale, sampling_ratio,
.attr("pooled_width") gamma);
.attr("spatial_scale") #else
.attr("sampling_ratio") AT_ERROR("DeformRoIPool is not compiled with GPU support");
.attr("gamma") #endif
.input(3) } else {
.output(1) AT_ERROR("DeformRoIPool is not implemented on CPU");
.apply(deform_roi_pool_forward_cuda) }
.done(); }
PARROTS_EXTENSION_REGISTER(deform_roi_pool_backward) void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
.attr("pooled_height") Tensor offset, Tensor grad_input,
.attr("pooled_width") Tensor grad_offset, int pooled_height,
.attr("spatial_scale") int pooled_width, float spatial_scale,
.attr("sampling_ratio") int sampling_ratio, float gamma) {
.attr("gamma") if (grad_output.device().is_cuda()) {
.input(4) #ifdef MMCV_WITH_CUDA
.output(2) CHECK_CUDA_INPUT(grad_output);
.apply(deform_roi_pool_backward_cuda) CHECK_CUDA_INPUT(input);
.done(); CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_offset);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
}
#include "deform_roi_pool_cuda_kernel.cuh" #include "deform_roi_pool_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher( void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
const DArrayLite input, const DArrayLite rois, const DArrayLite offset, Tensor offset, Tensor output,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale, int pooled_height, int pooled_width,
int sampling_ratio, float gamma, cudaStream_t stream) { float spatial_scale,
int output_size = output.size(); int sampling_ratio, float gamma) {
int channels = input.dim(1); int output_size = output.numel();
int height = input.dim(2); int channels = input.size(1);
int width = input.dim(3); int height = input.size(2);
int width = input.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "deform_roi_pool_forward_cuda_kernel", [&] {
deform_roi_pool_forward_cuda_kernel<scalar_t> deform_roi_pool_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(), output_size, input.data_ptr<scalar_t>(),
offset.ptr<scalar_t>(), output.ptr<scalar_t>(), pooled_height, rois.data_ptr<scalar_t>(), offset.data_ptr<scalar_t>(),
pooled_width, spatial_scale, sampling_ratio, gamma, channels, output.data_ptr<scalar_t>(), pooled_height, pooled_width,
height, width); static_cast<scalar_t>(spatial_scale), sampling_ratio,
})); static_cast<scalar_t>(gamma), channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void DeformRoIPoolBackwardCUDAKernelLauncher( void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois, Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset, Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
int pooled_height, int pooled_width, float spatial_scale, float spatial_scale, int sampling_ratio, float gamma) {
int sampling_ratio, float gamma, cudaStream_t stream) { int output_size = grad_output.numel();
int output_size = grad_output.size(); int channels = grad_input.size(1);
int channels = grad_input.dim(1); int height = grad_input.size(2);
int height = grad_input.dim(2); int width = grad_input.size(3);
int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(grad_output.device());
grad_output.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "deform_roi_pool_backward_cuda_kernel", [&] {
deform_roi_pool_backward_cuda_kernel<scalar_t> deform_roi_pool_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), input.ptr<scalar_t>(), output_size, grad_output.data_ptr<scalar_t>(),
rois.ptr<scalar_t>(), offset.ptr<scalar_t>(), input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), grad_offset.ptr<scalar_t>(), offset.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
pooled_height, pooled_width, spatial_scale, sampling_ratio, grad_offset.data_ptr<scalar_t>(), pooled_height, pooled_width,
gamma, channels, height, width); static_cast<scalar_t>(spatial_scale), sampling_ratio,
})); static_cast<scalar_t>(gamma), channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "deform_roi_pool_pytorch.h"
using namespace parrots;
/*void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
* Tensor output, int pooled_height,
* int pooled_width, float spatial_scale,
* int sampling_ratio, float gamma);
*/
// Parrots entry point for the deformable-RoI-pooling forward pass.
// Decodes scalar attributes, bridges parrots arrays to ATen tensors, and
// dispatches to the shared CUDA implementation.
// ins:  input, rois, offset;  outs: output.
void deform_roi_pool_forward_cuda_parrots(CudaContext& ctx,
                                          const SSElement& attr,
                                          const OperatorBase::in_list_t& ins,
                                          OperatorBase::out_list_t& outs) {
  // Pooling hyper-parameters.
  int pooled_height = 0, pooled_width = 0, sampling_ratio = 0;
  float spatial_scale = 0.f, gamma = 0.f;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_height)
      .get<int>("pooled_width", pooled_width)
      .get<float>("spatial_scale", spatial_scale)
      .get<int>("sampling_ratio", sampling_ratio)
      .get<float>("gamma", gamma)
      .done();

  // Bridge parrots DArrays to ATen tensors.
  auto input = buildATensor(ctx, ins[0]);
  auto rois = buildATensor(ctx, ins[1]);
  auto offset = buildATensor(ctx, ins[2]);
  auto output = buildATensor(ctx, outs[0]);

  deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
                               pooled_width, spatial_scale, sampling_ratio,
                               gamma);
}
/*void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
* Tensor rois, Tensor offset,
* Tensor grad_input, Tensor grad_offset,
* int pooled_height, int pooled_width,
* float spatial_scale, int sampling_ratio,
* float gamma);
*/
// Parrots entry point for the deformable-RoI-pooling backward pass.
// Decodes scalar attributes, bridges parrots arrays to ATen tensors, and
// dispatches to the shared CUDA implementation.
// ins:  grad_output, input, rois, offset;  outs: grad_input, grad_offset.
void deform_roi_pool_backward_cuda_parrots(CudaContext& ctx,
                                           const SSElement& attr,
                                           const OperatorBase::in_list_t& ins,
                                           OperatorBase::out_list_t& outs) {
  // Pooling hyper-parameters.
  int pooled_height = 0, pooled_width = 0, sampling_ratio = 0;
  float spatial_scale = 0.f, gamma = 0.f;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_height)
      .get<int>("pooled_width", pooled_width)
      .get<float>("spatial_scale", spatial_scale)
      .get<int>("sampling_ratio", sampling_ratio)
      .get<float>("gamma", gamma)
      .done();

  // Bridge parrots DArrays to ATen tensors.
  auto grad_output = buildATensor(ctx, ins[0]);
  auto input = buildATensor(ctx, ins[1]);
  auto rois = buildATensor(ctx, ins[2]);
  auto offset = buildATensor(ctx, ins[3]);
  auto grad_input = buildATensor(ctx, outs[0]);
  auto grad_offset = buildATensor(ctx, outs[1]);

  deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
                                grad_offset, pooled_height, pooled_width,
                                spatial_scale, sampling_ratio, gamma);
}
// Register the forward op: 5 scalar attributes, 3 inputs
// (input, rois, offset) and 1 output (output), handled by
// deform_roi_pool_forward_cuda_parrots above.
PARROTS_EXTENSION_REGISTER(deform_roi_pool_forward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("sampling_ratio")
    .attr("gamma")
    .input(3)
    .output(1)
    .apply(deform_roi_pool_forward_cuda_parrots)
    .done();
// Register the backward op: 5 scalar attributes, 4 inputs
// (grad_output, input, rois, offset) and 2 outputs (grad_input, grad_offset),
// handled by deform_roi_pool_backward_cuda_parrots above.
PARROTS_EXTENSION_REGISTER(deform_roi_pool_backward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("sampling_ratio")
    .attr("gamma")
    .input(4)
    .output(2)
    .apply(deform_roi_pool_backward_cuda_parrots)
    .done();
#ifndef DEFORM_ROI_POOL_PYTORCH_H
#define DEFORM_ROI_POOL_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Deformable-RoI-pooling forward pass (CUDA).
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma);

// Deformable-RoI-pooling backward pass (CUDA): gradients w.r.t. the input
// feature map and the offsets.
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma);
#endif  // DEFORM_ROI_POOL_PYTORCH_H
// Copyright (c) 2018, SenseTime. #include "pytorch_cpp_helper.hpp"
#include "parrots_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
void SigmoidFocalLossForwardCUDAKernelLauncher( void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor output,
DArrayLite output, float gamma, float alpha, cudaStream_t stream); const float gamma,
const float alpha);
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight, void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream); Tensor weight,
Tensor grad_input,
void SoftmaxFocalLossForwardCUDAKernelLauncher( const float gamma,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, const float alpha);
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
void SoftmaxFocalLossBackwardCUDAKernelLauncher( Tensor weight, Tensor output,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, const float gamma,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha, const float alpha);
cudaStream_t stream);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
void sigmoid_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr, Tensor weight, Tensor buff,
const OperatorBase::in_list_t& ins, Tensor grad_input,
OperatorBase::out_list_t& outs) { const float gamma,
float gamma; const float alpha);
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done(); void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output, SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream); gamma, alpha);
} }
void sigmoid_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr, void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
const OperatorBase::in_list_t& ins, Tensor weight, Tensor grad_input,
OperatorBase::out_list_t& outs) { float gamma, float alpha) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input, SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha, stream); gamma, alpha);
} }
void softmax_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr, void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
const OperatorBase::in_list_t& ins, Tensor output, float gamma, float alpha) {
OperatorBase::out_list_t& outs) { SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
float gamma; gamma, alpha);
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha, stream);
} }
void softmax_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr, void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
const OperatorBase::in_list_t& ins, Tensor weight, Tensor buff,
OperatorBase::out_list_t& outs) { Tensor grad_input, float gamma,
float gamma; float alpha) {
float alpha; SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done(); grad_input, gamma, alpha);
}
// get inputs and outputs #endif
const auto& input = ins[0];
const auto& target = ins[1]; void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
const auto& weight = ins[2]; Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
}
auto& buff = outs[0]; void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
auto& grad_input = outs[1]; Tensor grad_input, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_input);
sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
}
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream()); void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff, Tensor output, float gamma, float alpha) {
grad_input, gamma, alpha, stream); if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
softmax_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
} }
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward) void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
.attr("gamma") Tensor buff, Tensor grad_input, float gamma,
.attr("alpha") float alpha) {
.input(3) if (input.device().is_cuda()) {
.output(1) #ifdef MMCV_WITH_CUDA
.apply(sigmoid_focal_loss_forward_cuda) CHECK_CUDA_INPUT(input);
.done(); CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward) CHECK_CUDA_INPUT(buff);
.attr("gamma") CHECK_CUDA_INPUT(grad_input);
.attr("alpha")
.input(3) softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
.output(1) gamma, alpha);
.apply(sigmoid_focal_loss_backward_cuda) #else
.done(); AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward) } else {
.attr("gamma") AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
.attr("alpha") }
.input(3) }
.output(1)
.apply(softmax_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(2)
.apply(softmax_focal_loss_backward_cuda)
.done();
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#include "sigmoid_focal_loss_cuda_kernel.cuh" #include "sigmoid_focal_loss_cuda_kernel.cuh"
#include "softmax_focal_loss_cuda_kernel.cuh" #include "softmax_focal_loss_cuda_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher( void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor output,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) { const float gamma,
int output_size = output.size(); const float alpha) {
int num_classes = input.dim(1); int output_size = output.numel();
int num_classes = input.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,
input.elemType().prim(), ([&] { "target label should smaller or equal than num classes");
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sigmoid_focal_loss_forward_cuda_kernel", [&] {
sigmoid_focal_loss_forward_cuda_kernel<scalar_t> sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, input.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
num_classes); output.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void SigmoidFocalLossBackwardCUDAKernelLauncher( void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
const DArrayLite input, const DArrayLite target, const DArrayLite weight, Tensor weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream) { Tensor grad_input,
int output_size = grad_input.size(); const float gamma,
int num_classes = input.dim(1); const float alpha) {
int output_size = grad_input.numel();
int num_classes = input.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(grad_input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sigmoid_focal_loss_backward_cuda_kernel", [&] {
sigmoid_focal_loss_backward_cuda_kernel<scalar_t> sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, input.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), gamma, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
alpha, num_classes); grad_input.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void SoftmaxFocalLossForwardCUDAKernelLauncher( void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor output,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) { const float gamma,
int output_size = output.size(); const float alpha) {
int num_classes = softmax.dim(1); int output_size = output.numel();
int num_classes = softmax.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,
softmax.elemType().prim(), ([&] { "target label should smaller or equal than num classes");
at::cuda::CUDAGuard device_guard(softmax.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
softmax.scalar_type(), "softmax_focal_loss_forward_cuda_kernel", [&] {
softmax_focal_loss_forward_cuda_kernel<scalar_t> softmax_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, softmax.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
num_classes); output.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void SoftmaxFocalLossBackwardCUDAKernelLauncher( void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight, Tensor weight, Tensor buff,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha, Tensor grad_input,
cudaStream_t stream) { const float gamma,
int output_size = buff.size(); const float alpha) {
int num_classes = softmax.dim(1); int num_classes = softmax.size(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( int output_size = buff.numel();
grad_input.elemType().prim(), ([&] { at::cuda::CUDAGuard device_guard(grad_input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.scalar_type(),
"softmax_focal_loss_backward_cuda1_"
"kernel",
[&] {
softmax_focal_loss_backward_cuda1_kernel<scalar_t> softmax_focal_loss_backward_cuda1_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, softmax.data_ptr<scalar_t>(),
weight.ptr<scalar_t>(), buff.ptr<scalar_t>(), gamma, alpha, target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),
num_classes); buff.data_ptr<scalar_t>(), gamma, alpha, num_classes);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError());
output_size = grad_input.size(); AT_CUDA_CHECK(cudaGetLastError());
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( output_size = grad_input.numel();
grad_input.elemType().prim(), ([&] { AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.scalar_type(),
"softmax_focal_loss_backward_cuda2_"
"kernel",
[&] {
softmax_focal_loss_backward_cuda2_kernel<scalar_t> softmax_focal_loss_backward_cuda2_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(), output_size, softmax.data_ptr<scalar_t>(),
buff.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), num_classes); target.data_ptr<int64_t>(), buff.data_ptr<scalar_t>(),
})); grad_input.data_ptr<scalar_t>(), num_classes);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "focal_loss_pytorch.h"
using namespace parrots;
// Parrots adapter for the sigmoid focal-loss forward CUDA op.
// Unpacks the "gamma"/"alpha" scalar attributes, wraps the parrots arrays
// as ATen tensors (ins = {input, target, weight}, outs = {output}) and
// dispatches to sigmoid_focal_loss_forward_cuda.
void sigmoid_focal_loss_forward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& in_t = buildATensor(ctx, ins[0]);
  const auto& tgt_t = buildATensor(ctx, ins[1]);
  const auto& wgt_t = buildATensor(ctx, ins[2]);
  auto out_t = buildATensor(ctx, outs[0]);
  sigmoid_focal_loss_forward_cuda(in_t, tgt_t, wgt_t, out_t, gamma, alpha);
}
// Parrots adapter for the sigmoid focal-loss backward CUDA op.
// Unpacks the "gamma"/"alpha" scalar attributes, wraps the parrots arrays
// as ATen tensors (ins = {input, target, weight}, outs = {grad_input}) and
// dispatches to sigmoid_focal_loss_backward_cuda.
void sigmoid_focal_loss_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& in_t = buildATensor(ctx, ins[0]);
  const auto& tgt_t = buildATensor(ctx, ins[1]);
  const auto& wgt_t = buildATensor(ctx, ins[2]);
  auto grad_in_t = buildATensor(ctx, outs[0]);
  sigmoid_focal_loss_backward_cuda(in_t, tgt_t, wgt_t, grad_in_t, gamma, alpha);
}
// Parrots adapter for the softmax focal-loss forward CUDA op.
// Unpacks the "gamma"/"alpha" scalar attributes, wraps the parrots arrays
// as ATen tensors (ins = {input, target, weight}, outs = {output}) and
// dispatches to softmax_focal_loss_forward_cuda.
void softmax_focal_loss_forward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& in_t = buildATensor(ctx, ins[0]);
  const auto& tgt_t = buildATensor(ctx, ins[1]);
  const auto& wgt_t = buildATensor(ctx, ins[2]);
  auto out_t = buildATensor(ctx, outs[0]);
  softmax_focal_loss_forward_cuda(in_t, tgt_t, wgt_t, out_t, gamma, alpha);
}
// Parrots adapter for the softmax focal-loss backward CUDA op.
// Unpacks the "gamma"/"alpha" scalar attributes, wraps the parrots arrays
// as ATen tensors (ins = {input, target, weight},
// outs = {buff, grad_input}) and dispatches to
// softmax_focal_loss_backward_cuda.
void softmax_focal_loss_backward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  float gamma, alpha;
  SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& in_t = buildATensor(ctx, ins[0]);
  const auto& tgt_t = buildATensor(ctx, ins[1]);
  const auto& wgt_t = buildATensor(ctx, ins[2]);
  // Two outputs: a scratch buffer consumed by the backward kernels, then
  // the gradient w.r.t. the input.
  auto buff_t = buildATensor(ctx, outs[0]);
  auto grad_in_t = buildATensor(ctx, outs[1]);
  softmax_focal_loss_backward_cuda(in_t, tgt_t, wgt_t, buff_t, grad_in_t,
                                   gamma, alpha);
}
// Register the focal-loss ops with the parrots extension framework.
// Each registration declares the scalar attributes and the input/output
// arities that the matching *_parrots adapter above expects.

// Inputs: input, target, weight. Output: loss output tensor.
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(sigmoid_focal_loss_forward_cuda_parrots)
    .done();

// Inputs: input, target, weight. Output: grad_input.
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(sigmoid_focal_loss_backward_cuda_parrots)
    .done();

// Inputs: input, target, weight. Output: loss output tensor.
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(1)
    .apply(softmax_focal_loss_forward_cuda_parrots)
    .done();

// Inputs: input, target, weight. Outputs: buff (scratch), grad_input.
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
    .attr("gamma")
    .attr("alpha")
    .input(3)
    .output(2)
    .apply(softmax_focal_loss_backward_cuda_parrots)
    .done();
#ifndef FOCAL_LOSS_PYTORCH_H
#define FOCAL_LOSS_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Forward declarations of the focal-loss CUDA ops implemented in the
// corresponding pytorch .cu translation units; the parrots adapters call
// these with ATen tensors built from parrots DArrays.

// Element-wise sigmoid focal loss; writes the result into `output`.
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

// Gradient of the sigmoid focal loss; writes into `grad_input`.
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha);

// Softmax focal loss; writes the result into `output`.
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha);

// Gradient of the softmax focal loss; `buff` is a scratch tensor filled by
// the first backward kernel, `grad_input` receives the final gradient.
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha);
#endif  // FOCAL_LOSS_PYTORCH_H
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher( #ifdef MMCV_WITH_CUDA
const DArrayLite bottom_data, const DArrayLite mask_h_idx, void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h, const Tensor mask_h_idx,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream); const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w);
void MaskedCol2imForwardCUDAKernelLaucher(const DArrayLite bottom_data, void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
const DArrayLite mask_h_idx, const Tensor mask_h_idx,
const DArrayLite mask_w_idx, const Tensor mask_w_idx,
DArrayLite top_data, const int height, Tensor top_data, const int height,
const int width, const int channels, const int width, const int channels);
cudaStream_t stream);
void masked_im2col_forward_cuda(CudaContext& ctx, const SSElement& attr, void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
const OperatorBase::in_list_t& ins, const Tensor mask_w_idx, Tensor col,
OperatorBase::out_list_t& outs) { const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
// im: (n, ic, h, w), kernel size (kh, kw) // im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh) // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& col = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col, MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w, kernel_h, kernel_w, pad_h, pad_w);
stream);
} }
void masked_col2im_forward_cuda(CudaContext& ctx, const SSElement& attr, void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
const OperatorBase::in_list_t& ins, const Tensor mask_w_idx, Tensor im, int height,
OperatorBase::out_list_t& outs) { int width, int channels) {
// im: (n, ic, h, w), kernel size (kh, kw) // im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh) // kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh)
int height, width, channels; MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
SSAttrs(attr) width, channels);
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& im = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedCol2imForwardCUDAKernelLaucher(col, mask_h_idx, mask_w_idx, im, height,
width, channels, stream);
} }
#endif
PARROTS_EXTENSION_REGISTER(masked_im2col_forward) void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
.attr("kernel_h") const Tensor mask_w_idx, Tensor col,
.attr("kernel_w") const int kernel_h, const int kernel_w,
.attr("pad_h") const int pad_h, const int pad_w) {
.attr("pad_w") if (im.device().is_cuda()) {
.input(3) #ifdef MMCV_WITH_CUDA
.output(1) CHECK_CUDA_INPUT(im);
.apply(masked_im2col_forward_cuda) CHECK_CUDA_INPUT(mask_h_idx);
.done(); CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(col);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(masked_col2im_forward) void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
.attr("height") const Tensor mask_w_idx, Tensor im, int height,
.attr("width") int width, int channels) {
.attr("channels") if (col.device().is_cuda()) {
.input(3) #ifdef MMCV_WITH_CUDA
.output(1) CHECK_CUDA_INPUT(col);
.apply(masked_col2im_forward_cuda) CHECK_CUDA_INPUT(mask_h_idx);
.done(); CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(im);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
}
#include "masked_conv2d_cuda_kernel.cuh" #include "masked_conv2d_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher( void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const DArrayLite bottom_data, const DArrayLite mask_h_idx, const Tensor mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h, const Tensor mask_w_idx,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream) { Tensor top_data, const int kernel_h,
int channels = bottom_data.dim(1); const int kernel_w, const int pad_h,
int height = bottom_data.dim(2); const int pad_w) {
int width = bottom_data.dim(3); int channels = bottom_data.size(1);
int mask_cnt = mask_h_idx.dim(0); int height = bottom_data.size(2);
int width = bottom_data.size(3);
int mask_cnt = mask_h_idx.size(0);
int output_size = mask_cnt * channels; int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(bottom_data.device());
bottom_data.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] {
const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
MaskedIm2colForward<scalar_t> MaskedIm2colForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width, output_size, bottom_data_, height, width, kernel_h, kernel_w,
kernel_h, kernel_w, pad_h, pad_w, mask_h_idx.ptr<int64_t>(), pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);
mask_w_idx.ptr<int64_t>(), mask_cnt, top_data.ptr<scalar_t>());
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void MaskedCol2imForwardCUDAKernelLaucher(const DArrayLite bottom_data, void MaskedCol2imForwardCUDAKernelLauncher(
const DArrayLite mask_h_idx, const Tensor bottom_data, const Tensor mask_h_idx, const Tensor mask_w_idx,
const DArrayLite mask_w_idx, Tensor top_data, const int height, const int width, const int channels) {
DArrayLite top_data, const int height, int mask_cnt = mask_h_idx.size(0);
const int width, const int channels,
cudaStream_t stream) {
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels; int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(bottom_data.device());
bottom_data.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] {
const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
MaskedCol2imForward<scalar_t> MaskedCol2imForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width, output_size, bottom_data_, height, width, channels, mask_h_idx_,
channels, mask_h_idx.ptr<int64_t>(), mask_w_idx.ptr<int64_t>(), mask_w_idx_, mask_cnt, top_data_);
mask_cnt, top_data.ptr<scalar_t>());
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "masked_conv2d_pytorch.h"
using namespace parrots;
// Parrots adapter for the masked im2col forward CUDA op.
// im: (n, ic, h, w), kernel size (kh, kw);
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh).
// Unpacks kernel/padding attributes, wraps the parrots arrays as ATen
// tensors (ins = {im, mask_h_idx, mask_w_idx}, outs = {col}) and
// dispatches to masked_im2col_forward_cuda.
void masked_im2col_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int kh, kw, ph, pw;
  SSAttrs(attr)
      .get<int>("kernel_h", kh)
      .get<int>("kernel_w", kw)
      .get<int>("pad_h", ph)
      .get<int>("pad_w", pw)
      .done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& im_t = buildATensor(ctx, ins[0]);
  const auto& mask_h_t = buildATensor(ctx, ins[1]);
  const auto& mask_w_t = buildATensor(ctx, ins[2]);
  auto col_t = buildATensor(ctx, outs[0]);
  masked_im2col_forward_cuda(im_t, mask_h_t, mask_w_t, col_t, kh, kw, ph, pw);
}
// Parrots adapter for the masked col2im forward CUDA op.
// im: (n, ic, h, w), kernel size (kh, kw);
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh).
// Unpacks the output-geometry attributes, wraps the parrots arrays as ATen
// tensors (ins = {col, mask_h_idx, mask_w_idx}, outs = {im}) and
// dispatches to masked_col2im_forward_cuda.
void masked_col2im_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int out_h, out_w, out_c;
  SSAttrs(attr)
      .get<int>("height", out_h)
      .get<int>("width", out_w)
      .get<int>("channels", out_c)
      .done();
  // Convert parrots DArrays into ATen views before calling the op.
  const auto& col_t = buildATensor(ctx, ins[0]);
  const auto& mask_h_t = buildATensor(ctx, ins[1]);
  const auto& mask_w_t = buildATensor(ctx, ins[2]);
  auto im_t = buildATensor(ctx, outs[0]);
  masked_col2im_forward_cuda(col_t, mask_h_t, mask_w_t, im_t, out_h, out_w,
                             out_c);
}
// Register the masked-conv2d ops with the parrots extension framework.
// Each registration declares the scalar attributes and the input/output
// arities that the matching *_parrots adapter above expects.

// Inputs: im, mask_h_idx, mask_w_idx. Output: col.
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
    .attr("kernel_h")
    .attr("kernel_w")
    .attr("pad_h")
    .attr("pad_w")
    .input(3)
    .output(1)
    .apply(masked_im2col_forward_cuda_parrots)
    .done();

// Inputs: col, mask_h_idx, mask_w_idx. Output: im.
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
    .attr("height")
    .attr("width")
    .attr("channels")
    .input(3)
    .output(1)
    .apply(masked_col2im_forward_cuda_parrots)
    .done();
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment