Unverified Commit 48d99025 authored by z55250825's avatar z55250825 Committed by GitHub
Browse files

Add new parrots extension implementation for all ops (#794)

* delete all parrots file
add bbox_overlaps new parrots op impl

* support first new parrots op impl (bbox_overlaps) (test succeeded)

* add box_iou_rotated op, test succeed

* add carafe and carafe_naive op, test succeed (one parrots bug need fix)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test success (but has question)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
parent 72e4cc12
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
#include "roi_align_cuda_kernel.cuh"
// Launches the RoIAlign forward kernel.
//
// input:    feature map, indexed as (n, c, h, w) via size(1..3) below.
// rois:     regions of interest consumed by the kernel.
// output:   pooled result; its numel() determines the 1-D launch size.
// argmax_*: per-sample argmax coordinates written by the kernel (used by
//           backward when pool_mode selects max pooling).
// The diff had the old DArrayLite and new Tensor versions interleaved; this
// is the coherent Tensor-based implementation.
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       int pool_mode, bool aligned) {
  int output_size = output.numel();
  int channels = input.size(1);
  int height = input.size(2);
  int width = input.size(3);

  // Run on the device that owns `input`, on the current ATen stream.
  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "roi_align_forward_cuda_kernel", [&] {
        roi_align_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, input.data_ptr<scalar_t>(),
                rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
                argmax_y.data_ptr<scalar_t>(), argmax_x.data_ptr<scalar_t>(),
                aligned_height, aligned_width,
                static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
                aligned, channels, height, width);
      });

  // Surface launch-configuration errors immediately.
  AT_CUDA_CHECK(cudaGetLastError());
}
// Launches the RoIAlign backward kernel.
//
// grad_output: upstream gradient; numel() sizes the 1-D launch.
// argmax_y/x:  coordinates recorded by the forward pass (max pool mode).
// grad_input:  gradient w.r.t. the input feature map; its sizes give
//              channels/height/width for the kernel.
// Reconstructed Tensor-based version of the interleaved diff.
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                        Tensor argmax_y, Tensor argmax_x,
                                        Tensor grad_input, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, int pool_mode,
                                        bool aligned) {
  int output_size = grad_output.numel();
  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "roi_align_backward_cuda_kernel", [&] {
        roi_align_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_output.data_ptr<scalar_t>(),
                rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
                argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
                aligned_height, aligned_width,
                static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
                aligned, channels, height, width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "roi_align_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
// Parrots CUDA entry point for roi_align_forward: decodes the op attributes,
// bridges the parrots arrays to ATen tensors, and calls the shared CUDA impl.
void roi_align_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  // Scalar attributes supplied by the Python side.
  int out_h, out_w, sample_num, mode;
  float scale;
  bool is_aligned;
  SSAttrs(attr)
      .get<int>("aligned_height", out_h)
      .get<int>("aligned_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", sample_num)
      .get<int>("pool_mode", mode)
      .get<bool>("aligned", is_aligned)
      .done();

  // ins: (input, rois); outs: (output, argmax_y, argmax_x).
  auto input = buildATensor(ctx, ins[0]);
  auto rois = buildATensor(ctx, ins[1]);
  auto output = buildATensor(ctx, outs[0]);
  auto argmax_y = buildATensor(ctx, outs[1]);
  auto argmax_x = buildATensor(ctx, outs[2]);
  roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x, out_h, out_w,
                         scale, sample_num, mode, is_aligned);
}
// Parrots CUDA entry point for roi_align_backward: decodes attributes,
// wraps the parrots arrays as ATen tensors, and forwards to the CUDA impl.
void roi_align_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                     const OperatorBase::in_list_t& ins,
                                     OperatorBase::out_list_t& outs) {
  int out_h, out_w, sample_num, mode;
  float scale;
  bool is_aligned;
  SSAttrs(attr)
      .get<int>("aligned_height", out_h)
      .get<int>("aligned_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", sample_num)
      .get<int>("pool_mode", mode)
      .get<bool>("aligned", is_aligned)
      .done();

  // ins: (grad_output, rois, argmax_y, argmax_x); outs: (grad_input).
  auto grad_output = buildATensor(ctx, ins[0]);
  auto rois = buildATensor(ctx, ins[1]);
  auto argmax_y = buildATensor(ctx, ins[2]);
  auto argmax_x = buildATensor(ctx, ins[3]);
  auto grad_input = buildATensor(ctx, outs[0]);
  roi_align_backward_cuda(grad_output, rois, argmax_y, argmax_x, grad_input,
                          out_h, out_w, scale, sample_num, mode, is_aligned);
}
#endif
// Parrots host (CPU) entry point for roi_align_forward; mirrors the CUDA
// wrapper but dispatches to the CPU implementation.
void roi_align_forward_cpu_parrots(HostContext& ctx, const SSElement& attr,
                                   const OperatorBase::in_list_t& ins,
                                   OperatorBase::out_list_t& outs) {
  int out_h, out_w, sample_num, mode;
  float scale;
  bool is_aligned;
  SSAttrs(attr)
      .get<int>("aligned_height", out_h)
      .get<int>("aligned_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", sample_num)
      .get<int>("pool_mode", mode)
      .get<bool>("aligned", is_aligned)
      .done();

  // ins: (input, rois); outs: (output, argmax_y, argmax_x).
  auto input = buildATensor(ctx, ins[0]);
  auto rois = buildATensor(ctx, ins[1]);
  auto output = buildATensor(ctx, outs[0]);
  auto argmax_y = buildATensor(ctx, outs[1]);
  auto argmax_x = buildATensor(ctx, outs[2]);
  roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x, out_h, out_w,
                        scale, sample_num, mode, is_aligned);
}
// Parrots host (CPU) entry point for roi_align_backward.
void roi_align_backward_cpu_parrots(HostContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  int out_h, out_w, sample_num, mode;
  float scale;
  bool is_aligned;
  SSAttrs(attr)
      .get<int>("aligned_height", out_h)
      .get<int>("aligned_width", out_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sampling_ratio", sample_num)
      .get<int>("pool_mode", mode)
      .get<bool>("aligned", is_aligned)
      .done();

  // ins: (grad_output, rois, argmax_y, argmax_x); outs: (grad_input).
  auto grad_output = buildATensor(ctx, ins[0]);
  auto rois = buildATensor(ctx, ins[1]);
  auto argmax_y = buildATensor(ctx, ins[2]);
  auto argmax_x = buildATensor(ctx, ins[3]);
  auto grad_input = buildATensor(ctx, outs[0]);
  roi_align_backward_cpu(grad_output, rois, argmax_y, argmax_x, grad_input,
                         out_h, out_w, scale, sample_num, mode, is_aligned);
}
// Registers the `roi_align_forward` parrots op: 2 inputs (input, rois),
// 3 outputs (output, argmax_y, argmax_x). The CPU backend is always
// registered; the CUDA backend only when MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(roi_align_forward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(2)
.output(3)
.apply(roi_align_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(roi_align_forward_cuda_parrots)
#endif
.done();
// Registers the `roi_align_backward` parrots op: 4 inputs (grad_output,
// rois, argmax_y, argmax_x), 1 output (grad_input); CPU always, CUDA when
// MMCV_WITH_CUDA is defined.
PARROTS_EXTENSION_REGISTER(roi_align_backward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(4)
.output(1)
.apply(roi_align_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(roi_align_backward_cuda_parrots)
#endif
.done();
// roi_align_pytorch.h — declarations of the RoIAlign entry points whose
// definitions live in the corresponding .cpp/.cu translation units.
#ifndef ROI_ALIGN_PYTORCH_H
#define ROI_ALIGN_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// CUDA implementations (compiled only with MMCV_WITH_CUDA).
#ifdef MMCV_WITH_CUDA
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);

void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
#endif

// CPU implementations (always available).
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned);

void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
#endif  // ROI_ALIGN_PYTORCH_H
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
// Forward declarations of the ROIPool CUDA kernel launchers (defined in the
// .cu file). The diff carried both the old DArrayLite and new Tensor
// prototypes; only the Tensor versions are kept.
#ifdef MMCV_WITH_CUDA
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax, int pooled_height,
                                      int pooled_width, float spatial_scale);

void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                       Tensor argmax, Tensor grad_input,
                                       int pooled_height, int pooled_width,
                                       float spatial_scale);
void roi_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
auto& output = outs[0];
auto& argmax = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale) {
ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale, stream);
pooled_width, spatial_scale);
}
void roi_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.done();
void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input,
pooled_height, pooled_width, spatial_scale);
}
#endif
// Device dispatcher for roi_pool_forward: validates CUDA inputs and routes
// to the CUDA implementation; RoIPool has no CPU implementation.
// Stray statements from the deleted parrots version of this function
// (ins[...] bindings and an explicit stream fetch) have been removed.
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
                      int pooled_height, int pooled_width,
                      float spatial_scale) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(output);
    CHECK_CUDA_INPUT(argmax);

    roi_pool_forward_cuda(input, rois, output, argmax, pooled_height,
                          pooled_width, spatial_scale);
#else
    AT_ERROR("RoIPool is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("RoIPool is not implemented on CPU");
  }
}
// NOTE(review): this registration comes from the old (deleted) parrots file —
// it applies `roi_pool_forward_cuda`, not the new `roi_pool_forward_cuda_parrots`
// wrapper, and the same op is registered again further below. Confirm which
// registration should survive and remove the other.
PARROTS_EXTENSION_REGISTER(roi_pool_forward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.input(2)
.output(2)
.apply(roi_pool_forward_cuda)
.done();
// Device dispatcher for roi_pool_backward: validates CUDA inputs and routes
// to the CUDA implementation; RoIPool has no CPU implementation.
// The PARROTS_EXTENSION_REGISTER block that the diff spliced into this body
// is removed here; the op is registered once, later in the file, via the
// `roi_pool_backward_cuda_parrots` wrapper.
void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
                       Tensor grad_input, int pooled_height, int pooled_width,
                       float spatial_scale) {
  if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(grad_output);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(argmax);
    CHECK_CUDA_INPUT(grad_input);

    roi_pool_backward_cuda(grad_output, rois, argmax, grad_input, pooled_height,
                           pooled_width, spatial_scale);
#else
    AT_ERROR("RoIPool is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("RoIPool is not implemented on CPU");
  }
}
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
#include "roi_pool_cuda_kernel.cuh"
// Launches the ROIPool forward kernel.
//
// argmax records, per pooled element, the winning input index as int32 —
// the kernel takes an int* for it, so `argmax` must be an int tensor.
// Reconstructed Tensor-based version of the interleaved diff.
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax, int pooled_height,
                                      int pooled_width, float spatial_scale) {
  int output_size = output.numel();
  int channels = input.size(1);
  int height = input.size(2);
  int width = input.size(3);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "roi_pool_forward_cuda_kernel", [&] {
        roi_pool_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, input.data_ptr<scalar_t>(),
                rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
                argmax.data_ptr<int>(), pooled_height, pooled_width,
                static_cast<scalar_t>(spatial_scale), channels, height, width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}
// Launches the ROIPool backward kernel.
//
// grad_output sizes the 1-D launch; channels/height/width come from
// grad_input (the old DArrayLite variant wrongly read them from grad_output —
// the Tensor variant kept here uses grad_input, matching the kernel's view
// of the input feature map).
void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                       Tensor argmax, Tensor grad_input,
                                       int pooled_height, int pooled_width,
                                       float spatial_scale) {
  int output_size = grad_output.numel();
  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "roi_pool_backward_cuda_kernel", [&] {
        roi_pool_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_output.data_ptr<scalar_t>(),
                rois.data_ptr<scalar_t>(), argmax.data_ptr<int>(),
                grad_input.data_ptr<scalar_t>(), pooled_height, pooled_width,
                channels, height, width);
      });

  AT_CUDA_CHECK(cudaGetLastError());
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "roi_pool_pytorch.h"
using namespace parrots;
// Parrots CUDA entry point for roi_pool_forward: decodes attributes, wraps
// the parrots arrays as ATen tensors, and calls the shared implementation.
void roi_pool_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                   const OperatorBase::in_list_t& ins,
                                   OperatorBase::out_list_t& outs) {
  int pooled_h, pooled_w;
  float scale;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_h)
      .get<int>("pooled_width", pooled_w)
      .get<float>("spatial_scale", scale)
      .done();

  // ins: (input, rois); outs: (output, argmax).
  auto input = buildATensor(ctx, ins[0]);
  auto rois = buildATensor(ctx, ins[1]);
  auto output = buildATensor(ctx, outs[0]);
  auto argmax = buildATensor(ctx, outs[1]);
  roi_pool_forward_cuda(input, rois, output, argmax, pooled_h, pooled_w, scale);
}
// Parrots CUDA entry point for roi_pool_backward.
void roi_pool_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  int pooled_h, pooled_w;
  float scale;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_h)
      .get<int>("pooled_width", pooled_w)
      .get<float>("spatial_scale", scale)
      .done();

  // ins: (grad_output, rois, argmax); outs: (grad_input).
  auto grad_output = buildATensor(ctx, ins[0]);
  auto rois = buildATensor(ctx, ins[1]);
  auto argmax = buildATensor(ctx, ins[2]);
  auto grad_input = buildATensor(ctx, outs[0]);
  roi_pool_backward_cuda(grad_output, rois, argmax, grad_input, pooled_h,
                         pooled_w, scale);
}
// Registers the `roi_pool_forward` parrots op: 2 inputs (input, rois),
// 2 outputs (output, argmax); CUDA backend only.
PARROTS_EXTENSION_REGISTER(roi_pool_forward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.input(2)
.output(2)
.apply(roi_pool_forward_cuda_parrots)
.done();
// Registers the `roi_pool_backward` parrots op: 3 inputs (grad_output, rois,
// argmax), 1 output (grad_input); CUDA backend only.
PARROTS_EXTENSION_REGISTER(roi_pool_backward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.input(3)
.output(1)
.apply(roi_pool_backward_cuda_parrots)
.done();
// roi_pool_pytorch.h — declarations of the ROIPool CUDA entry points; the
// definitions live in the corresponding .cpp/.cu translation units. RoIPool
// declares no CPU implementation.
#ifndef ROI_POOL_PYTORCH_H
#define ROI_POOL_PYTORCH_H
#include <torch/extension.h>
using namespace at;
#ifdef MMCV_WITH_CUDA
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale);

void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
#endif
#endif  // ROI_POOL_PYTORCH_H
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
// Forward declarations of the SyncBN CUDA kernel launchers (defined in the
// .cu file), followed by the first thin wrapper. The diff interleaved the
// old DArrayLite prototypes with the new Tensor ones; only the Tensor
// prototypes are kept.
#ifdef MMCV_WITH_CUDA
void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);

void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
                                        Tensor var);

void SyncBNForwardOutputCUDAKernelLauncher(
    const Tensor input, const Tensor mean, const Tensor var,
    Tensor running_mean, Tensor running_var, const Tensor weight,
    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
    float momentum, int group_size);

void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
                                           const Tensor norm,
                                           Tensor grad_weight,
                                           Tensor grad_bias);

void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
                                          const Tensor weight,
                                          const Tensor grad_weight,
                                          const Tensor grad_bias,
                                          const Tensor norm, const Tensor std,
                                          Tensor grad_input);

// Computes the per-channel mean of `input` into `mean`.
void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) {
  SyncBNForwardMeanCUDAKernelLauncher(input, mean);
}
void sync_bn_forward_var_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& input = ins[0];
const auto& mean = ins[1];
auto& var = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardVarCUDAKernelLauncher(input, mean, var, stream);
void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,
Tensor var) {
SyncBNForwardVarCUDAKernelLauncher(input, mean, var);
}
void sync_bn_forward_output_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
size_t group_size;
float eps, momentum;
SSAttrs(attr)
.get<float>("eps", eps)
.get<float>("momentum", momentum)
.get<size_t>("group_size", group_size)
.done();
const auto& input = ins[0];
const auto& mean = ins[1];
const auto& var = ins[2];
const auto& weight = ins[3];
const auto& bias = ins[4];
auto& running_mean = outs[0];
auto& running_var = outs[1];
auto& norm = outs[2];
auto& std = outs[3];
auto& output = outs[4];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardOutputCUDAKernelLauncher(
input, mean, var, running_mean, running_var, weight, bias, norm, std,
output, eps, momentum, group_size, stream);
void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,
const Tensor var, Tensor running_mean,
Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std,
Tensor output, float eps, float momentum,
int group_size) {
SyncBNForwardOutputCUDAKernelLauncher(input, mean, var, running_mean,
running_var, weight, bias, norm, std,
output, eps, momentum, group_size);
}
void sync_bn_backward_param_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& grad_output = ins[0];
const auto& norm = ins[1];
auto& grad_weight = outs[0];
auto& grad_bias = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,
Tensor grad_weight, Tensor grad_bias) {
SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight,
grad_bias, stream);
grad_bias);
}
void sync_bn_backward_data_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& grad_output = ins[0];
const auto& weight = ins[1];
const auto& grad_weight = ins[2];
const auto& grad_bias = ins[3];
const auto& norm = ins[4];
const auto& std = ins[5];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
const Tensor grad_weight,
const Tensor grad_bias, const Tensor norm,
const Tensor std, Tensor grad_input) {
SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight,
grad_bias, norm, std, grad_input,
stream);
grad_bias, norm, std, grad_input);
}
#endif
// Device dispatcher for sync_bn_forward_mean; SyncBN has no CPU backend.
void sync_bn_forward_mean(const Tensor input, Tensor mean) {
  // Guard clause: only CUDA tensors are supported.
  if (!input.device().is_cuda()) {
    AT_ERROR("SyncBatchNorm is not implemented on CPU");
  }
#ifdef MMCV_WITH_CUDA
  CHECK_CUDA_INPUT(input);
  CHECK_CUDA_INPUT(mean);
  sync_bn_forward_mean_cuda(input, mean);
#else
  AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
}
// Registers `sync_bn_forward_mean`: 1 input (input), 1 output (mean).
PARROTS_EXTENSION_REGISTER(sync_bn_forward_mean)
.input(1)
.output(1)
.apply(sync_bn_forward_mean_cuda)
.done();

// Registers `sync_bn_forward_var`: 2 inputs (input, mean), 1 output (var).
PARROTS_EXTENSION_REGISTER(sync_bn_forward_var)
.input(2)
.output(1)
.apply(sync_bn_forward_var_cuda)
.done();
// Device dispatcher for sync_bn_forward_var; SyncBN has no CPU backend.
void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) {
  if (!input.device().is_cuda()) {
    AT_ERROR("SyncBatchNorm is not implemented on CPU");
  }
#ifdef MMCV_WITH_CUDA
  CHECK_CUDA_INPUT(input);
  CHECK_CUDA_INPUT(mean);
  CHECK_CUDA_INPUT(var);
  sync_bn_forward_var_cuda(input, mean, var);
#else
  AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
}
// Registers `sync_bn_forward_output`: 5 inputs (input, mean, var, weight,
// bias), 5 outputs (running_mean, running_var, norm, std, output).
PARROTS_EXTENSION_REGISTER(sync_bn_forward_output)
.attr("eps")
.attr("momentum")
.attr("group_size")
.input(5)
.output(5)
.apply(sync_bn_forward_output_cuda)
.done();
// Device dispatcher for sync_bn_forward_output; SyncBN has no CPU backend.
void sync_bn_forward_output(const Tensor input, const Tensor mean,
                            const Tensor var, const Tensor weight,
                            const Tensor bias, Tensor running_mean,
                            Tensor running_var, Tensor norm, Tensor std,
                            Tensor output, float eps, float momentum,
                            int group_size) {
  if (!input.device().is_cuda()) {
    AT_ERROR("SyncBatchNorm is not implemented on CPU");
  }
#ifdef MMCV_WITH_CUDA
  // Validate every tensor participating in the kernel.
  CHECK_CUDA_INPUT(input);
  CHECK_CUDA_INPUT(mean);
  CHECK_CUDA_INPUT(var);
  CHECK_CUDA_INPUT(weight);
  CHECK_CUDA_INPUT(bias);
  CHECK_CUDA_INPUT(running_mean);
  CHECK_CUDA_INPUT(running_var);
  CHECK_CUDA_INPUT(norm);
  CHECK_CUDA_INPUT(std);
  CHECK_CUDA_INPUT(output);
  sync_bn_forward_output_cuda(input, mean, var, running_mean, running_var,
                              weight, bias, norm, std, output, eps, momentum,
                              group_size);
#else
  AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
}
// Registers `sync_bn_backward_param`: 2 inputs (grad_output, norm),
// 2 outputs (grad_weight, grad_bias).
PARROTS_EXTENSION_REGISTER(sync_bn_backward_param)
.input(2)
.output(2)
.apply(sync_bn_backward_param_cuda)
.done();
// Device dispatcher for sync_bn_backward_param; SyncBN has no CPU backend.
void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
                            Tensor grad_weight, Tensor grad_bias) {
  if (!grad_output.device().is_cuda()) {
    AT_ERROR("SyncBatchNorm is not implemented on CPU");
  }
#ifdef MMCV_WITH_CUDA
  CHECK_CUDA_INPUT(grad_output);
  CHECK_CUDA_INPUT(norm);
  CHECK_CUDA_INPUT(grad_weight);
  CHECK_CUDA_INPUT(grad_bias);
  sync_bn_backward_param_cuda(grad_output, norm, grad_weight, grad_bias);
#else
  AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
}
// Registers `sync_bn_backward_data`: 6 inputs (grad_output, weight,
// grad_weight, grad_bias, norm, std), 1 output (grad_input).
PARROTS_EXTENSION_REGISTER(sync_bn_backward_data)
.input(6)
.output(1)
.apply(sync_bn_backward_data_cuda)
.done();
// Device dispatcher for sync_bn_backward_data; SyncBN has no CPU backend.
void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
                           const Tensor grad_weight, const Tensor grad_bias,
                           const Tensor norm, const Tensor std,
                           Tensor grad_input) {
  if (!grad_output.device().is_cuda()) {
    AT_ERROR("SyncBatchNorm is not implemented on CPU");
  }
#ifdef MMCV_WITH_CUDA
  CHECK_CUDA_INPUT(grad_output);
  CHECK_CUDA_INPUT(weight);
  CHECK_CUDA_INPUT(grad_weight);
  CHECK_CUDA_INPUT(grad_bias);
  CHECK_CUDA_INPUT(norm);
  CHECK_CUDA_INPUT(std);
  CHECK_CUDA_INPUT(grad_input);
  sync_bn_backward_data_cuda(grad_output, weight, grad_weight, grad_bias, norm,
                             std, grad_input);
#else
  AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
}
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
#include "sync_bn_cuda_kernel.cuh"
// Launches the SyncBN forward-mean kernel: one block per channel reduces
// `input` into the float32 `mean` buffer (statistics are accumulated in
// float regardless of the input dtype — the kernel takes float* for mean).
void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean) {
  int num = input.size(0);
  int channels = input.size(1);
  int spatial = input.size(2);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] {
        sync_bn_forward_mean_cuda_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                input.data_ptr<scalar_t>(), mean.data_ptr<float>(), num,
                channels, spatial);
      });
  AT_CUDA_CHECK(cudaGetLastError());
}
// Launches the SyncBN forward-variance kernel: one block per channel.
// Note: the dispatch label previously said "sync_bn_forward_mean_cuda_kernel"
// (copy-paste); corrected so error messages name the right kernel.
void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
                                        Tensor var) {
  int num = input.size(0);
  int channels = input.size(1);
  int spatial = input.size(2);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sync_bn_forward_var_cuda_kernel", [&] {
        sync_bn_forward_var_cuda_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                input.data_ptr<scalar_t>(), mean.data_ptr<float>(),
                var.data_ptr<float>(), num, channels, spatial);
      });
  AT_CUDA_CHECK(cudaGetLastError());
}
// Launches the SyncBN forward-output kernel: one block per channel applies
// normalization and the affine transform, and updates running statistics.
// All statistics buffers are float32; only input/output carry the dispatched
// scalar type. Dispatch label corrected from the copy-pasted
// "sync_bn_forward_mean_cuda_kernel".
void SyncBNForwardOutputCUDAKernelLauncher(
    const Tensor input, const Tensor mean, const Tensor var,
    Tensor running_mean, Tensor running_var, const Tensor weight,
    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
    float momentum, int group_size) {
  int num = input.size(0);
  int channels = input.size(1);
  int spatial = input.size(2);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sync_bn_forward_output_cuda_kernel", [&] {
        sync_bn_forward_output_cuda_kernel<scalar_t>
            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
                input.data_ptr<scalar_t>(), mean.data_ptr<float>(),
                var.data_ptr<float>(), running_mean.data_ptr<float>(),
                running_var.data_ptr<float>(), weight.data_ptr<float>(),
                bias.data_ptr<float>(), norm.data_ptr<float>(),
                std.data_ptr<float>(), output.data_ptr<scalar_t>(), num,
                channels, spatial, eps, momentum, group_size);
      });
  AT_CUDA_CHECK(cudaGetLastError());
}
void SyncBNBackwardParamCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite norm,
DArrayLite grad_weight,
DArrayLite grad_bias,
cudaStream_t stream) {
int num = grad_output.dim(0);
int channels = grad_output.dim(1);
int spatial = grad_output.dim(2);
void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
const Tensor norm,
Tensor grad_weight,
Tensor grad_bias) {
int num = grad_output.size(0);
int channels = grad_output.size(1);
int spatial = grad_output.size(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "sync_bn_backward_param_cuda_kernel", [&] {
sync_bn_backward_param_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
grad_output.ptr<scalar_t>(), norm.ptr<float>(),
grad_weight.ptr<float>(), grad_bias.ptr<float>(), num, channels,
spatial);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
grad_output.data_ptr<scalar_t>(), norm.data_ptr<float>(),
grad_weight.data_ptr<float>(), grad_bias.data_ptr<float>(), num,
channels, spatial);
});
AT_CUDA_CHECK(cudaGetLastError());
}
void SyncBNBackwardDataCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite weight,
const DArrayLite grad_weight, const DArrayLite grad_bias,
const DArrayLite norm, const DArrayLite std, DArrayLite grad_input,
cudaStream_t stream) {
int output_size = grad_input.size();
int num = grad_input.dim(0);
int channels = grad_input.dim(1);
int spatial = grad_input.dim(2);
void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
const Tensor weight,
const Tensor grad_weight,
const Tensor grad_bias,
const Tensor norm, const Tensor std,
Tensor grad_input) {
int output_size = grad_input.numel();
int num = grad_input.size(0);
int channels = grad_input.size(1);
int spatial = grad_input.size(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(grad_input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "sync_bn_backward_data_cuda_kernel", [&] {
sync_bn_backward_data_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), weight.ptr<float>(),
grad_weight.ptr<float>(), grad_bias.ptr<float>(),
norm.ptr<float>(), std.ptr<float>(), grad_input.ptr<scalar_t>(),
num, channels, spatial);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
output_size, grad_output.data_ptr<scalar_t>(),
weight.data_ptr<float>(), grad_weight.data_ptr<float>(),
grad_bias.data_ptr<float>(), norm.data_ptr<float>(),
std.data_ptr<float>(), grad_input.data_ptr<scalar_t>(), num,
channels, spatial);
});
AT_CUDA_CHECK(cudaGetLastError());
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "sync_bn_pytorch.h"
using namespace parrots;
// Parrots binding: reduce ins = (input) to a per-channel mean in outs = (mean)
// via the SyncBN forward-mean CUDA entry point.
void sync_bn_forward_mean_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                       const OperatorBase::in_list_t& ins,
                                       OperatorBase::out_list_t& outs) {
  auto input_t = buildATensor(ctx, ins[0]);
  auto mean_t = buildATensor(ctx, outs[0]);
  sync_bn_forward_mean_cuda(input_t, mean_t);
}
// Parrots binding: compute per-channel variance from ins = (input, mean) into
// outs = (var) via the SyncBN forward-var CUDA entry point.
void sync_bn_forward_var_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  auto input_t = buildATensor(ctx, ins[0]);
  auto mean_t = buildATensor(ctx, ins[1]);
  auto var_t = buildATensor(ctx, outs[0]);
  sync_bn_forward_var_cuda(input_t, mean_t, var_t);
}
// Parrots binding for the SyncBN forward-output CUDA entry point.
//   ins  = (input, mean, var, weight, bias)
//   outs = (running_mean, running_var, norm, std, output)
// Attributes: eps, momentum, group_size (forwarded to the kernel).
void sync_bn_forward_output_cuda_parrots(CudaContext& ctx,
                                         const SSElement& attr,
                                         const OperatorBase::in_list_t& ins,
                                         OperatorBase::out_list_t& outs) {
  size_t group_size;
  float eps, momentum;
  SSAttrs(attr)
      .get<float>("eps", eps)
      .get<float>("momentum", momentum)
      .get<size_t>("group_size", group_size)
      .done();

  const auto& input = buildATensor(ctx, ins[0]);
  const auto& mean = buildATensor(ctx, ins[1]);
  const auto& var = buildATensor(ctx, ins[2]);
  const auto& weight = buildATensor(ctx, ins[3]);
  const auto& bias = buildATensor(ctx, ins[4]);
  auto running_mean = buildATensor(ctx, outs[0]);
  auto running_var = buildATensor(ctx, outs[1]);
  auto norm = buildATensor(ctx, outs[2]);
  auto std = buildATensor(ctx, outs[3]);
  // BUG FIX: `output` previously aliased outs[3] (same slot as `std`), so the
  // real output tensor (5th output declared via .output(5)) was never written
  // and `std` was clobbered.  It must come from outs[4].
  auto output = buildATensor(ctx, outs[4]);
  sync_bn_forward_output_cuda(input, mean, var, running_mean, running_var,
                              weight, bias, norm, std, output, eps, momentum,
                              group_size);
}
// Parrots binding: SyncBN backward pass for the affine parameters.
//   ins  = (grad_output, norm)
//   outs = (grad_weight, grad_bias)
void sync_bn_backward_param_cuda_parrots(CudaContext& ctx,
                                         const SSElement& attr,
                                         const OperatorBase::in_list_t& ins,
                                         OperatorBase::out_list_t& outs) {
  auto grad_out_t = buildATensor(ctx, ins[0]);
  auto norm_t = buildATensor(ctx, ins[1]);
  auto grad_weight_t = buildATensor(ctx, outs[0]);
  auto grad_bias_t = buildATensor(ctx, outs[1]);
  sync_bn_backward_param_cuda(grad_out_t, norm_t, grad_weight_t, grad_bias_t);
}
// Parrots binding: SyncBN backward pass w.r.t. the input.
//   ins  = (grad_output, weight, grad_weight, grad_bias, norm, std)
//   outs = (grad_input)
void sync_bn_backward_data_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  auto grad_out_t = buildATensor(ctx, ins[0]);
  auto weight_t = buildATensor(ctx, ins[1]);
  auto grad_weight_t = buildATensor(ctx, ins[2]);
  auto grad_bias_t = buildATensor(ctx, ins[3]);
  auto norm_t = buildATensor(ctx, ins[4]);
  auto std_t = buildATensor(ctx, ins[5]);
  auto grad_in_t = buildATensor(ctx, outs[0]);
  sync_bn_backward_data_cuda(grad_out_t, weight_t, grad_weight_t, grad_bias_t,
                             norm_t, std_t, grad_in_t);
}
// Parrots op registrations for SyncBN.  Attribute names and the input/output
// counts must stay in sync with the *_parrots wrapper functions above and
// with the Python-side callers.

// ins: (input) -> outs: (mean)
PARROTS_EXTENSION_REGISTER(sync_bn_forward_mean)
    .input(1)
    .output(1)
    .apply(sync_bn_forward_mean_cuda_parrots)
    .done();

// ins: (input, mean) -> outs: (var)
PARROTS_EXTENSION_REGISTER(sync_bn_forward_var)
    .input(2)
    .output(1)
    .apply(sync_bn_forward_var_cuda_parrots)
    .done();

// ins: (input, mean, var, weight, bias)
// outs: (running_mean, running_var, norm, std, output)
PARROTS_EXTENSION_REGISTER(sync_bn_forward_output)
    .attr("eps")
    .attr("momentum")
    .attr("group_size")
    .input(5)
    .output(5)
    .apply(sync_bn_forward_output_cuda_parrots)
    .done();

// ins: (grad_output, norm) -> outs: (grad_weight, grad_bias)
PARROTS_EXTENSION_REGISTER(sync_bn_backward_param)
    .input(2)
    .output(2)
    .apply(sync_bn_backward_param_cuda_parrots)
    .done();

// ins: (grad_output, weight, grad_weight, grad_bias, norm, std)
// outs: (grad_input)
PARROTS_EXTENSION_REGISTER(sync_bn_backward_data)
    .input(6)
    .output(1)
    .apply(sync_bn_backward_data_cuda_parrots)
    .done();
#ifndef SYNC_BN_PYTORCH_H
#define SYNC_BN_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// CUDA entry points for synchronized batch normalization.  Each function
// forwards to a kernel launcher defined in the corresponding .cu file.

// Reduce `input` to a per-channel mean, written into `mean`.
void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean);

// Compute the per-channel variance of `input` given `mean`, into `var`.
void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,
                              Tensor var);

// Produce the normalized `output` and update running statistics.
// `eps`, `momentum` and `group_size` are forwarded to the kernel
// (see the launcher in the .cu file for their exact use).
void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,
                                 const Tensor var, Tensor running_mean,
                                 Tensor running_var, const Tensor weight,
                                 const Tensor bias, Tensor norm, Tensor std,
                                 Tensor output, float eps, float momentum,
                                 int group_size);

// Compute parameter gradients (grad_weight, grad_bias) from grad_output.
void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,
                                 Tensor grad_weight, Tensor grad_bias);

// Compute grad_input from grad_output and the saved forward statistics.
void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
                                const Tensor grad_weight,
                                const Tensor grad_bias, const Tensor norm,
                                const Tensor std, Tensor grad_input);

#endif  // SYNC_BN_PYTORCH_H
#include "parrots_cpp_helper.hpp"
void TINShiftForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite shift,
DArrayLite output, cudaStream_t stream);
void TINShiftBackwardCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite shift,
DArrayLite grad_input,
cudaStream_t stream);
void tin_shift_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &input = ins[0];
const auto &shift = ins[1];
auto &output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
TINShiftForwardCUDAKernelLauncher(input, shift, output, stream);
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,
Tensor output);
void TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,
Tensor grad_input);
// Thin dispatcher: forwards the TIN shift forward arguments straight to the
// CUDA kernel launcher.
void tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output) {
  TINShiftForwardCUDAKernelLauncher(input, shift, output);
}
void tin_shift_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &grad_output = ins[0];
const auto &shift = ins[1];
auto &grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
TINShiftBackwardCUDAKernelLauncher(grad_output, shift, grad_input, stream);
// Thin dispatcher: forwards the TIN shift backward arguments straight to the
// CUDA kernel launcher.
void tin_shift_backward_cuda(Tensor grad_output, Tensor shift,
                             Tensor grad_input) {
  TINShiftBackwardCUDAKernelLauncher(grad_output, shift, grad_input);
}
PARROTS_EXTENSION_REGISTER(tin_shift_forward)
.input(2)
.output(1)
.apply(tin_shift_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(tin_shift_backward)
.input(2)
.output(1)
.apply(tin_shift_backward_cuda)
.done();
#endif
// Device dispatcher for TIN shift forward.  Only a CUDA implementation
// exists; any non-CUDA input raises.  AT_ERROR throws, so the guard clause
// below never falls through.
void tin_shift_forward(Tensor input, Tensor shift, Tensor output) {
  if (!input.device().is_cuda()) {
    AT_ERROR("TINShift is not implemented on CPU");
  }
#ifdef MMCV_WITH_CUDA
  CHECK_CUDA_INPUT(input);
  CHECK_CUDA_INPUT(shift);
  CHECK_CUDA_INPUT(output);
  tin_shift_forward_cuda(input, shift, output);
#else
  AT_ERROR("TINShift is not compiled with GPU support");
#endif
}
// Device dispatcher for TIN shift backward.  Only a CUDA implementation
// exists; any non-CUDA grad_output raises.  AT_ERROR throws, so the guard
// clause below never falls through.
void tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input) {
  if (!grad_output.device().is_cuda()) {
    AT_ERROR("TINShift is not implemented on CPU");
  }
#ifdef MMCV_WITH_CUDA
  CHECK_CUDA_INPUT(grad_output);
  CHECK_CUDA_INPUT(shift);
  CHECK_CUDA_INPUT(grad_input);
  tin_shift_backward_cuda(grad_output, shift, grad_input);
#else
  AT_ERROR("TINShift is not compiled with GPU support");
#endif
}
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
#include "tin_shift_cuda_kernel.cuh"
void TINShiftForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite shift,
DArrayLite output, cudaStream_t stream) {
int output_size = output.size();
int batch_size = input.dim(0);
int t_size = input.dim(1);
int channels = input.dim(2);
int hw_size = input.dim(3);
int group_size = shift.dim(1);
void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,
Tensor output) {
int output_size = output.numel();
int batch_size = input.size(0);
int t_size = input.size(1);
int channels = input.size(2);
int hw_size = input.size(3);
int group_size = shift.size(1);
int group_channel = channels / group_size;
int num_kernels = batch_size * hw_size * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "tin_shift_forward_cuda_kernel", [&] {
tin_shift_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), shift.ptr<int>(),
output.ptr<scalar_t>(), batch_size, channels, t_size, hw_size,
group_size, group_channel);
}));
output_size, input.data_ptr<scalar_t>(), shift.data_ptr<int>(),
output.data_ptr<scalar_t>(), batch_size, channels, t_size,
hw_size, group_size, group_channel);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
void TINShiftBackwardCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite shift,
DArrayLite grad_input,
cudaStream_t stream) {
int output_size = grad_output.size();
int batch_size = grad_output.dim(0);
int t_size = grad_output.dim(1);
int channels = grad_output.dim(2);
int hw_size = grad_output.dim(3);
int group_size = shift.dim(1);
void TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,
Tensor grad_input) {
int output_size = grad_output.numel();
int batch_size = grad_output.size(0);
int t_size = grad_output.size(1);
int channels = grad_output.size(2);
int hw_size = grad_output.size(3);
int group_size = shift.size(1);
int group_channel = channels / group_size;
int num_kernels = batch_size * hw_size * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "tin_shift_backward_cuda_kernel", [&] {
tin_shift_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), shift.ptr<int>(),
grad_input.ptr<scalar_t>(), batch_size, channels, t_size,
hw_size, group_size, group_channel);
}));
output_size, grad_output.data_ptr<scalar_t>(),
shift.data_ptr<int>(), grad_input.data_ptr<scalar_t>(),
batch_size, channels, t_size, hw_size, group_size,
group_channel);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "tin_shift_pytorch.h"
using namespace parrots;
// Parrots binding: TIN shift forward.  ins = (input, shift), outs = (output).
void tin_shift_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                    const OperatorBase::in_list_t &ins,
                                    OperatorBase::out_list_t &outs) {
  auto input_t = buildATensor(ctx, ins[0]);
  auto shift_t = buildATensor(ctx, ins[1]);
  auto output_t = buildATensor(ctx, outs[0]);
  tin_shift_forward_cuda(input_t, shift_t, output_t);
}
// Parrots binding: TIN shift backward.
// ins = (grad_output, shift), outs = (grad_input).
void tin_shift_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
                                     const OperatorBase::in_list_t &ins,
                                     OperatorBase::out_list_t &outs) {
  auto grad_out_t = buildATensor(ctx, ins[0]);
  auto shift_t = buildATensor(ctx, ins[1]);
  auto grad_in_t = buildATensor(ctx, outs[0]);
  tin_shift_backward_cuda(grad_out_t, shift_t, grad_in_t);
}
// Parrots op registrations for TIN shift.  Input/output counts must match
// the *_parrots wrappers above.

// ins: (input, shift) -> outs: (output)
PARROTS_EXTENSION_REGISTER(tin_shift_forward)
    .input(2)
    .output(1)
    .apply(tin_shift_forward_cuda_parrots)
    .done();

// ins: (grad_output, shift) -> outs: (grad_input)
PARROTS_EXTENSION_REGISTER(tin_shift_backward)
    .input(2)
    .output(1)
    .apply(tin_shift_backward_cuda_parrots)
    .done();
#ifndef TIN_SHIFT_PYTORCH_H
#define TIN_SHIFT_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// CUDA entry points for the TIN shift op; both forward to kernel launchers
// defined in the corresponding .cu file.
void tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output);
void tin_shift_backward_cuda(Tensor grad_output, Tensor shift,
                             Tensor grad_input);
#endif  // TIN_SHIFT_PYTORCH_H
......@@ -134,28 +134,12 @@ def nms(boxes, scores, iou_threshold, offset=0):
assert offset in (0, 1)
if torch.__version__ == 'parrots':
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
areas = (x2 - x1 + offset) * (y2 - y1 + offset)
_, order = scores.sort(0, descending=True)
if boxes.device == 'cpu':
indata_list = [boxes, order, areas]
indata_dict = {
'iou_threshold': float(iou_threshold),
'offset': int(offset)
}
select = ext_module.nms(*indata_list, **indata_dict).byte()
else:
boxes_sorted = boxes.index_select(0, order)
indata_list = [boxes_sorted, order, areas]
indata_dict = {
'iou_threshold': float(iou_threshold),
'offset': int(offset)
}
select = ext_module.nms(*indata_list, **indata_dict)
inds = order.masked_select(select)
indata_list = [boxes, scores]
indata_dict = {
'iou_threshold': float(iou_threshold),
'offset': int(offset)
}
inds = ext_module.nms(*indata_list, **indata_dict)
else:
inds = NMSop.apply(boxes, scores, iou_threshold, offset)
dets = torch.cat((boxes[inds], scores[inds].reshape(-1, 1)), dim=1)
......@@ -219,12 +203,8 @@ def soft_nms(boxes,
assert method in method_dict.keys()
if torch.__version__ == 'parrots':
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
areas = (x2 - x1 + offset) * (y2 - y1 + offset)
indata_list = [boxes.cpu(), scores.cpu(), areas.cpu()]
dets = boxes.new_empty((boxes.size(0), 5), device='cpu')
indata_list = [boxes.cpu(), scores.cpu(), dets.cpu()]
indata_dict = {
'iou_threshold': float(iou_threshold),
'sigma': float(sigma),
......@@ -232,8 +212,7 @@ def soft_nms(boxes,
'method': method_dict[method],
'offset': int(offset)
}
dets, inds, num_out = ext_module.softnms(*indata_list, **indata_dict)
inds = inds[:num_out]
inds = ext_module.softnms(*indata_list, **indata_dict)
else:
dets, inds = SoftNMSop.apply(boxes.cpu(), scores.cpu(),
float(iou_threshold), float(sigma),
......@@ -343,7 +322,11 @@ def nms_match(dets, iou_threshold):
dets_t = dets.detach().cpu()
else:
dets_t = torch.from_numpy(dets)
matched = ext_module.nms_match(dets_t, float(iou_threshold))
indata_list = [dets_t]
indata_dict = {'iou_threshold': float(iou_threshold)}
matched = ext_module.nms_match(*indata_list, **indata_dict)
if torch.__version__ == 'parrots':
matched = matched.tolist()
if isinstance(dets, torch.Tensor):
return [dets.new_tensor(m, dtype=torch.long) for m in matched]
......@@ -380,16 +363,13 @@ def nms_rotated(dets, scores, iou_threshold, labels=None):
dets_sorted = dets_wl.index_select(0, order)
if torch.__version__ == 'parrots':
select = torch.zeros((dets.shape[0]),
dtype=torch.int64).to(dets.device)
ext_module.nms_rotated(
keep_inds = ext_module.nms_rotated(
dets_wl,
scores,
order,
dets_sorted,
select,
iou_threshold=iou_threshold,
multi_label=multi_label)
keep_inds = order.masked_select(select == 1)
else:
keep_inds = ext_module.nms_rotated(dets_wl, scores, order, dets_sorted,
iou_threshold, multi_label)
......
......@@ -15,12 +15,19 @@ if torch.__version__ != 'parrots':
else:
from parrots import extension
has_return_value_ops = [
'nms', 'softnms', 'nms_match', 'nms_rotated', 'top_pool_forward',
'top_pool_backward', 'bottom_pool_forward', 'bottom_pool_backward',
'left_pool_forward', 'left_pool_backward', 'right_pool_forward',
'right_pool_backward'
]
def load_ext(name, funcs):
ExtModule = namedtuple('ExtModule', funcs)
ext_list = []
lib_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
for fun in funcs:
if fun in ['nms', 'softnms']:
if fun in has_return_value_ops:
ext_list.append(extension.load(fun, name, lib_dir=lib_root).op)
else:
ext_list.append(
......
......@@ -184,20 +184,32 @@ def get_extensions():
if EXT_TYPE == 'parrots':
ext_name = 'mmcv._ext'
from parrots.utils.build_extension import Extension
define_macros = [('MMCV_USE_PARROTS', None)]
op_files = glob.glob('./mmcv/ops/csrc/parrots/*')
include_path = os.path.abspath('./mmcv/ops/csrc')
# new parrots op impl do not use MMCV_USE_PARROTS
# define_macros = [('MMCV_USE_PARROTS', None)]
define_macros = []
op_files = glob.glob('./mmcv/ops/csrc/parrots/*.cu') +\
glob.glob('./mmcv/ops/csrc/parrots/*.cpp')
include_dirs = [os.path.abspath('./mmcv/ops/csrc')]
cuda_args = os.getenv('MMCV_CUDA_ARGS')
extra_compile_args = {
'nvcc': [cuda_args] if cuda_args else [],
'cxx': [],
}
if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
define_macros += [('MMCV_WITH_CUDA', None)]
extra_compile_args['nvcc'] += [
'-D__CUDA_NO_HALF_OPERATORS__',
'-D__CUDA_NO_HALF_CONVERSIONS__',
'-D__CUDA_NO_HALF2_OPERATORS__',
]
ext_ops = Extension(
name=ext_name,
sources=op_files,
include_dirs=[include_path],
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args={
'nvcc': [cuda_args] if cuda_args else [],
'cxx': [],
},
cuda=True)
extra_compile_args=extra_compile_args,
cuda=True,
pytorch=True)
extensions.append(ext_ops)
elif EXT_TYPE == 'pytorch':
ext_name = 'mmcv._ext'
......
......@@ -60,8 +60,7 @@ class TestDeformRoIPool(object):
sampling_ratio=sampling_ratio).cuda()
if _USING_PARROTS:
pass
# gradcheck(droipool, (x, rois), no_grads=[rois])
gradcheck(droipool, (x, rois), no_grads=[rois])
else:
gradcheck(droipool, (x, rois), eps=1e-2, atol=1e-2)
......@@ -90,7 +89,6 @@ class TestDeformRoIPool(object):
sampling_ratio=sampling_ratio).cuda()
if _USING_PARROTS:
pass
# gradcheck(droipool, (x, rois), no_grads=[rois])
gradcheck(droipool, (x, rois), no_grads=[rois])
else:
gradcheck(droipool, (x, rois), eps=1e-2, atol=1e-2)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment