Unverified commit 48d99025, authored by z55250825 and committed by GitHub

Add new parrots extension implementation for all ops (#794)

* delete all parrots file
add bbox_overlaps new parrots op impl

* support first new impl parrots op (bbox_overlaps)(success test)

* add box_iou_rotated op, test succeed

* add carafe and carafe_naive op, test succeed (one parrots bug need fix)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test success (but has question)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#include "roi_align_cuda_kernel.cuh" #include "roi_align_cuda_kernel.cuh"
void ROIAlignForwardCUDAKernelLauncher(DArrayLite input, DArrayLite rois, void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
DArrayLite output, DArrayLite argmax_y, Tensor argmax_y, Tensor argmax_x,
DArrayLite argmax_x, int aligned_height, int aligned_height, int aligned_width,
int aligned_width, float spatial_scale, float spatial_scale, int sampling_ratio,
int sampling_ratio, int pool_mode, int pool_mode, bool aligned) {
bool aligned, cudaStream_t stream) { int output_size = output.numel();
int output_size = output.size(); int channels = input.size(1);
int channels = input.dim(1); int height = input.size(2);
int height = input.dim(2); int width = input.size(3);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "roi_align_forward_cuda_kernel", [&] {
roi_align_forward_cuda_kernel<scalar_t> roi_align_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(), output_size, input.data_ptr<scalar_t>(),
output.ptr<scalar_t>(), argmax_y.ptr<scalar_t>(), rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
argmax_x.ptr<scalar_t>(), aligned_height, aligned_width, argmax_y.data_ptr<scalar_t>(), argmax_x.data_ptr<scalar_t>(),
aligned_height, aligned_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode, static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
aligned, channels, height, width); aligned, channels, height, width);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void ROIAlignBackwardCUDAKernelLauncher( void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
DArrayLite grad_output, DArrayLite rois, DArrayLite argmax_y, Tensor argmax_y, Tensor argmax_x,
DArrayLite argmax_x, DArrayLite grad_input, int aligned_height, Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, int aligned_width, float spatial_scale,
bool aligned, cudaStream_t stream) { int sampling_ratio, int pool_mode,
int output_size = grad_output.size(); bool aligned) {
int channels = grad_input.dim(1); int output_size = grad_output.numel();
int height = grad_input.dim(2); int channels = grad_input.size(1);
int width = grad_input.dim(3); int height = grad_input.size(2);
int width = grad_input.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(grad_output.device());
grad_output.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "roi_align_backward_cuda_kernel", [&] {
roi_align_backward_cuda_kernel<scalar_t> roi_align_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), rois.ptr<scalar_t>(), output_size, grad_output.data_ptr<scalar_t>(),
argmax_y.ptr<scalar_t>(), argmax_x.ptr<scalar_t>(), rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), aligned_height, aligned_width, argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
aligned_height, aligned_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode, static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
aligned, channels, height, width); aligned, channels, height, width);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "roi_align_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void roi_align_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& rois = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
auto argmax_y = buildATensor(ctx, outs[1]);
auto argmax_x = buildATensor(ctx, outs[2]);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
const auto& grad_output = buildATensor(ctx, ins[0]);
const auto& rois = buildATensor(ctx, ins[1]);
const auto& argmax_y = buildATensor(ctx, ins[2]);
const auto& argmax_x = buildATensor(ctx, ins[3]);
auto grad_input = buildATensor(ctx, outs[0]);
roi_align_backward_cuda(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
#endif
void roi_align_forward_cpu_parrots(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& rois = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
auto argmax_y = buildATensor(ctx, outs[1]);
auto argmax_x = buildATensor(ctx, outs[2]);
roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode,
aligned);
}
void roi_align_backward_cpu_parrots(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
const auto& grad_output = buildATensor(ctx, ins[0]);
const auto& rois = buildATensor(ctx, ins[1]);
const auto& argmax_y = buildATensor(ctx, ins[2]);
const auto& argmax_x = buildATensor(ctx, ins[3]);
auto grad_input = buildATensor(ctx, outs[0]);
roi_align_backward_cpu(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
PARROTS_EXTENSION_REGISTER(roi_align_forward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(2)
.output(3)
.apply(roi_align_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(roi_align_forward_cuda_parrots)
#endif
.done();
PARROTS_EXTENSION_REGISTER(roi_align_backward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(4)
.output(1)
.apply(roi_align_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(roi_align_backward_cuda_parrots)
#endif
.done();
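
For reference, the registration above is what the Python side talks to: two positional input tensors, three pre-allocated output tensors, and the six scalar attributes passed as keywords. A minimal invocation sketch (shapes and attribute values are illustrative assumptions, not taken from this commit):

# Hypothetical call into the roi_align_forward op registered above.
# Assumes an mmcv build against Parrots; shapes/values are illustrative only.
import torch
from mmcv.utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['roi_align_forward'])

input = torch.rand(2, 16, 32, 32).cuda()
rois = torch.tensor([[0., 0., 0., 15., 15.]]).cuda()  # (batch_idx, x1, y1, x2, y2)
output = input.new_zeros(1, 16, 7, 7)    # outs[0]
argmax_y = input.new_zeros(1, 16, 7, 7)  # outs[1]
argmax_x = input.new_zeros(1, 16, 7, 7)  # outs[2]

ext_module.roi_align_forward(
    input, rois, output, argmax_y, argmax_x,
    aligned_height=7, aligned_width=7, spatial_scale=0.125,
    sampling_ratio=0, pool_mode=1, aligned=True)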
#ifndef ROI_ALIGN_PYTORCH_H
#define ROI_ALIGN_PYTORCH_H
#include <torch/extension.h>
using namespace at;
#ifdef MMCV_WITH_CUDA
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
#endif
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned);
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
#endif // ROI_ALIGN_PYTORCH_H
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void ROIPoolForwardCUDAKernelLauncher(const DArrayLite input, #ifdef MMCV_WITH_CUDA
const DArrayLite rois, DArrayLite output, void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
DArrayLite argmax, int pooled_height, Tensor argmax, int pooled_height,
int pooled_width, float spatial_scale, int pooled_width, float spatial_scale);
cudaStream_t stream);
void ROIPoolBackwardCUDAKernelLauncher(const DArrayLite grad_output, void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
const DArrayLite rois, Tensor argmax, Tensor grad_input,
const DArrayLite argmax, int pooled_height, int pooled_width,
DArrayLite grad_input, int pooled_height, float spatial_scale);
int pooled_width, float spatial_scale,
cudaStream_t stream);
void roi_pool_forward_cuda(CudaContext& ctx, const SSElement& attr, void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
const OperatorBase::in_list_t& ins, Tensor argmax, int pooled_height, int pooled_width,
OperatorBase::out_list_t& outs) { float spatial_scale) {
int pooled_height;
int pooled_width;
float spatial_scale;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
auto& output = outs[0];
auto& argmax = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height, ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale, stream); pooled_width, spatial_scale);
} }
void roi_pool_backward_cuda(CudaContext& ctx, const SSElement& attr, void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
const OperatorBase::in_list_t& ins, Tensor grad_input, int pooled_height,
OperatorBase::out_list_t& outs) { int pooled_width, float spatial_scale) {
int pooled_height; ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input,
int pooled_width; pooled_height, pooled_width, spatial_scale);
float spatial_scale; }
SSAttrs(attr) #endif
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.done();
const auto& grad_output = ins[0]; void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
const auto& rois = ins[1]; int pooled_height, int pooled_width,
const auto& argmax = ins[2]; float spatial_scale) {
auto& grad_input = outs[0]; if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax);
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream()); roi_pool_forward_cuda(input, rois, output, argmax, pooled_height,
ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input, pooled_width, spatial_scale);
pooled_height, pooled_width, spatial_scale, #else
stream); AT_ERROR("RoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("RoIPool is not implemented on CPU");
}
} }
PARROTS_EXTENSION_REGISTER(roi_pool_forward) void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
.attr("pooled_height") Tensor grad_input, int pooled_height, int pooled_width,
.attr("pooled_width") float spatial_scale) {
.attr("spatial_scale") if (grad_output.device().is_cuda()) {
.input(2) #ifdef MMCV_WITH_CUDA
.output(2) CHECK_CUDA_INPUT(grad_output);
.apply(roi_pool_forward_cuda) CHECK_CUDA_INPUT(rois);
.done(); CHECK_CUDA_INPUT(argmax);
CHECK_CUDA_INPUT(grad_input);
PARROTS_EXTENSION_REGISTER(roi_pool_backward) roi_pool_backward_cuda(grad_output, rois, argmax, grad_input, pooled_height,
.attr("pooled_height") pooled_width, spatial_scale);
.attr("pooled_width") #else
.attr("spatial_scale") AT_ERROR("RoIPool is not compiled with GPU support");
.input(3) #endif
.output(1) } else {
.apply(roi_pool_backward_cuda) AT_ERROR("RoIPool is not implemented on CPU");
.done(); }
}
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#include "roi_pool_cuda_kernel.cuh" #include "roi_pool_cuda_kernel.cuh"
void ROIPoolForwardCUDAKernelLauncher(const DArrayLite input, void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
const DArrayLite rois, DArrayLite output, Tensor argmax, int pooled_height,
DArrayLite argmax, int pooled_height, int pooled_width, float spatial_scale) {
int pooled_width, float spatial_scale, int output_size = output.numel();
cudaStream_t stream) { int channels = input.size(1);
int output_size = output.size(); int height = input.size(2);
int channels = input.dim(1); int width = input.size(3);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(input.elemType().prim(), [&] { at::cuda::CUDAGuard device_guard(input.device());
roi_pool_forward_cuda_kernel<scalar_t> cudaStream_t stream = at::cuda::getCurrentCUDAStream();
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(), input.scalar_type(), "roi_pool_forward_cuda_kernel", [&] {
output.ptr<scalar_t>(), argmax.ptr<int>(), pooled_height, roi_pool_forward_cuda_kernel<scalar_t>
pooled_width, spatial_scale, channels, height, width); <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
}); output_size, input.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
argmax.data_ptr<int>(), pooled_height, pooled_width,
static_cast<scalar_t>(spatial_scale), channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void ROIPoolBackwardCUDAKernelLauncher(const DArrayLite grad_output, void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
const DArrayLite rois, Tensor argmax, Tensor grad_input,
const DArrayLite argmax, int pooled_height, int pooled_width,
DArrayLite grad_input, int pooled_height, float spatial_scale) {
int pooled_width, float spatial_scale, int output_size = grad_output.numel();
cudaStream_t stream) { int channels = grad_input.size(1);
int output_size = grad_output.size(); int height = grad_input.size(2);
int channels = grad_output.dim(1); int width = grad_input.size(3);
int height = grad_output.dim(2);
int width = grad_output.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(grad_output.elemType().prim(), [&] { at::cuda::CUDAGuard device_guard(grad_output.device());
roi_pool_backward_cuda_kernel<scalar_t> cudaStream_t stream = at::cuda::getCurrentCUDAStream();
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
output_size, grad_output.ptr<scalar_t>(), rois.ptr<scalar_t>(), grad_output.scalar_type(), "roi_pool_backward_cuda_kernel", [&] {
argmax.ptr<int>(), grad_input.ptr<scalar_t>(), pooled_height, roi_pool_backward_cuda_kernel<scalar_t>
pooled_width, channels, height, width); <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
}); output_size, grad_output.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), argmax.data_ptr<int>(),
grad_input.data_ptr<scalar_t>(), pooled_height, pooled_width,
channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "roi_pool_pytorch.h"
using namespace parrots;
void roi_pool_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& rois = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
auto argmax = buildATensor(ctx, outs[1]);
roi_pool_forward_cuda(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
}
void roi_pool_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.done();
const auto& grad_output = buildATensor(ctx, ins[0]);
const auto& rois = buildATensor(ctx, ins[1]);
const auto& argmax = buildATensor(ctx, ins[2]);
auto grad_input = buildATensor(ctx, outs[0]);
roi_pool_backward_cuda(grad_output, rois, argmax, grad_input, pooled_height,
pooled_width, spatial_scale);
}
PARROTS_EXTENSION_REGISTER(roi_pool_forward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.input(2)
.output(2)
.apply(roi_pool_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(roi_pool_backward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.input(3)
.output(1)
.apply(roi_pool_backward_cuda_parrots)
.done();
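
The same calling convention applies to roi_pool; note that argmax is an integer tensor here, matching argmax.ptr<int>() in the kernel launcher. A sketch under the same assumptions as the roi_align example above:

# Hypothetical call into the roi_pool_forward op registered above
# (illustrative shapes; assumes an mmcv build against Parrots).
import torch
from mmcv.utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['roi_pool_forward'])

input = torch.rand(2, 16, 32, 32).cuda()
rois = torch.tensor([[0., 0., 0., 15., 15.]]).cuda()
output = input.new_zeros(1, 16, 7, 7)                   # outs[0]
argmax = input.new_zeros(1, 16, 7, 7, dtype=torch.int)  # outs[1], int32

ext_module.roi_pool_forward(input, rois, output, argmax,
                            pooled_height=7, pooled_width=7,
                            spatial_scale=0.125)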
#ifndef ROI_POOL_PYTORCH_H
#define ROI_POOL_PYTORCH_H
#include <torch/extension.h>
using namespace at;
#ifdef MMCV_WITH_CUDA
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale);
void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
#endif
#endif // ROI_POOL_PYTORCH_H
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void SyncBNForwardMeanCUDAKernelLauncher(const DArrayLite input, #ifdef MMCV_WITH_CUDA
DArrayLite mean, cudaStream_t stream); void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);
void SyncBNForwardVarCUDAKernelLauncher(const DArrayLite input, void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
const DArrayLite mean, DArrayLite var, Tensor var);
cudaStream_t stream);
void SyncBNForwardOutputCUDAKernelLauncher( void SyncBNForwardOutputCUDAKernelLauncher(
const DArrayLite input, const DArrayLite mean, const DArrayLite var, const Tensor input, const Tensor mean, const Tensor var,
DArrayLite running_mean, DArrayLite running_var, const DArrayLite weight, Tensor running_mean, Tensor running_var, const Tensor weight,
const DArrayLite bias, DArrayLite norm, DArrayLite std, DArrayLite output, const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
const float eps, const float momentum, size_t group_size, float momentum, int group_size);
cudaStream_t stream);
void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
void SyncBNBackwardParamCUDAKernelLauncher(const DArrayLite grad_output, const Tensor norm,
const DArrayLite norm, Tensor grad_weight,
DArrayLite weight_diff, Tensor grad_bias);
DArrayLite bias_diff,
cudaStream_t stream); void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
const Tensor weight,
void SyncBNBackwardDataCUDAKernelLauncher( const Tensor grad_weight,
const DArrayLite grad_output, const DArrayLite weight, const Tensor grad_bias,
const DArrayLite weight_diff, const DArrayLite bias_diff, const Tensor norm, const Tensor std,
const DArrayLite norm, const DArrayLite std, DArrayLite grad_input, Tensor grad_input);
cudaStream_t stream);
void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) {
void sync_bn_forward_mean_cuda(CudaContext& ctx, const SSElement& attr, SyncBNForwardMeanCUDAKernelLauncher(input, mean);
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& input = ins[0];
auto& mean = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardMeanCUDAKernelLauncher(input, mean, stream);
} }
void sync_bn_forward_var_cuda(CudaContext& ctx, const SSElement& attr, void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,
const OperatorBase::in_list_t& ins, Tensor var) {
OperatorBase::out_list_t& outs) { SyncBNForwardVarCUDAKernelLauncher(input, mean, var);
const auto& input = ins[0];
const auto& mean = ins[1];
auto& var = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardVarCUDAKernelLauncher(input, mean, var, stream);
} }
void sync_bn_forward_output_cuda(CudaContext& ctx, const SSElement& attr, void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,
const OperatorBase::in_list_t& ins, const Tensor var, Tensor running_mean,
OperatorBase::out_list_t& outs) { Tensor running_var, const Tensor weight,
size_t group_size; const Tensor bias, Tensor norm, Tensor std,
float eps, momentum; Tensor output, float eps, float momentum,
SSAttrs(attr) int group_size) {
.get<float>("eps", eps) SyncBNForwardOutputCUDAKernelLauncher(input, mean, var, running_mean,
.get<float>("momentum", momentum) running_var, weight, bias, norm, std,
.get<size_t>("group_size", group_size) output, eps, momentum, group_size);
.done();
const auto& input = ins[0];
const auto& mean = ins[1];
const auto& var = ins[2];
const auto& weight = ins[3];
const auto& bias = ins[4];
auto& running_mean = outs[0];
auto& running_var = outs[1];
auto& norm = outs[2];
auto& std = outs[3];
auto& output = outs[4];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardOutputCUDAKernelLauncher(
input, mean, var, running_mean, running_var, weight, bias, norm, std,
output, eps, momentum, group_size, stream);
} }
void sync_bn_backward_param_cuda(CudaContext& ctx, const SSElement& attr, void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,
const OperatorBase::in_list_t& ins, Tensor grad_weight, Tensor grad_bias) {
OperatorBase::out_list_t& outs) {
const auto& grad_output = ins[0];
const auto& norm = ins[1];
auto& grad_weight = outs[0];
auto& grad_bias = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight, SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight,
grad_bias, stream); grad_bias);
} }
void sync_bn_backward_data_cuda(CudaContext& ctx, const SSElement& attr, void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
const OperatorBase::in_list_t& ins, const Tensor grad_weight,
OperatorBase::out_list_t& outs) { const Tensor grad_bias, const Tensor norm,
const auto& grad_output = ins[0]; const Tensor std, Tensor grad_input) {
const auto& weight = ins[1];
const auto& grad_weight = ins[2];
const auto& grad_bias = ins[3];
const auto& norm = ins[4];
const auto& std = ins[5];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight, SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight,
grad_bias, norm, std, grad_input, grad_bias, norm, std, grad_input);
stream); }
#endif
void sync_bn_forward_mean(const Tensor input, Tensor mean) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(mean);
sync_bn_forward_mean_cuda(input, mean);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
} }
PARROTS_EXTENSION_REGISTER(sync_bn_forward_mean) void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) {
.input(1) if (input.device().is_cuda()) {
.output(1) #ifdef MMCV_WITH_CUDA
.apply(sync_bn_forward_mean_cuda) CHECK_CUDA_INPUT(input);
.done(); CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
PARROTS_EXTENSION_REGISTER(sync_bn_forward_var) sync_bn_forward_var_cuda(input, mean, var);
.input(2) #else
.output(1) AT_ERROR("SyncBatchNorm is not compiled with GPU support");
.apply(sync_bn_forward_var_cuda) #endif
.done(); } else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(sync_bn_forward_output) void sync_bn_forward_output(const Tensor input, const Tensor mean,
.attr("eps") const Tensor var, const Tensor weight,
.attr("momentum") const Tensor bias, Tensor running_mean,
.attr("group_size") Tensor running_var, Tensor norm, Tensor std,
.input(5) Tensor output, float eps, float momentum,
.output(5) int group_size) {
.apply(sync_bn_forward_output_cuda) if (input.device().is_cuda()) {
.done(); #ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(running_mean);
CHECK_CUDA_INPUT(running_var);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(std);
CHECK_CUDA_INPUT(output);
sync_bn_forward_output_cuda(input, mean, var, running_mean, running_var,
weight, bias, norm, std, output, eps, momentum,
group_size);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(sync_bn_backward_param) void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
.input(2) Tensor grad_weight, Tensor grad_bias) {
.output(2) if (grad_output.device().is_cuda()) {
.apply(sync_bn_backward_param_cuda) #ifdef MMCV_WITH_CUDA
.done(); CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
sync_bn_backward_param_cuda(grad_output, norm, grad_weight, grad_bias);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(sync_bn_backward_data) void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
.input(6) const Tensor grad_weight, const Tensor grad_bias,
.output(1) const Tensor norm, const Tensor std,
.apply(sync_bn_backward_data_cuda) Tensor grad_input) {
.done(); if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(std);
CHECK_CUDA_INPUT(grad_input);
sync_bn_backward_data_cuda(grad_output, weight, grad_weight, grad_bias,
norm, std, grad_input);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
}
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#include "sync_bn_cuda_kernel.cuh" #include "sync_bn_cuda_kernel.cuh"
void SyncBNForwardMeanCUDAKernelLauncher(const DArrayLite input, void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean) {
DArrayLite mean, cudaStream_t stream) { int num = input.size(0);
int num = input.dim(0); int channels = input.size(1);
int channels = input.dim(1); int spatial = input.size(2);
int spatial = input.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] {
sync_bn_forward_mean_cuda_kernel<scalar_t> sync_bn_forward_mean_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(input.ptr<scalar_t>(), <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
mean.ptr<float>(), num, input.data_ptr<scalar_t>(), mean.data_ptr<float>(), num,
channels, spatial); channels, spatial);
})); });
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void SyncBNForwardVarCUDAKernelLauncher(const DArrayLite input, void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
const DArrayLite mean, DArrayLite var, Tensor var) {
cudaStream_t stream) { int num = input.size(0);
int num = input.dim(0); int channels = input.size(1);
int channels = input.dim(1); int spatial = input.size(2);
int spatial = input.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] {
sync_bn_forward_var_cuda_kernel<scalar_t> sync_bn_forward_var_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>( <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
input.ptr<scalar_t>(), mean.ptr<float>(), var.ptr<float>(), num, input.data_ptr<scalar_t>(), mean.data_ptr<float>(),
channels, spatial); var.data_ptr<float>(), num, channels, spatial);
})); });
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void SyncBNForwardOutputCUDAKernelLauncher( void SyncBNForwardOutputCUDAKernelLauncher(
const DArrayLite input, const DArrayLite mean, const DArrayLite var, const Tensor input, const Tensor mean, const Tensor var,
DArrayLite running_mean, DArrayLite running_var, const DArrayLite weight, Tensor running_mean, Tensor running_var, const Tensor weight,
const DArrayLite bias, DArrayLite norm, DArrayLite std, DArrayLite output, const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
float eps, float momentum, size_t group_size, cudaStream_t stream) { float momentum, int group_size) {
int num = input.dim(0); int num = input.size(0);
int channels = input.dim(1); int channels = input.size(1);
int spatial = input.dim(2); int spatial = input.size(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] {
sync_bn_forward_output_cuda_kernel<scalar_t> sync_bn_forward_output_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>( <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
input.ptr<scalar_t>(), mean.ptr<float>(), var.ptr<float>(), input.data_ptr<scalar_t>(), mean.data_ptr<float>(),
running_mean.ptr<float>(), running_var.ptr<float>(), var.data_ptr<float>(), running_mean.data_ptr<float>(),
weight.ptr<float>(), bias.ptr<float>(), norm.ptr<float>(), running_var.data_ptr<float>(), weight.data_ptr<float>(),
std.ptr<float>(), output.ptr<scalar_t>(), num, channels, bias.data_ptr<float>(), norm.data_ptr<float>(),
spatial, eps, momentum, group_size); std.data_ptr<float>(), output.data_ptr<scalar_t>(), num,
})); channels, spatial, eps, momentum, group_size);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void SyncBNBackwardParamCUDAKernelLauncher(const DArrayLite grad_output, void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
const DArrayLite norm, const Tensor norm,
DArrayLite grad_weight, Tensor grad_weight,
DArrayLite grad_bias, Tensor grad_bias) {
cudaStream_t stream) { int num = grad_output.size(0);
int num = grad_output.dim(0); int channels = grad_output.size(1);
int channels = grad_output.dim(1); int spatial = grad_output.size(2);
int spatial = grad_output.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(grad_output.device());
grad_output.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "sync_bn_backward_param_cuda_kernel", [&] {
sync_bn_backward_param_cuda_kernel<scalar_t> sync_bn_backward_param_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>( <<<channels, THREADS_PER_BLOCK, 0, stream>>>(
grad_output.ptr<scalar_t>(), norm.ptr<float>(), grad_output.data_ptr<scalar_t>(), norm.data_ptr<float>(),
grad_weight.ptr<float>(), grad_bias.ptr<float>(), num, channels, grad_weight.data_ptr<float>(), grad_bias.data_ptr<float>(), num,
spatial); channels, spatial);
})); });
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void SyncBNBackwardDataCUDAKernelLauncher( void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
const DArrayLite grad_output, const DArrayLite weight, const Tensor weight,
const DArrayLite grad_weight, const DArrayLite grad_bias, const Tensor grad_weight,
const DArrayLite norm, const DArrayLite std, DArrayLite grad_input, const Tensor grad_bias,
cudaStream_t stream) { const Tensor norm, const Tensor std,
int output_size = grad_input.size(); Tensor grad_input) {
int num = grad_input.dim(0); int output_size = grad_input.numel();
int channels = grad_input.dim(1); int num = grad_input.size(0);
int spatial = grad_input.dim(2); int channels = grad_input.size(1);
int spatial = grad_input.size(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(grad_input.device());
grad_input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "sync_bn_backward_data_cuda_kernel", [&] {
sync_bn_backward_data_cuda_kernel<scalar_t> sync_bn_backward_data_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), weight.ptr<float>(), output_size, grad_output.data_ptr<scalar_t>(),
grad_weight.ptr<float>(), grad_bias.ptr<float>(), weight.data_ptr<float>(), grad_weight.data_ptr<float>(),
norm.ptr<float>(), std.ptr<float>(), grad_input.ptr<scalar_t>(), grad_bias.data_ptr<float>(), norm.data_ptr<float>(),
num, channels, spatial); std.data_ptr<float>(), grad_input.data_ptr<scalar_t>(), num,
})); channels, spatial);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "sync_bn_pytorch.h"
using namespace parrots;
void sync_bn_forward_mean_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& input = buildATensor(ctx, ins[0]);
auto mean = buildATensor(ctx, outs[0]);
sync_bn_forward_mean_cuda(input, mean);
}
void sync_bn_forward_var_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& input = buildATensor(ctx, ins[0]);
const auto& mean = buildATensor(ctx, ins[1]);
auto var = buildATensor(ctx, outs[0]);
sync_bn_forward_var_cuda(input, mean, var);
}
void sync_bn_forward_output_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
size_t group_size;
float eps, momentum;
SSAttrs(attr)
.get<float>("eps", eps)
.get<float>("momentum", momentum)
.get<size_t>("group_size", group_size)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& mean = buildATensor(ctx, ins[1]);
const auto& var = buildATensor(ctx, ins[2]);
const auto& weight = buildATensor(ctx, ins[3]);
const auto& bias = buildATensor(ctx, ins[4]);
auto running_mean = buildATensor(ctx, outs[0]);
auto running_var = buildATensor(ctx, outs[1]);
auto norm = buildATensor(ctx, outs[2]);
auto std = buildATensor(ctx, outs[3]);
  auto output = buildATensor(ctx, outs[4]);
sync_bn_forward_output_cuda(input, mean, var, running_mean, running_var,
weight, bias, norm, std, output, eps, momentum,
group_size);
}
void sync_bn_backward_param_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& grad_output = buildATensor(ctx, ins[0]);
const auto& norm = buildATensor(ctx, ins[1]);
auto grad_weight = buildATensor(ctx, outs[0]);
auto grad_bias = buildATensor(ctx, outs[1]);
sync_bn_backward_param_cuda(grad_output, norm, grad_weight, grad_bias);
}
void sync_bn_backward_data_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& grad_output = buildATensor(ctx, ins[0]);
const auto& weight = buildATensor(ctx, ins[1]);
const auto& grad_weight = buildATensor(ctx, ins[2]);
const auto& grad_bias = buildATensor(ctx, ins[3]);
const auto& norm = buildATensor(ctx, ins[4]);
const auto& std = buildATensor(ctx, ins[5]);
auto grad_input = buildATensor(ctx, outs[0]);
sync_bn_backward_data_cuda(grad_output, weight, grad_weight, grad_bias, norm,
std, grad_input);
}
PARROTS_EXTENSION_REGISTER(sync_bn_forward_mean)
.input(1)
.output(1)
.apply(sync_bn_forward_mean_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_forward_var)
.input(2)
.output(1)
.apply(sync_bn_forward_var_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_forward_output)
.attr("eps")
.attr("momentum")
.attr("group_size")
.input(5)
.output(5)
.apply(sync_bn_forward_output_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_backward_param)
.input(2)
.output(2)
.apply(sync_bn_backward_param_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_backward_data)
.input(6)
.output(1)
.apply(sync_bn_backward_data_cuda_parrots)
.done();
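
The five sync_bn registrations above encode a staged protocol: the forward pass computes the per-channel mean, then the variance, then the normalized output, while the backward pass splits into a parameter-gradient and a data-gradient op. A sketch of the forward sequence (single process, illustrative shapes; the cross-GPU allreduce that SyncBN performs between stages lives in the Python wrapper and is omitted here):

# Hypothetical staged forward call; assumes an mmcv build against Parrots.
import torch
from mmcv.utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['sync_bn_forward_mean', 'sync_bn_forward_var',
             'sync_bn_forward_output'])

x = torch.rand(4, 8, 16).cuda()               # (num, channels, spatial)
mean, var = x.new_zeros(8), x.new_zeros(8)
ext_module.sync_bn_forward_mean(x, mean)      # stage 1: per-channel mean
ext_module.sync_bn_forward_var(x, mean, var)  # stage 2: per-channel variance

weight, bias = x.new_ones(8), x.new_zeros(8)
running_mean, running_var = x.new_zeros(8), x.new_ones(8)
norm, std, out = torch.zeros_like(x), x.new_zeros(8), torch.zeros_like(x)
ext_module.sync_bn_forward_output(
    x, mean, var, weight, bias,                 # ins[0..4]
    running_mean, running_var, norm, std, out,  # outs[0..4]
    eps=1e-5, momentum=0.1, group_size=1)       # scalar attrs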
#ifndef SYNC_BN_PYTORCH_H
#define SYNC_BN_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean);
void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,
Tensor var);
void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,
const Tensor var, Tensor running_mean,
Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std,
Tensor output, float eps, float momentum,
int group_size);
void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,
Tensor grad_weight, Tensor grad_bias);
void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
const Tensor grad_weight,
const Tensor grad_bias, const Tensor norm,
const Tensor std, Tensor grad_input);
#endif // SYNC_BN_PYTORCH_H
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void TINShiftForwardCUDAKernelLauncher(const DArrayLite input, #ifdef MMCV_WITH_CUDA
const DArrayLite shift, void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,
DArrayLite output, cudaStream_t stream); Tensor output);
void TINShiftBackwardCUDAKernelLauncher(const DArrayLite grad_output, void TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,
const DArrayLite shift, Tensor grad_input);
DArrayLite grad_input,
cudaStream_t stream); void tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output) {
TINShiftForwardCUDAKernelLauncher(input, shift, output);
void tin_shift_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &input = ins[0];
const auto &shift = ins[1];
auto &output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
TINShiftForwardCUDAKernelLauncher(input, shift, output, stream);
} }
void tin_shift_backward_cuda(CudaContext &ctx, const SSElement &attr, void tin_shift_backward_cuda(Tensor grad_output, Tensor shift,
const OperatorBase::in_list_t &ins, Tensor grad_input) {
OperatorBase::out_list_t &outs) { TINShiftBackwardCUDAKernelLauncher(grad_output, shift, grad_input);
const auto &grad_output = ins[0];
const auto &shift = ins[1];
auto &grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
TINShiftBackwardCUDAKernelLauncher(grad_output, shift, grad_input, stream);
} }
PARROTS_EXTENSION_REGISTER(tin_shift_forward) #endif
.input(2)
.output(1) void tin_shift_forward(Tensor input, Tensor shift, Tensor output) {
.apply(tin_shift_forward_cuda) if (input.device().is_cuda()) {
.done(); #ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
PARROTS_EXTENSION_REGISTER(tin_shift_backward) CHECK_CUDA_INPUT(shift);
.input(2) CHECK_CUDA_INPUT(output);
.output(1)
.apply(tin_shift_backward_cuda) tin_shift_forward_cuda(input, shift, output);
.done(); #else
AT_ERROR("TINShift is not compiled with GPU support");
#endif
} else {
AT_ERROR("TINShift is not implemented on CPU");
}
}
void tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input) {
if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(shift);
CHECK_CUDA_INPUT(grad_input);
tin_shift_backward_cuda(grad_output, shift, grad_input);
#else
AT_ERROR("TINShift is not compiled with GPU support");
#endif
} else {
AT_ERROR("TINShift is not implemented on CPU");
}
}
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#include "tin_shift_cuda_kernel.cuh" #include "tin_shift_cuda_kernel.cuh"
void TINShiftForwardCUDAKernelLauncher(const DArrayLite input, void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,
const DArrayLite shift, Tensor output) {
DArrayLite output, cudaStream_t stream) { int output_size = output.numel();
int output_size = output.size(); int batch_size = input.size(0);
int batch_size = input.dim(0); int t_size = input.size(1);
int t_size = input.dim(1); int channels = input.size(2);
int channels = input.dim(2); int hw_size = input.size(3);
int hw_size = input.dim(3); int group_size = shift.size(1);
int group_size = shift.dim(1);
int group_channel = channels / group_size; int group_channel = channels / group_size;
int num_kernels = batch_size * hw_size * channels; int num_kernels = batch_size * hw_size * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(input.device());
input.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "tin_shift_forward_cuda_kernel", [&] {
tin_shift_forward_cuda_kernel<scalar_t> tin_shift_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), shift.ptr<int>(), output_size, input.data_ptr<scalar_t>(), shift.data_ptr<int>(),
output.ptr<scalar_t>(), batch_size, channels, t_size, hw_size, output.data_ptr<scalar_t>(), batch_size, channels, t_size,
group_size, group_channel); hw_size, group_size, group_channel);
})); });
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void TINShiftBackwardCUDAKernelLauncher(const DArrayLite grad_output, void TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,
const DArrayLite shift, Tensor grad_input) {
DArrayLite grad_input, int output_size = grad_output.numel();
cudaStream_t stream) { int batch_size = grad_output.size(0);
int output_size = grad_output.size(); int t_size = grad_output.size(1);
int batch_size = grad_output.dim(0); int channels = grad_output.size(2);
int t_size = grad_output.dim(1); int hw_size = grad_output.size(3);
int channels = grad_output.dim(2); int group_size = shift.size(1);
int hw_size = grad_output.dim(3);
int group_size = shift.dim(1);
int group_channel = channels / group_size; int group_channel = channels / group_size;
int num_kernels = batch_size * hw_size * channels; int num_kernels = batch_size * hw_size * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(grad_output.device());
grad_output.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "tin_shift_backward_cuda_kernel", [&] {
tin_shift_backward_cuda_kernel<scalar_t> tin_shift_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), shift.ptr<int>(), output_size, grad_output.data_ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), batch_size, channels, t_size, shift.data_ptr<int>(), grad_input.data_ptr<scalar_t>(),
hw_size, group_size, group_channel); batch_size, channels, t_size, hw_size, group_size,
})); group_channel);
});
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "tin_shift_pytorch.h"
using namespace parrots;
void tin_shift_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &input = buildATensor(ctx, ins[0]);
const auto &shift = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
tin_shift_forward_cuda(input, shift, output);
}
void tin_shift_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &grad_output = buildATensor(ctx, ins[0]);
const auto &shift = buildATensor(ctx, ins[1]);
auto grad_input = buildATensor(ctx, outs[0]);
tin_shift_backward_cuda(grad_output, shift, grad_input);
}
PARROTS_EXTENSION_REGISTER(tin_shift_forward)
.input(2)
.output(1)
.apply(tin_shift_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(tin_shift_backward)
.input(2)
.output(1)
.apply(tin_shift_backward_cuda_parrots)
.done();
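
A sketch of driving the tin_shift ops registered above; the reading of the shift tensor (integer temporal offsets per batch and channel group) is inferred from the launcher's shape handling, and all shapes here are illustrative assumptions:

# Hypothetical call; assumes an mmcv build against Parrots.
import torch
from mmcv.utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['tin_shift_forward'])

x = torch.rand(2, 8, 16, 49).cuda()                # (batch, t, channels, h*w)
shift = torch.randint(-3, 3, (2, 4)).int().cuda()  # (batch, group), int32
out = torch.zeros_like(x)
ext_module.tin_shift_forward(x, shift, out)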
#ifndef TIN_SHIFT_PYTORCH_H
#define TIN_SHIFT_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output);
void tin_shift_backward_cuda(Tensor grad_output, Tensor shift,
Tensor grad_input);
#endif // TIN_SHIFT_PYTORCH_H
@@ -134,28 +134,12 @@ def nms(boxes, scores, iou_threshold, offset=0):
     assert offset in (0, 1)

     if torch.__version__ == 'parrots':
-        x1 = boxes[:, 0]
-        y1 = boxes[:, 1]
-        x2 = boxes[:, 2]
-        y2 = boxes[:, 3]
-        areas = (x2 - x1 + offset) * (y2 - y1 + offset)
-        _, order = scores.sort(0, descending=True)
-        if boxes.device == 'cpu':
-            indata_list = [boxes, order, areas]
-            indata_dict = {
-                'iou_threshold': float(iou_threshold),
-                'offset': int(offset)
-            }
-            select = ext_module.nms(*indata_list, **indata_dict).byte()
-        else:
-            boxes_sorted = boxes.index_select(0, order)
-            indata_list = [boxes_sorted, order, areas]
-            indata_dict = {
-                'iou_threshold': float(iou_threshold),
-                'offset': int(offset)
-            }
-            select = ext_module.nms(*indata_list, **indata_dict)
-        inds = order.masked_select(select)
+        indata_list = [boxes, scores]
+        indata_dict = {
+            'iou_threshold': float(iou_threshold),
+            'offset': int(offset)
+        }
+        inds = ext_module.nms(*indata_list, **indata_dict)
     else:
         inds = NMSop.apply(boxes, scores, iou_threshold, offset)
     dets = torch.cat((boxes[inds], scores[inds].reshape(-1, 1)), dim=1)
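
The public API is unchanged by this refactor; only the parrots branch now receives the kept indices straight from the op instead of reconstructing them from a select mask. A quick usage reminder for the wrapper above (values are illustrative):

# mmcv.ops.nms usage; boxes/scores are made-up values.
import torch
from mmcv.ops import nms

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [20., 20., 30., 30.]])
scores = torch.tensor([0.9, 0.8, 0.7])
dets, inds = nms(boxes, scores, iou_threshold=0.5)
# dets: kept boxes with scores appended as a 5th column; inds: kept indices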
@@ -219,12 +203,8 @@ def soft_nms(boxes,
     assert method in method_dict.keys()

     if torch.__version__ == 'parrots':
-        x1 = boxes[:, 0]
-        y1 = boxes[:, 1]
-        x2 = boxes[:, 2]
-        y2 = boxes[:, 3]
-        areas = (x2 - x1 + offset) * (y2 - y1 + offset)
-        indata_list = [boxes.cpu(), scores.cpu(), areas.cpu()]
+        dets = boxes.new_empty((boxes.size(0), 5), device='cpu')
+        indata_list = [boxes.cpu(), scores.cpu(), dets.cpu()]
         indata_dict = {
             'iou_threshold': float(iou_threshold),
             'sigma': float(sigma),
@@ -232,8 +212,7 @@ def soft_nms(boxes,
             'method': method_dict[method],
             'offset': int(offset)
         }
-        dets, inds, num_out = ext_module.softnms(*indata_list, **indata_dict)
-        inds = inds[:num_out]
+        inds = ext_module.softnms(*indata_list, **indata_dict)
     else:
         dets, inds = SoftNMSop.apply(boxes.cpu(), scores.cpu(),
                                      float(iou_threshold), float(sigma),
@@ -343,7 +322,11 @@ def nms_match(dets, iou_threshold):
         dets_t = dets.detach().cpu()
     else:
         dets_t = torch.from_numpy(dets)
-    matched = ext_module.nms_match(dets_t, float(iou_threshold))
+    indata_list = [dets_t]
+    indata_dict = {'iou_threshold': float(iou_threshold)}
+    matched = ext_module.nms_match(*indata_list, **indata_dict)
+    if torch.__version__ == 'parrots':
+        matched = matched.tolist()

     if isinstance(dets, torch.Tensor):
         return [dets.new_tensor(m, dtype=torch.long) for m in matched]
@@ -380,16 +363,13 @@ def nms_rotated(dets, scores, iou_threshold, labels=None):
         dets_sorted = dets_wl.index_select(0, order)

     if torch.__version__ == 'parrots':
-        select = torch.zeros((dets.shape[0]),
-                             dtype=torch.int64).to(dets.device)
-        ext_module.nms_rotated(
+        keep_inds = ext_module.nms_rotated(
             dets_wl,
             scores,
-            order,
             dets_sorted,
-            select,
             iou_threshold=iou_threshold,
             multi_label=multi_label)
-        keep_inds = order.masked_select(select == 1)
     else:
         keep_inds = ext_module.nms_rotated(dets_wl, scores, order, dets_sorted,
                                            iou_threshold, multi_label)
@@ -15,12 +15,19 @@ if torch.__version__ != 'parrots':
 else:
     from parrots import extension

+    has_return_value_ops = [
+        'nms', 'softnms', 'nms_match', 'nms_rotated', 'top_pool_forward',
+        'top_pool_backward', 'bottom_pool_forward', 'bottom_pool_backward',
+        'left_pool_forward', 'left_pool_backward', 'right_pool_forward',
+        'right_pool_backward'
+    ]
+
     def load_ext(name, funcs):
         ExtModule = namedtuple('ExtModule', funcs)
         ext_list = []
         lib_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
         for fun in funcs:
-            if fun in ['nms', 'softnms']:
+            if fun in has_return_value_ops:
                 ext_list.append(extension.load(fun, name, lib_dir=lib_root).op)
             else:
                 ext_list.append(
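
The list exists because, under Parrots, ops that hand their result back to Python (rather than filling pre-allocated outputs) must be loaded through the raw .op handle. Loading is otherwise unchanged; a minimal sketch:

# Hypothetical loader call; both ops land on the same namedtuple-backed module.
from mmcv.utils import ext_loader

# 'nms' is in has_return_value_ops, 'roi_align_forward' is not.
ext_module = ext_loader.load_ext('_ext', ['nms', 'roi_align_forward'])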
@@ -184,20 +184,32 @@ def get_extensions():
     if EXT_TYPE == 'parrots':
         ext_name = 'mmcv._ext'
         from parrots.utils.build_extension import Extension
-        define_macros = [('MMCV_USE_PARROTS', None)]
-        op_files = glob.glob('./mmcv/ops/csrc/parrots/*')
-        include_path = os.path.abspath('./mmcv/ops/csrc')
+        # new parrots op impl do not use MMCV_USE_PARROTS
+        # define_macros = [('MMCV_USE_PARROTS', None)]
+        define_macros = []
+        op_files = glob.glob('./mmcv/ops/csrc/parrots/*.cu') +\
+            glob.glob('./mmcv/ops/csrc/parrots/*.cpp')
+        include_dirs = [os.path.abspath('./mmcv/ops/csrc')]
         cuda_args = os.getenv('MMCV_CUDA_ARGS')
+        extra_compile_args = {
+            'nvcc': [cuda_args] if cuda_args else [],
+            'cxx': [],
+        }
+        if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
+            define_macros += [('MMCV_WITH_CUDA', None)]
+            extra_compile_args['nvcc'] += [
+                '-D__CUDA_NO_HALF_OPERATORS__',
+                '-D__CUDA_NO_HALF_CONVERSIONS__',
+                '-D__CUDA_NO_HALF2_OPERATORS__',
+            ]
         ext_ops = Extension(
             name=ext_name,
             sources=op_files,
-            include_dirs=[include_path],
+            include_dirs=include_dirs,
             define_macros=define_macros,
-            extra_compile_args={
-                'nvcc': [cuda_args] if cuda_args else [],
-                'cxx': [],
-            },
-            cuda=True)
+            extra_compile_args=extra_compile_args,
+            cuda=True,
+            pytorch=True)
         extensions.append(ext_ops)
     elif EXT_TYPE == 'pytorch':
         ext_name = 'mmcv._ext'
@@ -60,8 +60,7 @@ class TestDeformRoIPool(object):
                 sampling_ratio=sampling_ratio).cuda()

             if _USING_PARROTS:
-                pass
-                # gradcheck(droipool, (x, rois), no_grads=[rois])
+                gradcheck(droipool, (x, rois), no_grads=[rois])
             else:
                 gradcheck(droipool, (x, rois), eps=1e-2, atol=1e-2)
@@ -90,7 +89,6 @@ class TestDeformRoIPool(object):
                 sampling_ratio=sampling_ratio).cuda()

             if _USING_PARROTS:
-                pass
-                # gradcheck(droipool, (x, rois), no_grads=[rois])
+                gradcheck(droipool, (x, rois), no_grads=[rois])
             else:
                 gradcheck(droipool, (x, rois), eps=1e-2, atol=1e-2)