Unverified commit 48d99025, authored by z55250825 and committed by GitHub

Add new parrots extension implementation for all ops (#794)

* delete all old parrots files; add new parrots op impl for bbox_overlaps

* support the first new-style parrots op (bbox_overlaps), test succeeded

* add box_iou_rotated op, test succeeded

* add carafe and carafe_naive ops, test succeeded (one parrots bug needs fixing)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test success (one open question remains)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
parent 72e4cc12
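For orientation, the ops added below are reached from Python through mmcv's extension loader. The snippet below is only an illustrative sketch (it is not part of this commit), and the tensor shapes are made up for the example; the actual wrapper lives in mmcv.ops.

import torch
from mmcv.utils import ext_loader

# Load the compiled extension. On parrots this resolves to the ops
# registered below via PARROTS_EXTENSION_REGISTER; on PyTorch, to the
# pybind-exported functions that share the same implementations.
ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])

bboxes1 = torch.rand(8, 4).cuda()    # assumed (x1, y1, x2, y2) boxes
bboxes2 = torch.rand(16, 4).cuda()
ious = bboxes1.new_zeros((8, 16))    # output buffer written in place by the op

# The keyword arguments mirror the .attr("mode") / .attr("aligned") /
# .attr("offset") declarations in the parrots registration below.
ext_module.bbox_overlaps(bboxes1, bboxes2, ious, mode=0, aligned=False,
                         offset=0)
print(ious.shape)  # torch.Size([8, 16])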
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const DArrayLite bboxes1, #ifdef MMCV_WITH_CUDA
const DArrayLite bboxes2, DArrayLite ious, void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
const int mode, const bool aligned, Tensor ious, const int mode,
const int offset, cudaStream_t stream); const bool aligned, const int offset);
void bbox_overlaps_cuda(CudaContext& ctx, const SSElement& attr, void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const OperatorBase::in_list_t& ins, const int mode, const bool aligned, const int offset) {
OperatorBase::out_list_t& outs) { BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
int mode, offset; }
bool aligned; #endif
SSAttrs(attr)
.get<int>("mode", mode)
.get<bool>("aligned", aligned)
.get<int>("offset", offset)
.done();
const auto& bboxes1 = ins[0];
const auto& bboxes2 = ins[1];
auto& ious = outs[0]; void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
if (bboxes1.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(bboxes1);
CHECK_CUDA_INPUT(bboxes2);
CHECK_CUDA_INPUT(ious);
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream()); bbox_overlaps_cuda(bboxes1, bboxes2, ious, mode, aligned, offset);
BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset, #else
stream); AT_ERROR("bbox_overlaps is not compiled with GPU support");
#endif
} else {
AT_ERROR("bbox_overlaps is not implemented on CPU");
}
} }
PARROTS_EXTENSION_REGISTER(bbox_overlaps)
.attr("mode")
.attr("aligned")
.attr("offset")
.input(2)
.output(1)
.apply(bbox_overlaps_cuda)
.done();
#include "bbox_overlaps_cuda_kernel.cuh" #include "bbox_overlaps_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const DArrayLite bboxes1, void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
const DArrayLite bboxes2, DArrayLite ious, Tensor ious, const int mode,
const int mode, const bool aligned, const bool aligned, const int offset) {
const int offset, cudaStream_t stream) { int output_size = ious.numel();
int output_size = ious.size(); int num_bbox1 = bboxes1.size(0);
int num_bbox1 = bboxes1.dim(0); int num_bbox2 = bboxes2.size(0);
int num_bbox2 = bboxes2.dim(0);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(bboxes1.device());
bboxes1.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bboxes1.scalar_type(), "bbox_overlaps_cuda_kernel", ([&] {
bbox_overlaps_cuda_kernel<scalar_t> bbox_overlaps_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
bboxes1.ptr<scalar_t>(), bboxes2.ptr<scalar_t>(), bboxes1.data_ptr<scalar_t>(), bboxes2.data_ptr<scalar_t>(),
ious.ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned, ious.data_ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,
offset); offset);
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "bbox_overlaps_pytorch.h"
using namespace parrots;
/*
* void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor
* ious, const int mode, const bool aligned, const int offset);
*/
void bbox_overlaps_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int mode, offset;
bool aligned;
SSAttrs(attr)
.get<int>("mode", mode)
.get<bool>("aligned", aligned)
.get<int>("offset", offset)
.done();
const auto& bboxes1 = buildATensor(ctx, ins[0]);
const auto& bboxes2 = buildATensor(ctx, ins[1]);
auto ious = buildATensor(ctx, outs[0]);
bbox_overlaps_cuda(bboxes1, bboxes2, ious, mode, aligned, offset);
}
PARROTS_EXTENSION_REGISTER(bbox_overlaps)
.attr("mode")
.attr("aligned")
.attr("offset")
.input(2)
.output(1)
.apply(bbox_overlaps_parrots)
.done();
#ifndef BBOX_OVERLAPS_PYTORCH_H
#define BBOX_OVERLAPS_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset);
#endif // BBOX_OVERLAPS_PYTORCH_H
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
#include "pytorch_cpp_helper.hpp"

void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned);

#ifdef MMCV_WITH_CUDA
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned);
#endif

// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                     const int mode_flag, const bool aligned) {
  assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
  if (boxes1.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    box_iou_rotated_cuda(boxes1, boxes2, ious, mode_flag, aligned);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  } else {
    box_iou_rotated_cpu(boxes1, boxes2, ious, mode_flag, aligned);
  }
}
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"

template <typename T>
void box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
                                Tensor ious, const int mode_flag,
                                const bool aligned) {
  int output_size = ious.numel();
  auto num_boxes1 = boxes1.size(0);
  auto num_boxes2 = boxes2.size(0);

  if (aligned) {
    for (int i = 0; i < output_size; i++) {
      ious[i] = single_box_iou_rotated<T>(boxes1[i].data_ptr<T>(),
                                          boxes2[i].data_ptr<T>(), mode_flag);
    }
  } else {
    for (int i = 0; i < num_boxes1; i++) {
      for (int j = 0; j < num_boxes2; j++) {
        ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
            boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>(), mode_flag);
      }
    }
  }
}

void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned) {
  box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
}
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
#include "box_iou_rotated_cuda.cuh"
#include "pytorch_cuda_helper.hpp"

void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned) {
  using scalar_t = float;
  AT_ASSERTM(boxes1.type().is_cuda(), "boxes1 must be a CUDA tensor");
  AT_ASSERTM(boxes2.type().is_cuda(), "boxes2 must be a CUDA tensor");

  int output_size = ious.numel();
  int num_boxes1 = boxes1.size(0);
  int num_boxes2 = boxes2.size(0);

  at::cuda::CUDAGuard device_guard(boxes1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  box_iou_rotated_cuda_kernel<scalar_t>
      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
          num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),
          boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),
          mode_flag, aligned);
  AT_CUDA_CHECK(cudaGetLastError());
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "box_iou_rotated_pytorch.h"
using namespace parrots;
/*
* void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor
* ious, const int mode_flag, const bool aligned);
*/
void box_iou_rotated_cpu_parrots(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
bool aligned;
int mode_flag;
SSAttrs(attr)
.get<bool>("aligned", aligned)
.get<int>("mode_flag", mode_flag)
.done();
const auto& boxes1 = buildATensor(ctx, ins[0]);
const auto& boxes2 = buildATensor(ctx, ins[1]);
auto ious = buildATensor(ctx, outs[0]);
box_iou_rotated_cpu(boxes1, boxes2, ious, mode_flag, aligned);
}
#ifdef MMCV_WITH_CUDA
/*
* void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor
* ious, const int mode_flag, const bool aligned);
*/
void box_iou_rotated_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
bool aligned;
int mode_flag;
SSAttrs(attr)
.get<bool>("aligned", aligned)
.get<int>("mode_flag", mode_flag)
.done();
const auto& boxes1 = buildATensor(ctx, ins[0]);
const auto& boxes2 = buildATensor(ctx, ins[1]);
auto ious = buildATensor(ctx, outs[0]);
box_iou_rotated_cuda(boxes1, boxes2, ious, mode_flag, aligned);
}
#endif
PARROTS_EXTENSION_REGISTER(box_iou_rotated)
.attr("aligned")
.attr("mode_flag")
.input(2)
.output(1)
.apply(box_iou_rotated_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(box_iou_rotated_cuda_parrots)
#endif
.done();
#ifndef BOX_IOU_ROTATED_PYTORCH_H
#define BOX_IOU_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
#ifdef MMCV_WITH_CUDA
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
#endif
#endif // BOX_IOU_ROTATED_PYTORCH_H
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void CARAFEForwardCUDAKernelLauncher( #ifdef MMCV_WITH_CUDA
const DArrayLite features, const DArrayLite masks, DArrayLite rfeatures, void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
DArrayLite routput, DArrayLite rmasks, DArrayLite output, Tensor rfeatures, Tensor routput,
const int kernel_size, const int group_size, const int scale_factor, Tensor rmasks, Tensor output,
cudaStream_t stream); const int kernel_size,
const int group_size,
const int scale_factor);
void CARAFEBackwardCUDAKernelLauncher( void CARAFEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite rfeatures, const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
const DArrayLite masks, DArrayLite rtop_grad, DArrayLite rbottom_grad_hs, Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
DArrayLite rbottom_grad, DArrayLite rmask_grad, DArrayLite bottom_grad, Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
DArrayLite mask_grad, const int kernel_size, const int group_size, const int kernel_size, const int group_size, const int scale_factor);
const int scale_factor, cudaStream_t stream);
void carafe_forward_cuda(CudaContext& ctx, const SSElement& attr, void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
const OperatorBase::in_list_t& ins, Tensor routput, Tensor rmasks, Tensor output,
OperatorBase::out_list_t& outs) { int kernel_size, int group_size, int scale_factor) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = ins[0];
const auto& masks = ins[1];
auto& rfeatures = outs[0];
auto& routput = outs[1];
auto& rmasks = outs[2];
auto& output = outs[3];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks, CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,
output, kernel_size, group_size, scale_factor, output, kernel_size, group_size,
stream); scale_factor);
} }
void carafe_backward_cuda(CudaContext& ctx, const SSElement& attr, void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
const OperatorBase::in_list_t& ins, Tensor rtop_grad, Tensor rbottom_grad_hs,
OperatorBase::out_list_t& outs) { Tensor rbottom_grad, Tensor rmask_grad,
int kernel_size, group_size, scale_factor; Tensor bottom_grad, Tensor mask_grad, int kernel_size,
SSAttrs(attr) int group_size, int scale_factor) {
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = ins[0];
const auto& rfeatures = ins[1];
const auto& masks = ins[2];
auto& rtop_grad = outs[0];
auto rbottom_grad_hs = outs[1];
auto& rbottom_grad = outs[2];
auto& rmask_grad = outs[3];
auto& bottom_grad = outs[4];
auto& mask_grad = outs[5];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad, CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
rbottom_grad_hs, rbottom_grad, rmask_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,
bottom_grad, mask_grad, kernel_size, bottom_grad, mask_grad, kernel_size,
group_size, scale_factor, stream); group_size, scale_factor);
} }
#endif
PARROTS_EXTENSION_REGISTER(carafe_forward) void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
.attr("kernel_size") Tensor routput, Tensor rmasks, Tensor output,
.attr("group_size") int kernel_size, int group_size, int scale_factor) {
.attr("scale_factor") if (features.device().is_cuda()) {
.input(2) #ifdef MMCV_WITH_CUDA
.output(4) CHECK_CUDA_INPUT(features);
.apply(carafe_forward_cuda) CHECK_CUDA_INPUT(masks);
.done(); CHECK_CUDA_INPUT(rfeatures);
CHECK_CUDA_INPUT(routput);
CHECK_CUDA_INPUT(rmasks);
CHECK_CUDA_INPUT(output);
carafe_forward_cuda(features, masks, rfeatures, routput, rmasks, output,
kernel_size, group_size, scale_factor);
#else
AT_ERROR("Carafe is not compiled with GPU support");
#endif
} else {
AT_ERROR("Carafe is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(carafe_backward) void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
.attr("kernel_size") Tensor rtop_grad, Tensor rbottom_grad_hs,
.attr("group_size") Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
.attr("scale_factor") Tensor mask_grad, int kernel_size, int group_size,
.input(3) int scale_factor) {
.output(6) if (top_grad.device().is_cuda()) {
.apply(carafe_backward_cuda) #ifdef MMCV_WITH_CUDA
.done(); CHECK_CUDA_INPUT(top_grad);
CHECK_CUDA_INPUT(rfeatures);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(rtop_grad);
CHECK_CUDA_INPUT(rbottom_grad_hs);
CHECK_CUDA_INPUT(rbottom_grad);
CHECK_CUDA_INPUT(rmask_grad);
CHECK_CUDA_INPUT(bottom_grad);
CHECK_CUDA_INPUT(mask_grad);
carafe_backward_cuda(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
rbottom_grad, rmask_grad, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
#else
AT_ERROR("Carafe is not compiled with GPU support");
#endif
} else {
AT_ERROR("Carafe is not implemented on CPU");
}
}
#include "carafe_cuda_kernel.cuh" #include "carafe_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void CARAFEForwardCUDAKernelLauncher( void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
const DArrayLite features, const DArrayLite masks, DArrayLite rfeatures, Tensor rfeatures, Tensor routput,
DArrayLite routput, DArrayLite rmasks, DArrayLite output, Tensor rmasks, Tensor output,
const int kernel_size, const int group_size, const int scale_factor, const int kernel_size,
cudaStream_t stream) { const int group_size,
const int batch_size = output.dim(0); const int scale_factor) {
const int channels = output.dim(1); const int batch_size = output.size(0);
const int output_height = output.dim(2); const int channels = output.size(1);
const int output_width = output.dim(3); const int output_height = output.size(2);
const int output_width = output.size(3);
const int input_height = features.dim(2); const int input_height = features.size(2);
const int input_width = features.dim(3); const int input_width = features.size(3);
const int mask_channels = masks.dim(1); const int mask_channels = masks.size(1);
rfeatures.resize_({batch_size, input_height, input_width, channels});
routput.resize_({batch_size, output_height, output_width, channels});
rmasks.resize_({batch_size, output_height, output_width, mask_channels});
// one warp per pixel // one warp per pixel
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(features.device());
features.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "NCHW2NHWC_Feature", ([&] {
const scalar_t *bottom_data = features.data_ptr<scalar_t>();
scalar_t *top_data = rfeatures.data_ptr<scalar_t>();
const int dh = divideUP(channels, kTileDim); const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(input_height * input_width, kTileDim); const int dw = divideUP(input_height * input_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t> BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>( <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, input_height * input_width, dh, dw, batch_size, channels, input_height * input_width, dh, dw,
features.ptr<scalar_t>(), rfeatures.ptr<scalar_t>()); bottom_data, top_data);
})); }));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] { features.scalar_type(), "NCHW2NHWC_Masks", ([&] {
const scalar_t *bottom_data = masks.data_ptr<scalar_t>();
scalar_t *top_data = rmasks.data_ptr<scalar_t>();
const int dh = divideUP(mask_channels, kTileDim); const int dh = divideUP(mask_channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim); const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t> BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>( <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, mask_channels, output_height * output_width, dh, dw, batch_size, mask_channels, output_height * output_width, dh, dw,
masks.ptr<scalar_t>(), rmasks.ptr<scalar_t>()); bottom_data, top_data);
})); }));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] { features.scalar_type(), "CARAFELaucherForward", ([&] {
const int num_kernels = const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL; batch_size * output_height * output_width * THREADS_PER_PIXEL;
const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
const scalar_t *bottom_masks = rmasks.data_ptr<scalar_t>();
scalar_t *top_data = routput.data_ptr<scalar_t>();
CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK), CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
THREADS_PER_BLOCK, 0, stream>>>( THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, rfeatures.ptr<scalar_t>(), rmasks.ptr<scalar_t>(), num_kernels, bottom_data, bottom_masks, kernel_size, group_size,
kernel_size, group_size, scale_factor, channels, input_height, scale_factor, channels, input_height, input_width, output_height,
input_width, output_height, output_width, mask_channels, output_width, mask_channels, top_data);
routput.ptr<scalar_t>());
})); }));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] { features.scalar_type(), "NHWC2NCHW", ([&] {
const scalar_t *bottom_data = routput.data_ptr<scalar_t>();
scalar_t *top_data = output.data_ptr<scalar_t>();
const int dh = divideUP(output_height * output_width, kTileDim); const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(channels, kTileDim); const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t> BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>( <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, channels, dh, dw, batch_size, output_height * output_width, channels, dh, dw,
routput.ptr<scalar_t>(), output.ptr<scalar_t>()); bottom_data, top_data);
})); }));
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void CARAFEBackwardCUDAKernelLauncher( void CARAFEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite rfeatures, const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
const DArrayLite masks, DArrayLite rtop_grad, DArrayLite rbottom_grad_hs, Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
DArrayLite rbottom_grad, DArrayLite rmask_grad, DArrayLite bottom_grad, Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
DArrayLite mask_grad, const int kernel_size, const int group_size, const int kernel_size, const int group_size, const int scale_factor) {
const int scale_factor, cudaStream_t stream) { const int batch_size = top_grad.size(0);
const int batch_size = top_grad.dim(0); const int channels = top_grad.size(1);
const int channels = top_grad.dim(1); const int output_height = top_grad.size(2);
const int output_height = top_grad.dim(2); const int output_width = top_grad.size(3);
const int output_width = top_grad.dim(3);
const int input_height = bottom_grad.size(2);
const int input_height = bottom_grad.dim(2); const int input_width = bottom_grad.size(3);
const int input_width = bottom_grad.dim(3);
const int mask_channels = masks.size(1);
const int mask_channels = masks.dim(1);
rtop_grad.resize_({batch_size, output_height, output_width, channels});
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( rbottom_grad.resize_({batch_size, input_height, input_width, channels});
top_grad.elemType().prim(), ([&] { rbottom_grad_hs.resize_({batch_size, output_height, output_width, channels});
rmask_grad.resize_({batch_size, output_height, output_width, mask_channels});
at::cuda::CUDAGuard device_guard(top_grad.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "NCHW2NHWC_Top_Grad", ([&] {
const scalar_t *bottom_data = top_grad.data_ptr<scalar_t>();
scalar_t *top_data = rtop_grad.data_ptr<scalar_t>();
const int dh = divideUP(channels, kTileDim); const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim); const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t> BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>( <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, output_height * output_width, dh, dw, batch_size, channels, output_height * output_width, dh, dw,
top_grad.ptr<scalar_t>(), rtop_grad.ptr<scalar_t>()); bottom_data, top_data);
})); }));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] { AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFELaucherBackward_Feature", ([&] {
const int num_kernels = const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL; batch_size * output_height * output_width * THREADS_PER_PIXEL;
const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();
scalar_t *bottom_diff = rbottom_grad_hs.data_ptr<scalar_t>();
CARAFEBackward_Feature<scalar_t> CARAFEBackward_Feature<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0, <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, rtop_grad.ptr<scalar_t>(), stream>>>(num_kernels, top_diff, bottom_masks, kernel_size,
masks.ptr<scalar_t>(), kernel_size, group_size, group_size, scale_factor, channels, input_height,
scale_factor, channels, input_height, input_width, input_width, output_height, output_width,
output_height, output_width, mask_channels, mask_channels, bottom_diff);
rbottom_grad_hs.ptr<scalar_t>());
})); }));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] { top_grad.scalar_type(), "FeatureSum", ([&] {
const int num_kernels = const int num_kernels =
batch_size * input_height * input_width * THREADS_PER_PIXEL; batch_size * input_height * input_width * THREADS_PER_PIXEL;
const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr<scalar_t>();
scalar_t *bottom_diff = rbottom_grad.data_ptr<scalar_t>();
FeatureSum<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK), FeatureSum<scalar_t>
THREADS_PER_BLOCK, 0, stream>>>( <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
num_kernels, rbottom_grad_hs.ptr<scalar_t>(), scale_factor, stream>>>(num_kernels, bottom_diff_hs, scale_factor, channels,
channels, input_height, input_width, rbottom_grad.ptr<scalar_t>()); input_height, input_width, bottom_diff);
})); }));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] { top_grad.scalar_type(), "NHWC2NCHW_Bottom_Grad", ([&] {
const scalar_t *bottom_data = rbottom_grad.data_ptr<scalar_t>();
scalar_t *top_data = bottom_grad.data_ptr<scalar_t>();
const int dh = divideUP(input_height * input_width, kTileDim); const int dh = divideUP(input_height * input_width, kTileDim);
const int dw = divideUP(channels, kTileDim); const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t> BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>( <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, input_height * input_width, channels, dh, dw, batch_size, input_height * input_width, channels, dh, dw,
rbottom_grad.ptr<scalar_t>(), bottom_grad.ptr<scalar_t>()); bottom_data, top_data);
})); }));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] { AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFELaucherBackward_Mask", ([&] {
const int num_kernels = batch_size * output_height * output_width * const int num_kernels = batch_size * output_height * output_width *
mask_channels * WARP_SIZE; mask_channels * WARP_SIZE;
const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
scalar_t *mask_diff = rmask_grad.data_ptr<scalar_t>();
CARAFEBackward_Mask<scalar_t> CARAFEBackward_Mask<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0, <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, rtop_grad.ptr<scalar_t>(), stream>>>(num_kernels, top_diff, bottom_data, kernel_size,
rfeatures.ptr<scalar_t>(), kernel_size, group_size, group_size, scale_factor, channels, input_height,
scale_factor, channels, input_height, input_width, input_width, output_height, output_width,
output_height, output_width, mask_channels, mask_channels, mask_diff);
rmask_grad.ptr<scalar_t>());
})); }));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] { top_grad.scalar_type(), "NHWC2NCHW_Mask_Grad", ([&] {
const scalar_t *bottom_data = rmask_grad.data_ptr<scalar_t>();
scalar_t *top_data = mask_grad.data_ptr<scalar_t>();
const int dh = divideUP(output_height * output_width, kTileDim); const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(mask_channels, kTileDim); const int dw = divideUP(mask_channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t> BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>( <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, mask_channels, dh, dw, batch_size, output_height * output_width, mask_channels, dh, dw,
rmask_grad.ptr<scalar_t>(), mask_grad.ptr<scalar_t>()); bottom_data, top_data);
})); }));
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher( #ifdef MMCV_WITH_CUDA
const DArrayLite features, const DArrayLite masks, DArrayLite output, void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
const int kernel_size, const int group_size, const int scale_factor, const Tensor masks, Tensor output,
cudaStream_t stream); const int kernel_size,
const int group_size,
const int scale_factor);
void CARAFENAIVEBackwardCUDAKernelLauncher( void CARAFENAIVEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite features, const Tensor top_grad, const Tensor features, const Tensor masks,
const DArrayLite masks, DArrayLite bottom_grad, DArrayLite mask_grad, Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
const int kernel_size, const int group_size, const int scale_factor, const int group_size, const int scale_factor);
cudaStream_t stream);
void carafe_naive_forward_cuda(CudaContext& ctx, const SSElement& attr, void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
const OperatorBase::in_list_t& ins, int kernel_size, int group_size,
OperatorBase::out_list_t& outs) { int scale_factor) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = ins[0];
const auto& masks = ins[1];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size, CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size,
group_size, scale_factor, stream); group_size, scale_factor);
} }
void carafe_naive_backward_cuda(CudaContext& ctx, const SSElement& attr, void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
const OperatorBase::in_list_t& ins, Tensor bottom_grad, Tensor mask_grad,
OperatorBase::out_list_t& outs) { int kernel_size, int group_size,
int kernel_size, group_size, scale_factor; int scale_factor) {
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = ins[0];
const auto& features = ins[1];
const auto& masks = ins[2];
auto& bottom_grad = outs[0];
auto& mask_grad = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad, CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,
mask_grad, kernel_size, group_size, mask_grad, kernel_size, group_size,
scale_factor, stream); scale_factor);
} }
#endif
PARROTS_EXTENSION_REGISTER(carafe_naive_forward) void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
.attr("kernel_size") int kernel_size, int group_size, int scale_factor) {
.attr("group_size") if (features.device().is_cuda()) {
.attr("scale_factor") #ifdef MMCV_WITH_CUDA
.input(2) CHECK_CUDA_INPUT(features);
.output(1) CHECK_CUDA_INPUT(masks);
.apply(carafe_naive_forward_cuda) CHECK_CUDA_INPUT(output);
.done(); carafe_naive_forward_cuda(features, masks, output, kernel_size, group_size,
scale_factor);
#else
AT_ERROR("CarafeNaive is not compiled with GPU support");
#endif
} else {
AT_ERROR("CarafeNaive is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(carafe_naive_backward) void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
.attr("kernel_size") Tensor bottom_grad, Tensor mask_grad,
.attr("group_size") int kernel_size, int group_size, int scale_factor) {
.attr("scale_factor") if (top_grad.device().is_cuda()) {
.input(3) #ifdef MMCV_WITH_CUDA
.output(2) CHECK_CUDA_INPUT(top_grad);
.apply(carafe_naive_backward_cuda) CHECK_CUDA_INPUT(features);
.done(); CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(bottom_grad);
CHECK_CUDA_INPUT(mask_grad);
carafe_naive_backward_cuda(top_grad, features, masks, bottom_grad,
mask_grad, kernel_size, group_size,
scale_factor);
#else
AT_ERROR("CarafeNaive is not compiled with GPU support");
#endif
} else {
AT_ERROR("CarafeNaive is not implemented on CPU");
}
}
#include "carafe_naive_cuda_kernel.cuh" #include "carafe_naive_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher( void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
const DArrayLite features, const DArrayLite masks, DArrayLite output, const Tensor masks, Tensor output,
const int kernel_size, const int group_size, const int scale_factor, const int kernel_size,
cudaStream_t stream) { const int group_size,
int output_size = output.size(); const int scale_factor) {
int channels = output.dim(1); int output_size = output.numel();
int height = output.dim(2); int channels = output.size(1);
int width = output.dim(3); int height = output.size(2);
int width = output.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(features.device());
features.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "CARAFENAIVEForward", ([&] {
carafe_naive_forward_cuda_kernel<scalar_t> carafe_naive_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, features.ptr<scalar_t>(), masks.ptr<scalar_t>(), output_size, features.data_ptr<scalar_t>(),
output.ptr<scalar_t>(), kernel_size, group_size, scale_factor, masks.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
channels, height, width); kernel_size, group_size, scale_factor, channels, height, width);
})); }));
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
void CARAFENAIVEBackwardCUDAKernelLauncher( void CARAFENAIVEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite features, const Tensor top_grad, const Tensor features, const Tensor masks,
const DArrayLite masks, DArrayLite bottom_grad, DArrayLite mask_grad, Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
const int kernel_size, const int group_size, const int scale_factor, const int group_size, const int scale_factor) {
cudaStream_t stream) { int output_size = top_grad.numel();
int output_size = top_grad.size(); int channels = top_grad.size(1);
int channels = top_grad.dim(1); int height = top_grad.size(2);
int height = top_grad.dim(2); int width = top_grad.size(3);
int width = top_grad.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( at::cuda::CUDAGuard device_guard(top_grad.device());
features.elemType().prim(), ([&] { cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFENAIVEBackward", ([&] {
carafe_naive_backward_cuda_kernel<scalar_t> carafe_naive_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, top_grad.ptr<scalar_t>(), features.ptr<scalar_t>(), output_size, top_grad.data_ptr<scalar_t>(),
masks.ptr<scalar_t>(), bottom_grad.ptr<scalar_t>(), features.data_ptr<scalar_t>(), masks.data_ptr<scalar_t>(),
mask_grad.ptr<scalar_t>(), kernel_size, group_size, bottom_grad.data_ptr<scalar_t>(),
mask_grad.data_ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, height, width); scale_factor, channels, height, width);
})); }));
PARROTS_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "carafe_naive_pytorch.h"
using namespace parrots;
/*void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
* int kernel_size, int group_size,
* int scale_factor)
*/
void carafe_naive_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = buildATensor(ctx, ins[0]);
const auto& masks = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
carafe_naive_forward_cuda(features, masks, output, kernel_size, group_size,
scale_factor);
}
/*void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor
* masks, Tensor bottom_grad, Tensor mask_grad, int kernel_size, int group_size,
* int scale_factor);
*/
void carafe_naive_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = buildATensor(ctx, ins[0]);
const auto& features = buildATensor(ctx, ins[1]);
const auto& masks = buildATensor(ctx, ins[2]);
auto bottom_grad = buildATensor(ctx, outs[0]);
auto mask_grad = buildATensor(ctx, outs[1]);
carafe_naive_backward_cuda(top_grad, features, masks, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
}
PARROTS_EXTENSION_REGISTER(carafe_naive_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(1)
.apply(carafe_naive_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(carafe_naive_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(2)
.apply(carafe_naive_backward_cuda_parrots)
.done();
#ifndef CARAFE_NAIVE_PYTORCH_H
#define CARAFE_NAIVE_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size,
int scale_factor);
void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size,
int scale_factor);
#endif // CARAFE_NAIVE_PYTORCH_H
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "carafe_pytorch.h"
using namespace parrots;
/*
* void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
* Tensor routput, Tensor rmasks, Tensor output,
* int kernel_size, int group_size, int scale_factor);
*/
void carafe_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = buildATensor(ctx, ins[0]);
const auto& masks = buildATensor(ctx, ins[1]);
auto rfeatures = buildATensor(ctx, outs[0]);
auto routput = buildATensor(ctx, outs[1]);
auto rmasks = buildATensor(ctx, outs[2]);
auto output = buildATensor(ctx, outs[3]);
carafe_forward_cuda(features, masks, rfeatures, routput, rmasks, output,
kernel_size, group_size, scale_factor);
}
/*
* void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
* Tensor rtop_grad, Tensor rbottom_grad_hs,
* Tensor rbottom_grad, Tensor rmask_grad,
* Tensor bottom_grad, Tensor mask_grad, int
* kernel_size, int group_size, int scale_factor);
*/
void carafe_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = buildATensor(ctx, ins[0]);
const auto& rfeatures = buildATensor(ctx, ins[1]);
const auto& masks = buildATensor(ctx, ins[2]);
auto rtop_grad = buildATensor(ctx, outs[0]);
auto rbottom_grad_hs = buildATensor(ctx, outs[1]);
auto rbottom_grad = buildATensor(ctx, outs[2]);
auto rmask_grad = buildATensor(ctx, outs[3]);
auto bottom_grad = buildATensor(ctx, outs[4]);
auto mask_grad = buildATensor(ctx, outs[5]);
carafe_backward_cuda(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
rbottom_grad, rmask_grad, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
}
PARROTS_EXTENSION_REGISTER(carafe_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(4)
.apply(carafe_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(carafe_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(6)
.apply(carafe_backward_cuda_parrots)
.done();
#ifndef CARAFE_PYTORCH_H
#define CARAFE_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor);
void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad,
Tensor bottom_grad, Tensor mask_grad, int kernel_size,
int group_size, int scale_factor);
#endif // CARAFE_PYTORCH_H
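The carafe entry points declared above back the functional carafe op on the Python side. The sketch below is not part of this diff; the shapes and the mask normalization are assumptions chosen so the call is well formed (mask channels = kernel_size**2 * group_size at the upsampled resolution).

import torch
from mmcv.ops import carafe

# assumed example: upsample a (2, 16, 10, 10) feature map by 2x with a
# 5x5 reassembly kernel and a single group
feats = torch.rand(2, 16, 10, 10).cuda()
masks = torch.softmax(torch.rand(2, 25, 20, 20).cuda(), dim=1)

out = carafe(feats, masks, 5, 1, 2)  # kernel_size, group_size, scale_factor
print(out.shape)  # torch.Size([2, 16, 20, 20])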
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
void CAForwardCUDAKernelLauncher(const DArrayLite t, const DArrayLite f, #ifdef MMCV_WITH_CUDA
DArrayLite weight, CudaContext &ctx, void CAForwardCUDAKernelLauncher(const Tensor t, const Tensor f, Tensor weight);
cudaStream_t stream);
void CABackwardCUDAKernelLauncher(const DArrayLite dw, const DArrayLite t, void CABackwardCUDAKernelLauncher(const Tensor dw, const Tensor t,
const DArrayLite f, DArrayLite dt, const Tensor f, Tensor dt, Tensor df);
DArrayLite df, CudaContext &ctx,
cudaStream_t stream);
void CAMapForwardCUDAKernelLauncher(const DArrayLite weight, const DArrayLite g, void CAMapForwardCUDAKernelLauncher(const Tensor weight, const Tensor g,
DArrayLite out, CudaContext &ctx, Tensor out);
cudaStream_t stream);
void CAMapBackwardCUDAKernelLauncher(const DArrayLite dout, void CAMapBackwardCUDAKernelLauncher(const Tensor dout, const Tensor weight,
const DArrayLite weight, const Tensor g, Tensor dw, Tensor dg);
const DArrayLite g, DArrayLite dw,
DArrayLite dg, CudaContext &ctx,
cudaStream_t stream);
void ca_forward_cuda(CudaContext &ctx, const SSElement &attr, void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight) {
const OperatorBase::in_list_t &ins, CAForwardCUDAKernelLauncher(t, f, weight);
OperatorBase::out_list_t &outs) {
const auto &t = ins[0];
const auto &f = ins[1];
auto &weight = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAForwardCUDAKernelLauncher(t, f, weight, ctx, stream);
} }
void ca_backward_cuda(CudaContext &ctx, const SSElement &attr, void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
const OperatorBase::in_list_t &ins, Tensor dt, Tensor df) {
OperatorBase::out_list_t &outs) { CABackwardCUDAKernelLauncher(dw, t, f, dt, df);
const auto &dw = ins[0];
const auto &t = ins[1];
const auto &f = ins[2];
auto &dt = outs[0];
auto &df = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CABackwardCUDAKernelLauncher(dw, t, f, dt, df, ctx, stream);
} }
void ca_map_forward_cuda(CudaContext &ctx, const SSElement &attr, void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out) {
const OperatorBase::in_list_t &ins, CAMapForwardCUDAKernelLauncher(weight, g, out);
OperatorBase::out_list_t &outs) {
const auto &weight = ins[0];
const auto &g = ins[1];
auto &out = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAMapForwardCUDAKernelLauncher(weight, g, out, ctx, stream);
} }
void ca_map_backward_cuda(CudaContext &ctx, const SSElement &attr, void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
const OperatorBase::in_list_t &ins, const Tensor g, Tensor dw, Tensor dg) {
OperatorBase::out_list_t &outs) { CAMapBackwardCUDAKernelLauncher(dout, weight, g, dw, dg);
const auto &dout = ins[0];
const auto &weight = ins[1];
const auto &g = ins[2];
auto &dw = outs[0];
auto &dg = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAMapBackwardCUDAKernelLauncher(dout, weight, g, dw, dg, ctx, stream);
} }
#endif
PARROTS_EXTENSION_REGISTER(ca_forward) void ca_forward(const Tensor t, const Tensor f, Tensor weight) {
.input(2) if (t.device().is_cuda()) {
.output(1) #ifdef MMCV_WITH_CUDA
.apply(ca_forward_cuda) CHECK_CUDA_INPUT(t);
.done(); CHECK_CUDA_INPUT(f);
CHECK_CUDA_INPUT(weight);
ca_forward_cuda(t, f, weight);
#else
AT_ERROR("ca is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
PARROTS_EXTENSION_REGISTER(ca_backward) void ca_backward(const Tensor dw, const Tensor t, const Tensor f, Tensor dt,
.input(3) Tensor df) {
.output(2) if (dw.device().is_cuda()) {
.apply(ca_backward_cuda) #ifdef MMCV_WITH_CUDA
.done(); CHECK_CUDA_INPUT(dw);
CHECK_CUDA_INPUT(t);
CHECK_CUDA_INPUT(f);
CHECK_CUDA_INPUT(dt);
CHECK_CUDA_INPUT(df);
ca_backward_cuda(dw, t, f, dt, df);
#else
AT_ERROR("ca is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
PARROTS_EXTENSION_REGISTER(ca_map_forward) void ca_map_forward(const Tensor weight, const Tensor g, Tensor out) {
.input(2) if (weight.device().is_cuda()) {
.output(1) #ifdef MMCV_WITH_CUDA
.apply(ca_map_forward_cuda) CHECK_CUDA_INPUT(weight);
.done(); CHECK_CUDA_INPUT(g);
CHECK_CUDA_INPUT(out);
ca_map_forward_cuda(weight, g, out);
#else
AT_ERROR("ca_map is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
PARROTS_EXTENSION_REGISTER(ca_map_backward) void ca_map_backward(const Tensor dout, const Tensor weight, const Tensor g,
.input(3) Tensor dw, Tensor dg) {
.output(2) if (dout.device().is_cuda()) {
.apply(ca_map_backward_cuda) #ifdef MMCV_WITH_CUDA
.done(); CHECK_CUDA_INPUT(dout);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(g);
CHECK_CUDA_INPUT(dw);
CHECK_CUDA_INPUT(dg);
ca_map_backward_cuda(dout, weight, g, dw, dg);
#else
AT_ERROR("ca_map is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
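On the Python side, the ca_forward/ca_backward and ca_map_forward/ca_map_backward entry points above back the CrissCrossAttention module in mmcv.ops. A minimal usage sketch with assumed shapes and channel count (not part of this diff):

import torch
from mmcv.ops import CrissCrossAttention

# 64 input channels and a 32x32 feature map are arbitrary example values
cca = CrissCrossAttention(64).cuda()
x = torch.rand(2, 64, 32, 32).cuda()

out = cca(x)  # criss-cross attention preserves the input shape
assert out.shape == x.shape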
// Modified from
// https://github.com/LikeLy-Journey/SegmenTron/blob/master/segmentron/modules/csrc/criss_cross_attention/ca_cuda.cu
#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>
#include "cc_attention_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void CAForwardCUDAKernelLauncher(const Tensor t, const Tensor f,
Tensor weight) {
AT_ASSERTM(t.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(f.device().is_cuda(), "input must be a CUDA tensor");
auto n = t.size(0);
auto c = t.size(1);
auto h = t.size(2);
auto w = t.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(t.scalar_type(), "ca_forward", [&] {
ca_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CABackwardCUDAKernelLauncher(const Tensor dw, const Tensor t,
const Tensor f, Tensor dt, Tensor df) {
AT_ASSERTM(dw.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(t.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(f.device().is_cuda(), "input must be a CUDA tensor");
auto n = t.size(0);
auto c = t.size(1);
auto h = t.size(2);
auto w = t.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(t.scalar_type(), "ca_backward_kernel_t", [&] {
ca_backward_kernel_t<scalar_t><<<blocks, threads, 0, stream>>>(
dw.contiguous().data_ptr<scalar_t>(),
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
dt.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
AT_DISPATCH_FLOATING_TYPES(f.scalar_type(), "ca_backward_kernel_f", [&] {
ca_backward_kernel_f<scalar_t><<<blocks, threads, 0, stream>>>(
dw.contiguous().data_ptr<scalar_t>(),
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
df.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CAMapForwardCUDAKernelLauncher(const Tensor weight, const Tensor g,
Tensor out) {
AT_ASSERTM(weight.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(g.device().is_cuda(), "input must be a CUDA tensor");
auto n = g.size(0);
auto c = g.size(1);
auto h = g.size(2);
auto w = g.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(g.scalar_type(), "ca_map_forward", [&] {
ca_map_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
out.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CAMapBackwardCUDAKernelLauncher(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg) {
AT_ASSERTM(dout.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(weight.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(g.device().is_cuda(), "input must be a CUDA tensor");
auto n = dout.size(0);
auto c = dout.size(1);
auto h = dout.size(2);
auto w = dout.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(
weight.scalar_type(), "ca_map_backward_kernel_w", [&] {
ca_map_backward_kernel_w<scalar_t><<<blocks, threads, 0, stream>>>(
dout.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
dw.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
AT_DISPATCH_FLOATING_TYPES(g.scalar_type(), "ca_map_backward_kernel_g", [&] {
ca_map_backward_kernel_g<scalar_t><<<blocks, threads, 0, stream>>>(
dout.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
dg.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
#include "cc_attention_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void CAForwardCUDAKernelLauncher(const DArrayLite t, const DArrayLite f,
DArrayLite weight, CudaContext &ctx,
cudaStream_t stream) {
auto n = t.dim(0);
auto c = t.dim(1);
auto h = t.dim(2);
auto w = t.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(t.elemType().prim(), [&] {
ca_forward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(t.ptr<scalar_t>(), f.ptr<scalar_t>(),
weight.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CABackwardCUDAKernelLauncher(const DArrayLite dw, const DArrayLite t,
const DArrayLite f, DArrayLite dt,
DArrayLite df, CudaContext &ctx,
cudaStream_t stream) {
auto n = t.dim(0);
auto c = t.dim(1);
auto h = t.dim(2);
auto w = t.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(t.elemType().prim(), [&] {
ca_backward_kernel_t<scalar_t><<<blocks, threads, 0, stream>>>(
dw.ptr<scalar_t>(), t.ptr<scalar_t>(), f.ptr<scalar_t>(),
dt.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_DISPATCH_FLOATING_TYPES(f.elemType().prim(), [&] {
ca_backward_kernel_f<scalar_t><<<blocks, threads, 0, stream>>>(
dw.ptr<scalar_t>(), t.ptr<scalar_t>(), f.ptr<scalar_t>(),
df.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CAMapForwardCUDAKernelLauncher(const DArrayLite weight, const DArrayLite g,
DArrayLite out, CudaContext &ctx,
cudaStream_t stream) {
auto n = g.dim(0);
auto c = g.dim(1);
auto h = g.dim(2);
auto w = g.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(g.elemType().prim(), [&] {
ca_map_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
weight.ptr<scalar_t>(), g.ptr<scalar_t>(), out.ptr<scalar_t>(), n, c, h,
w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CAMapBackwardCUDAKernelLauncher(const DArrayLite dout,
const DArrayLite weight,
const DArrayLite g, DArrayLite dw,
DArrayLite dg, CudaContext &ctx,
cudaStream_t stream) {
auto n = dout.dim(0);
auto c = dout.dim(1);
auto h = dout.dim(2);
auto w = dout.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(weight.elemType().prim(), [&] {
ca_map_backward_kernel_w<scalar_t><<<blocks, threads, 0, stream>>>(
dout.ptr<scalar_t>(), weight.ptr<scalar_t>(), g.ptr<scalar_t>(),
dw.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_DISPATCH_FLOATING_TYPES(g.elemType().prim(), [&] {
ca_map_backward_kernel_g<scalar_t><<<blocks, threads, 0, stream>>>(
dout.ptr<scalar_t>(), weight.ptr<scalar_t>(), g.ptr<scalar_t>(),
dg.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}