Unverified commit 9b1209fa authored by CokeDong, committed by GitHub

[Feature] Support mmcv ext with DIOPI impl (#2790)

parent 558742c9
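
Every file in this commit applies the same dispatch pattern: each op gains a *_diopi wrapper that converts the leading tensor to a DIOPI handle, routes host tensors to the existing *_impl dispatcher, and otherwise binds a diopiContext to the current DIPU stream and calls the weakly-linked DIOPI kernel; when the symbol is absent or the call fails, the wrapper copies the inputs to the CPU, runs the CPU kernel, and copies the outputs back. A minimal sketch of that skeleton, assuming the same headers and helpers the files below include (foo_diopi, foo_impl, and diopiFooMmcv are hypothetical names, not part of this commit):

// Sketch only: foo_impl and diopiFooMmcv are placeholders for the shape
// every concrete wrapper below instantiates.
void foo_diopi(Tensor in, Tensor out) {
  auto in_p = toDiopiTensorHandle(in);
  diopiDevice_t device;
  diopiGetTensorDevice(in_p, &device);
  if (device == diopi_host) {
    foo_impl(in, out);  // host tensors keep the existing dispatch path
    return;
  }
  // Launch the DIOPI kernel on the caller's DIPU stream.
  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
  diopiContextHandle_t ch = &ctx;
  auto out_p = toDiopiTensorHandle(out);
  // Weak symbol: a null address means the vendor library lacks this kernel.
  if (reinterpret_cast<void *>(diopiFooMmcv) != nullptr) {
    if (diopiFooMmcv(ch, out_p, in_p) == diopiSuccess) return;
  }
  LOG(WARNING) << "Fallback to cpu: mmcv ext op foo";
  auto in_cpu = in.cpu();
  auto out_cpu = out.cpu();
  foo_impl(in_cpu, out_cpu);
  out.copy_(out_cpu);  // write the host result back to the device tensor
}

The per-op wrappers below differ only in their argument lists and in which outputs they copy back.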
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include "csrc_dipu/diopirt/diopirt_impl.h"
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
@@ -8,7 +18,40 @@ void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
aligned, offset);
}
#ifdef MMCV_WITH_DIOPI
void bbox_overlaps_diopi(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int mode, const bool aligned,
const int offset) {
auto bboxes1_p = toDiopiTensorHandle(bboxes1);
diopiDevice_t device;
diopiGetTensorDevice(bboxes1_p, &device);
if (device == diopi_host) {
bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
return;
}
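  // Bind a DIOPI context to the current DIPU stream so the kernel runs on
  // the caller's stream.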
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
auto bboxes2_p = toDiopiTensorHandle(bboxes2);
auto ious_p = toDiopiTensorHandle(ious);
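  // DIOPI entry points are weak symbols (DIOPI_ATTR_WEAK in setup.py); a
  // null address means the vendor library does not provide this kernel.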
if (reinterpret_cast<void *>(diopiBboxOverlapsMmcv) != nullptr) {
auto ret = diopiBboxOverlapsMmcv(ch, ious_p, bboxes1_p, bboxes2_p, mode,
offset, aligned);
if (ret == diopiSuccess) return;
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op bbox_overlaps";
auto bboxes1_cpu = bboxes1.cpu();
auto bboxes2_cpu = bboxes2.cpu();
auto ious_cpu = ious.cpu();
bbox_overlaps_impl(bboxes1_cpu, bboxes2_cpu, ious_cpu, mode, aligned, offset);
ious.copy_(ious_cpu);
}
#endif
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
#ifdef MMCV_WITH_DIOPI
bbox_overlaps_diopi(bboxes1, bboxes2, ious, mode, aligned, offset);
#else
bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
#endif
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include "csrc_dipu/diopirt/diopirt_impl.h"
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
@@ -29,15 +39,92 @@ void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
buff, grad_input, gamma, alpha);
}
#ifdef MMCV_WITH_DIOPI
void sigmoid_focal_loss_forward_diopi(Tensor input, Tensor target,
Tensor weight, Tensor output, float gamma,
float alpha) {
auto input_p = toDiopiTensorHandle(input);
diopiDevice_t device;
diopiGetTensorDevice(input_p, &device);
if (device == diopi_host) {
sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma,
alpha);
return;
}
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
auto target_p = toDiopiTensorHandle(target);
auto weight_p = toDiopiTensorHandle(weight);
auto output_p = toDiopiTensorHandle(output);
if (reinterpret_cast<void *>(diopiSigmoidFocalLossMmcv) != nullptr) {
auto ret = diopiSigmoidFocalLossMmcv(ch, output_p, input_p, target_p,
weight_p, gamma, alpha);
if (ret == diopiSuccess) return;
}
  LOG(WARNING) << "Fallback to cpu: mmcv ext op sigmoid_focal_loss_forward";
auto input_cpu = input.cpu();
auto target_cpu = target.cpu();
auto weight_cpu = weight.cpu();
auto output_cpu = output.cpu();
sigmoid_focal_loss_forward_impl(input_cpu, target_cpu, weight_cpu, output_cpu,
gamma, alpha);
output.copy_(output_cpu);
return;
}
void sigmoid_focal_loss_backward_diopi(Tensor input, Tensor target,
Tensor weight, Tensor grad_input,
float gamma, float alpha) {
auto input_p = toDiopiTensorHandle(input);
diopiDevice_t device;
diopiGetTensorDevice(input_p, &device);
if (device == diopi_host) {
sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
alpha);
return;
}
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
auto target_p = toDiopiTensorHandle(target);
auto weight_p = toDiopiTensorHandle(weight);
auto grad_input_p = toDiopiTensorHandle(grad_input);
if (reinterpret_cast<void *>(diopiSigmoidFocalLossBackwardMmcv) != nullptr) {
auto ret = diopiSigmoidFocalLossBackwardMmcv(
ch, grad_input_p, input_p, target_p, weight_p, gamma, alpha);
if (ret == diopiSuccess) return;
}
  LOG(WARNING) << "Fallback to cpu: mmcv ext op sigmoid_focal_loss_backward";
auto input_cpu = input.cpu();
auto target_cpu = target.cpu();
auto weight_cpu = weight.cpu();
auto grad_input_cpu = grad_input.cpu();
sigmoid_focal_loss_backward_impl(input_cpu, target_cpu, weight_cpu,
grad_input_cpu, gamma, alpha);
grad_input.copy_(grad_input_cpu);
return;
}
#endif
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
#ifdef MMCV_WITH_DIOPI
sigmoid_focal_loss_forward_diopi(input, target, weight, output, gamma, alpha);
#else
sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
#endif
}
void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma, float alpha) {
#ifdef MMCV_WITH_DIOPI
sigmoid_focal_loss_backward_diopi(input, target, weight, grad_input, gamma,
alpha);
#else
sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
alpha);
#endif
}
void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include "csrc_dipu/diopirt/diopirt_impl.h"
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
void modulated_deformable_im2col_impl(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
@@ -45,7 +55,7 @@ void modulated_deformable_col2im_coord_impl(
dilation_w, deformable_group, grad_offset, grad_mask);
}
void modulated_deform_conv_forward_fallthrough(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
@@ -123,7 +133,7 @@ void modulated_deform_conv_forward(
}
}
void modulated_deform_conv_backward_fallthrough(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
@@ -235,3 +245,165 @@ void modulated_deform_conv_backward(
grad_output.size(2), grad_output.size(3),
grad_output.size(4)});
}
#ifdef MMCV_WITH_DIOPI
void modulated_deform_conv_forward_diopi(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias) {
auto input_p = toDiopiTensorHandle(input);
diopiDevice_t device;
diopiGetTensorDevice(input_p, &device);
if (device == diopi_host) {
modulated_deform_conv_forward_fallthrough(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
group, deformable_group, with_bias);
return;
}
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
auto weight_p = toDiopiTensorHandle(weight);
auto bias_p = toDiopiTensorHandle(bias);
auto ones_p = toDiopiTensorHandle(ones);
auto offset_p = toDiopiTensorHandle(offset);
auto mask_p = toDiopiTensorHandle(mask);
auto output_p = toDiopiTensorHandle(output);
auto columns_p = toDiopiTensorHandle(columns);
if (reinterpret_cast<void*>(diopiModulatedDeformConvMmcv) != nullptr) {
auto ret = diopiModulatedDeformConvMmcv(
ch, output_p, columns_p, ones_p, input_p, weight_p, bias_p, offset_p,
mask_p, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, group, deformable_group, with_bias);
if (ret == diopiSuccess) return;
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op modulated_deform_conv_forward";
auto input_cpu = input.cpu();
auto weight_cpu = weight.cpu();
auto bias_cpu = bias.cpu();
auto ones_cpu = ones.cpu();
auto offset_cpu = offset.cpu();
auto mask_cpu = mask.cpu();
auto output_cpu = output.cpu();
auto columns_cpu = columns.cpu();
modulated_deform_conv_forward_fallthrough(
input_cpu, weight_cpu, bias_cpu, ones_cpu, offset_cpu, mask_cpu,
output_cpu, columns_cpu, kernel_h, kernel_w, stride_h, stride_w, pad_h,
pad_w, dilation_h, dilation_w, group, deformable_group, with_bias);
output.copy_(output_cpu);
return;
}
void modulated_deform_conv_backward_diopi(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
auto input_p = toDiopiTensorHandle(input);
diopiDevice_t device;
diopiGetTensorDevice(input_p, &device);
if (device == diopi_host) {
modulated_deform_conv_backward_fallthrough(
input, weight, bias, ones, offset, mask, columns, grad_input,
grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
group, deformable_group, with_bias);
return;
}
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
auto weight_p = toDiopiTensorHandle(weight);
auto bias_p = toDiopiTensorHandle(bias);
auto ones_p = toDiopiTensorHandle(ones);
auto offset_p = toDiopiTensorHandle(offset);
auto mask_p = toDiopiTensorHandle(mask);
auto columns_p = toDiopiTensorHandle(columns);
auto grad_input_p = toDiopiTensorHandle(grad_input);
auto grad_weight_p = toDiopiTensorHandle(grad_weight);
auto grad_bias_p = toDiopiTensorHandle(grad_bias);
auto grad_offset_p = toDiopiTensorHandle(grad_offset);
auto grad_mask_p = toDiopiTensorHandle(grad_mask);
auto grad_output_p = toDiopiTensorHandle(grad_output);
if (reinterpret_cast<void*>(diopiModulatedDeformConvBackwardMmcv) !=
nullptr) {
auto ret = diopiModulatedDeformConvBackwardMmcv(
ch, grad_input_p, grad_weight_p, grad_bias_p, grad_offset_p,
grad_mask_p, input_p, weight_p, bias_p, ones_p, offset_p, mask_p,
columns_p, grad_output_p, kernel_h, kernel_w, stride_h, stride_w, pad_h,
pad_w, dilation_h, dilation_w, group, deformable_group, with_bias);
if (ret == diopiSuccess) return;
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op modulated_deform_conv_forward";
auto input_cpu = input.cpu();
auto weight_cpu = weight.cpu();
auto bias_cpu = bias.cpu();
auto ones_cpu = ones.cpu();
auto offset_cpu = offset.cpu();
auto mask_cpu = mask.cpu();
auto columns_cpu = columns.cpu();
auto grad_input_cpu = grad_input.cpu();
auto grad_weight_cpu = grad_weight.cpu();
auto grad_bias_cpu = grad_bias.cpu();
auto grad_offset_cpu = grad_offset.cpu();
auto grad_mask_cpu = grad_mask.cpu();
auto grad_output_cpu = grad_output.cpu();
modulated_deform_conv_backward_fallthrough(
input_cpu, weight_cpu, bias_cpu, ones_cpu, offset_cpu, mask_cpu,
columns_cpu, grad_input_cpu, grad_weight_cpu, grad_bias_cpu,
grad_offset_cpu, grad_mask_cpu, grad_output_cpu, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
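  // Copy every host-computed gradient back into the caller's device tensors.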
grad_input.copy_(grad_input_cpu);
grad_weight.copy_(grad_weight_cpu);
grad_bias.copy_(grad_bias_cpu);
grad_offset.copy_(grad_offset_cpu);
grad_mask.copy_(grad_mask_cpu);
return;
}
#endif
void modulated_deform_conv_forward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias) {
#ifdef MMCV_WITH_DIOPI
modulated_deform_conv_forward_diopi(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
#else
modulated_deform_conv_forward_fallthrough(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
#endif
}
void modulated_deform_conv_backward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
#ifdef MMCV_WITH_DIOPI
modulated_deform_conv_backward_diopi(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
#else
modulated_deform_conv_backward_fallthrough(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
#endif
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include "csrc_dipu/diopirt/diopirt_impl.h"
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);
@@ -18,8 +28,41 @@ std::vector<std::vector<int> > nms_match_impl(Tensor dets,
return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);
}
#ifdef MMCV_WITH_DIOPI
Tensor nms_diopi(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
auto boxes_p = toDiopiTensorHandle(boxes);
diopiDevice_t device;
diopiGetTensorDevice(boxes_p, &device);
if (device == diopi_host) {
return nms_impl(boxes, scores, iou_threshold, offset);
}
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
Tensor out;
auto outp = toDiopiTensorHandle(out);
diopiTensorHandle_t* outhandle = &outp;
auto scores_p = toDiopiTensorHandle(scores);
if (reinterpret_cast<void*>(diopiNmsMmcv) != nullptr) {
auto ret =
diopiNmsMmcv(ch, outhandle, boxes_p, scores_p, iou_threshold, offset);
if (ret == diopiSuccess) {
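      // Assumption: in DIPU's DIOPI runtime a tensor handle wraps an
      // at::Tensor, so the filled output handle can be cast straight back.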
auto tensorhandle = reinterpret_cast<Tensor*>(*outhandle);
return *tensorhandle;
}
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op nms";
auto boxes_cpu = boxes.cpu();
auto scores_cpu = scores.cpu();
return nms_impl(boxes_cpu, scores_cpu, iou_threshold, offset);
}
#endif
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
#ifdef MMCV_WITH_DIOPI
return nms_diopi(boxes, scores, iou_threshold, offset);
#else
return nms_impl(boxes, scores, iou_threshold, offset);
#endif
}
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include "csrc_dipu/diopirt/diopirt_impl.h"
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
@@ -22,20 +32,111 @@ void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
spatial_scale, sampling_ratio, pool_mode, aligned);
}
#ifdef MMCV_WITH_DIOPI
void roi_align_forward_diopi(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
auto input_p = toDiopiTensorHandle(input);
diopiDevice_t device;
diopiGetTensorDevice(input_p, &device);
if (device == diopi_host) {
roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
return;
}
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
auto rois_p = toDiopiTensorHandle(rois);
auto out_p = toDiopiTensorHandle(output);
auto argmax_y_p = toDiopiTensorHandle(argmax_y);
auto argmax_x_p = toDiopiTensorHandle(argmax_x);
if (reinterpret_cast<void*>(diopiRoiAlignMmcv) != nullptr) {
auto ret = diopiRoiAlignMmcv(
ch, out_p, argmax_y_p, argmax_x_p, input_p, rois_p, aligned_height,
aligned_width, sampling_ratio, pool_mode, spatial_scale, aligned);
if (ret == diopiSuccess) return;
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op roi_align_forward";
auto input_cpu = input.cpu();
auto rois_cpu = rois.cpu();
auto out_cpu = output.cpu();
auto argmax_y_cpu = argmax_y.cpu();
auto argmax_x_cpu = argmax_x.cpu();
roi_align_forward_impl(input_cpu, rois_cpu, out_cpu, argmax_y_cpu,
argmax_x_cpu, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
output.copy_(out_cpu);
}
void roi_align_backward_diopi(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
auto grad_output_ = toDiopiTensorHandle(grad_output);
diopiDevice_t device;
diopiGetTensorDevice(grad_output_, &device);
if (device == diopi_host) {
roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
return;
}
auto rois_ = toDiopiTensorHandle(rois);
auto argmax_y_ = toDiopiTensorHandle(argmax_y);
auto argmax_x_ = toDiopiTensorHandle(argmax_x);
auto grad_input_ = toDiopiTensorHandle(grad_input);
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
if (reinterpret_cast<void*>(diopiRoiAlignBackwardMmcv) != nullptr) {
auto ret = diopiRoiAlignBackwardMmcv(ch, grad_input_, grad_output_, rois_,
argmax_y_, argmax_x_, aligned_height,
aligned_width, sampling_ratio,
pool_mode, spatial_scale, aligned);
if (ret == diopiSuccess) return;
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op roi_align_backward";
auto grad_output_cpu = grad_output.cpu();
auto rois_cpu = rois.cpu();
auto argmax_y_cpu = argmax_y.cpu();
auto argmax_x_cpu = argmax_x.cpu();
auto grad_input_cpu = grad_input.cpu();
roi_align_backward_impl(grad_output_cpu, rois_cpu, argmax_y_cpu, argmax_x_cpu,
grad_input_cpu, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
grad_input.copy_(grad_input_cpu);
}
#endif
void roi_align_forward(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
#ifdef MMCV_WITH_DIOPI
roi_align_forward_diopi(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#endif
}
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
#ifdef MMCV_WITH_DIOPI
roi_align_backward_diopi(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#endif
}
// Copyright (c) OpenMMLab. All rights reserved.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include "csrc_dipu/diopirt/diopirt_impl.h"
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors,
@@ -33,6 +43,132 @@ void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
coors_range, NDim);
}
#ifdef MMCV_WITH_DIOPI
void hard_voxelize_forward_diopi(const at::Tensor &points,
const at::Tensor &voxel_size,
const at::Tensor &coors_range,
at::Tensor &voxels, at::Tensor &coors,
at::Tensor &num_points_per_voxel,
at::Tensor &voxel_num, const int max_points,
const int max_voxels, const int NDim = 3,
const bool deterministic = true) {
auto points_p = toDiopiTensorHandle(points);
diopiDevice_t device;
diopiGetTensorDevice(points_p, &device);
if (device == diopi_host) {
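    // The CPU impl takes voxel_size / coors_range as std::vector<float>, so
    // unpack the tensors before dispatching.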
int64_t *voxel_num_data = voxel_num.data_ptr<int64_t>();
std::vector<float> voxel_size_v(
voxel_size.data_ptr<float>(),
voxel_size.data_ptr<float>() + voxel_size.numel());
std::vector<float> coors_range_v(
coors_range.data_ptr<float>(),
coors_range.data_ptr<float>() + coors_range.numel());
if (deterministic) {
*voxel_num_data = hard_voxelize_forward_impl(
points, voxels, coors, num_points_per_voxel, voxel_size_v,
coors_range_v, max_points, max_voxels, NDim);
} else {
TORCH_CHECK(
deterministic,
"nondeterministic hard_voxelize_forward is not supported on host!");
}
return;
}
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
auto voxel_size_p = toDiopiTensorHandle(voxel_size);
auto coors_range_p = toDiopiTensorHandle(coors_range);
auto voxels_p = toDiopiTensorHandle(voxels);
auto coors_p = toDiopiTensorHandle(coors);
auto num_points_per_voxel_p = toDiopiTensorHandle(num_points_per_voxel);
auto voxel_num_p = toDiopiTensorHandle(voxel_num);
if (reinterpret_cast<void *>(diopiHardVoxelizeMmcv) != nullptr) {
auto ret = diopiHardVoxelizeMmcv(
ch, voxels_p, coors_p, num_points_per_voxel_p, voxel_num_p, points_p,
voxel_size_p, coors_range_p, max_points, max_voxels, NDim,
deterministic);
if (ret == diopiSuccess) return;
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op hard_voxelize_forward";
auto points_cpu = points.cpu();
auto voxel_size_cpu = voxel_size.cpu();
auto coors_range_cpu = coors_range.cpu();
auto voxels_cpu = voxels.cpu();
auto coors_cpu = coors.cpu();
auto num_points_per_voxel_cpu = num_points_per_voxel.cpu();
auto voxel_num_cpu = voxel_num.cpu();
int64_t *voxel_num_data_cpu = voxel_num_cpu.data_ptr<int64_t>();
std::vector<float> voxel_size_v_cpu(
voxel_size_cpu.data_ptr<float>(),
voxel_size_cpu.data_ptr<float>() + voxel_size_cpu.numel());
std::vector<float> coors_range_v_cpu(
coors_range_cpu.data_ptr<float>(),
coors_range_cpu.data_ptr<float>() + coors_range_cpu.numel());
if (deterministic) {
*voxel_num_data_cpu = hard_voxelize_forward_impl(
points_cpu, voxels_cpu, coors_cpu, num_points_per_voxel_cpu,
voxel_size_v_cpu, coors_range_v_cpu, max_points, max_voxels, NDim);
  } else {
    TORCH_CHECK(
        deterministic,
        "nondeterministic hard_voxelize_forward is not supported on host!");
  }
voxels.copy_(voxels_cpu);
coors.copy_(coors_cpu);
num_points_per_voxel.copy_(num_points_per_voxel_cpu);
voxel_num.copy_(voxel_num_cpu);
return;
}
void dynamic_voxelize_forward_diopi(const at::Tensor &points,
const at::Tensor &voxel_size,
const at::Tensor &coors_range,
at::Tensor &coors, const int NDim = 3) {
auto points_p = toDiopiTensorHandle(points);
diopiDevice_t device;
diopiGetTensorDevice(points_p, &device);
if (device == diopi_host) {
std::vector<float> voxel_size_v(
voxel_size.data_ptr<float>(),
voxel_size.data_ptr<float>() + voxel_size.numel());
std::vector<float> coors_range_v(
coors_range.data_ptr<float>(),
coors_range.data_ptr<float>() + coors_range.numel());
dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v,
NDim);
return;
}
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
auto voxel_size_p = toDiopiTensorHandle(voxel_size);
auto coors_range_p = toDiopiTensorHandle(coors_range);
auto coors_p = toDiopiTensorHandle(coors);
if (reinterpret_cast<void *>(diopiDynamicVoxelizeMmcv) != nullptr) {
auto ret = diopiDynamicVoxelizeMmcv(ch, coors_p, points_p, voxel_size_p,
coors_range_p, NDim);
if (ret == diopiSuccess) return;
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op dynamic_voxelize_forward";
auto points_cpu = points.cpu();
auto voxel_size_cpu = voxel_size.cpu();
auto coors_range_cpu = coors_range.cpu();
auto coors_cpu = coors.cpu();
std::vector<float> voxel_size_v_cpu(
voxel_size_cpu.data_ptr<float>(),
voxel_size_cpu.data_ptr<float>() + voxel_size_cpu.numel());
std::vector<float> coors_range_v_cpu(
coors_range_cpu.data_ptr<float>(),
coors_range_cpu.data_ptr<float>() + coors_range_cpu.numel());
dynamic_voxelize_forward_impl(points_cpu, coors_cpu, voxel_size_v_cpu,
coors_range_v_cpu, NDim);
coors.copy_(coors_cpu);
return;
}
#endif
void hard_voxelize_forward(const at::Tensor &points,
const at::Tensor &voxel_size,
const at::Tensor &coors_range, at::Tensor &voxels,
@@ -40,6 +176,11 @@ void hard_voxelize_forward(const at::Tensor &points,
at::Tensor &voxel_num, const int max_points,
const int max_voxels, const int NDim = 3,
const bool deterministic = true) {
#ifdef MMCV_WITH_DIOPI
hard_voxelize_forward_diopi(points, voxel_size, coors_range, voxels, coors,
num_points_per_voxel, voxel_num, max_points,
max_voxels, NDim, deterministic);
#else
int64_t *voxel_num_data = voxel_num.data_ptr<int64_t>();
std::vector<float> voxel_size_v(
voxel_size.data_ptr<float>(),
@@ -57,12 +198,16 @@ void hard_voxelize_forward(const at::Tensor &points,
points, voxels, coors, num_points_per_voxel, voxel_size_v,
coors_range_v, max_points, max_voxels, NDim);
}
#endif
}
void dynamic_voxelize_forward(const at::Tensor &points,
const at::Tensor &voxel_size,
const at::Tensor &coors_range, at::Tensor &coors,
const int NDim = 3) {
#ifdef MMCV_WITH_DIOPI
dynamic_voxelize_forward_diopi(points, voxel_size, coors_range, coors, NDim);
#else
std::vector<float> voxel_size_v(
voxel_size.data_ptr<float>(),
voxel_size.data_ptr<float>() + voxel_size.numel());
@@ -71,4 +216,5 @@ void dynamic_voxelize_forward(const at::Tensor &points,
coors_range.data_ptr<float>() + coors_range.numel());
dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v,
NDim);
#endif
}
@@ -210,6 +210,8 @@ def get_extensions():
extra_compile_args['cxx'] = ['/std:c++14']
include_dirs = []
library_dirs = []
libraries = []
extra_objects = []
extra_link_args = []
@@ -221,7 +223,34 @@ def get_extensions():
except ImportError:
pass
if os.getenv('MMCV_WITH_DIOPI', '0') == '1':
import mmengine # NOQA: F401
from mmengine.utils.version_utils import digit_version
assert digit_version(mmengine.__version__) >= digit_version(
'0.7.4'), f'mmengine >= 0.7.4 is required \
but {mmengine.__version__} is installed'
print(f'Compiling {ext_name} with CPU and DIPU')
define_macros += [('MMCV_WITH_DIOPI', None)]
define_macros += [('DIOPI_ATTR_WEAK', None)]
op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp')
extension = CppExtension
include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
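        # DIPU/DIOPI locations are taken from the environment; DIPU_ROOT must
        # point at the directory that contains libtorch_dipu.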
dipu_root = os.getenv('DIPU_ROOT')
diopi_path = os.getenv('DIOPI_PATH')
dipu_path = os.getenv('DIPU_PATH')
vendor_include_dirs = os.getenv('VENDOR_INCLUDE_DIRS')
nccl_include_dirs = os.getenv('NCCL_INCLUDE_DIRS')
include_dirs.append(dipu_root)
include_dirs.append(diopi_path + '/include')
include_dirs.append(dipu_path + '/dist/include')
include_dirs.append(vendor_include_dirs)
if nccl_include_dirs:
include_dirs.append(nccl_include_dirs)
library_dirs += [dipu_root]
libraries += ['torch_dipu']
elif is_rocm_pytorch or torch.cuda.is_available() or os.getenv(
'FORCE_CUDA', '0') == '1':
if is_rocm_pytorch:
define_macros += [('MMCV_WITH_HIP', None)]
@@ -398,6 +427,8 @@ def get_extensions():
define_macros=define_macros,
extra_objects=extra_objects,
extra_compile_args=extra_compile_args,
library_dirs=library_dirs,
libraries=libraries,
extra_link_args=extra_link_args)
extensions.append(ext_ops)
return extensions
......
@@ -7,6 +7,8 @@ import torch
from mmengine.utils import digit_version
from mmengine.utils.dl_utils import TORCH_VERSION
from mmcv.utils import IS_CUDA_AVAILABLE
try:
# If PyTorch version >= 1.6.0 and fp16 is enabled, torch.cuda.amp.autocast
# would be imported and used; we should test if our modules support it.
@@ -111,13 +113,28 @@ class TestMdconv:
assert numpy.allclose(dcn.conv_offset.bias.grad.cpu().detach().numpy(),
dcn_offset_b_grad, 1e-2)
@pytest.mark.parametrize('device', [
'cpu',
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
])
def test_mdconv_float(self, device):
self._test_mdconv(dtype=torch.float, device=device)
@pytest.mark.parametrize('device', [
'cpu',
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
])
def test_mdconv_double(self, device):
self._test_mdconv(dtype=torch.double, device=device)
def test_mdconv_half(self):
self._test_mdconv(torch.half)
# test amp when torch version >= '1.6.0', the type of
# input data for mdconv might be torch.float or torch.half
if (TORCH_VERSION != 'parrots'
......
@@ -93,15 +93,7 @@ def _test_roialign_allclose(device, dtype):
x.grad.data.type(torch.float).cpu().numpy(), np_grad, atol=1e-3)
@pytest.mark.parametrize('dtype', [torch.float, torch.half])
@pytest.mark.parametrize('device', [
'cpu',
pytest.param(
@@ -117,8 +109,17 @@ def _test_roialign_allclose(device, dtype):
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_roialign_float(device, dtype):
_test_roialign_allclose(device=device, dtype=dtype)
@pytest.mark.parametrize('device', [
'cpu',
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
])
def test_roialign_float64(device):
_test_roialign_allclose(device=device, dtype=torch.double)
_test_roialign_gradcheck(device=device, dtype=torch.double)
@@ -139,12 +139,20 @@ def test_voxelization_nondeterministic():
assert len(coors_set) == len(coors) == len(coors_all_set)
@pytest.mark.parametrize(
'device_type',
[
pytest.param(
# this is only used for dipu device testing case.
# dipu will mock to cuda automatically on mlu physical device.
'cuda:0',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
pytest.param(
'mlu',
marks=pytest.mark.skipif(
not IS_MLU_AVAILABLE, reason='requires MLU support'))
])
def test_voxelization_mlu(device_type):
voxel_size = [0.5, 0.5, 0.5]
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
......