Commit 0e2f8a5c authored by limm

add v2.2.0

parent 2754cb11
......@@ -5,9 +5,13 @@
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include <torch/csrc/utils/pybind.h>
#include "csrc_dipu/diopirt/diopirt_impl.h"
#include "csrc_dipu/runtime/device/deviceapis.h"
#include "csrc_dipu/utils/helpfunc.hpp"
using dipu::VENDOR_TYPE;
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
......@@ -57,9 +61,16 @@ void sigmoid_focal_loss_forward_diopi(Tensor input, Tensor target,
auto weight_p = toDiopiTensorHandle(weight);
auto output_p = toDiopiTensorHandle(output);
if (reinterpret_cast<void *>(diopiSigmoidFocalLossMmcv) != nullptr) {
if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
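// On the NPU backend, release the Python GIL for the duration of the DIOPI call
// (presumably so a long or blocking device-side kernel cannot stall Python threads);
// the non-NPU branch below keeps the GIL held.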
pybind11::gil_scoped_release no_gil;
auto ret = diopiSigmoidFocalLossMmcv(ch, output_p, input_p, target_p,
weight_p, gamma, alpha);
if (ret == diopiSuccess) return;
} else {
auto ret = diopiSigmoidFocalLossMmcv(ch, output_p, input_p, target_p,
weight_p, gamma, alpha);
if (ret == diopiSuccess) return;
}
}
LOG(WARNING)
<< "Fallback to cpu: mmcv ext op sigmoid_focal_loss_forward_impl";
......@@ -90,9 +101,16 @@ void sigmoid_focal_loss_backward_diopi(Tensor input, Tensor target,
auto weight_p = toDiopiTensorHandle(weight);
auto grad_input_p = toDiopiTensorHandle(grad_input);
if (reinterpret_cast<void *>(diopiSigmoidFocalLossBackwardMmcv) != nullptr) {
if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
pybind11::gil_scoped_release no_gil;
auto ret = diopiSigmoidFocalLossBackwardMmcv(
ch, grad_input_p, input_p, target_p, weight_p, gamma, alpha);
if (ret == diopiSuccess) return;
} else {
auto ret = diopiSigmoidFocalLossBackwardMmcv(
ch, grad_input_p, input_p, target_p, weight_p, gamma, alpha);
if (ret == diopiSuccess) return;
}
}
LOG(WARNING)
<< "Fallback to cpu: mmcv ext op sigmoid_focal_loss_forward_impl";
......
......@@ -5,9 +5,13 @@
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include <torch/csrc/utils/pybind.h>
#include "csrc_dipu/diopirt/diopirt_impl.h"
#include "csrc_dipu/runtime/device/deviceapis.h"
#include "csrc_dipu/utils/helpfunc.hpp"
using dipu::VENDOR_TYPE;
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
......@@ -273,11 +277,20 @@ void modulated_deform_conv_forward_diopi(
auto output_p = toDiopiTensorHandle(output);
auto columns_p = toDiopiTensorHandle(columns);
if (reinterpret_cast<void*>(diopiModulatedDeformConvMmcv) != nullptr) {
if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
pybind11::gil_scoped_release no_gil;
auto ret = diopiModulatedDeformConvMmcv(
ch, output_p, columns_p, ones_p, input_p, weight_p, bias_p, offset_p,
mask_p, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, group, deformable_group, with_bias);
if (ret == diopiSuccess) return;
} else {
auto ret = diopiModulatedDeformConvMmcv(
ch, output_p, columns_p, ones_p, input_p, weight_p, bias_p, offset_p,
mask_p, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, group, deformable_group, with_bias);
if (ret == diopiSuccess) return;
}
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op modulated_deform_conv_forward";
auto input_cpu = input.cpu();
......@@ -331,13 +344,25 @@ void modulated_deform_conv_backward_diopi(
if (reinterpret_cast<void*>(diopiModulatedDeformConvBackwardMmcv) !=
nullptr) {
if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
pybind11::gil_scoped_release no_gil;
auto ret = diopiModulatedDeformConvBackwardMmcv(
ch, grad_input_p, grad_weight_p, grad_bias_p, grad_offset_p,
grad_mask_p, input_p, weight_p, bias_p, ones_p, offset_p, mask_p,
columns_p, grad_output_p, kernel_h, kernel_w, stride_h, stride_w, pad_h,
pad_w, dilation_h, dilation_w, group, deformable_group, with_bias);
columns_p, grad_output_p, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group, deformable_group,
with_bias);
if (ret == diopiSuccess) return;
} else {
auto ret = diopiModulatedDeformConvBackwardMmcv(
ch, grad_input_p, grad_weight_p, grad_bias_p, grad_offset_p,
grad_mask_p, input_p, weight_p, bias_p, ones_p, offset_p, mask_p,
columns_p, grad_output_p, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group, deformable_group,
with_bias);
if (ret == diopiSuccess) return;
}
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op modulated_deform_conv_forward";
auto input_cpu = input.cpu();
auto weight_cpu = weight.cpu();
......
......@@ -5,10 +5,14 @@
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include <torch/csrc/utils/pybind.h>
#include "csrc_dipu/base/basedef.h"
#include "csrc_dipu/diopirt/diopirt_impl.h"
#include "csrc_dipu/runtime/device/deviceapis.h"
#include "csrc_dipu/utils/helpfunc.hpp"
using dipu::VENDOR_TYPE;
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
......@@ -45,12 +49,22 @@ Tensor nms_diopi(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
auto scores_p = toDiopiTensorHandle(scores);
bool is_mock_cuda = boxes.device().type() == dipu::DIPU_DEVICE_TYPE;
if (is_mock_cuda && reinterpret_cast<void*>(diopiNmsMmcv) != nullptr) {
if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
pybind11::gil_scoped_release no_gil;
auto ret =
diopiNmsMmcv(ch, outhandle, boxes_p, scores_p, iou_threshold, offset);
if (ret == diopiSuccess) {
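// The DIOPI op allocates the result and returns it through an opaque handle;
// cast it back to an at::Tensor before returning.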
auto tensorhandle = reinterpret_cast<Tensor*>(*outhandle);
return *tensorhandle;
}
} else {
auto ret =
diopiNmsMmcv(ch, outhandle, boxes_p, scores_p, iou_threshold, offset);
if (ret == diopiSuccess) {
auto tensorhandle = reinterpret_cast<Tensor*>(*outhandle);
return *tensorhandle;
}
}
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op nms";
auto boxes_cpu = boxes.cpu();
......
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
void ball_query_forward_npu(int b, int n, int m, float min_radius,
float max_radius, int nsample, const Tensor new_xyz,
const Tensor xyz, Tensor idx) {
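// OpCommand integer attributes take int64_t, so widen nsample before passing it as "sample_num".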
int64_t nsample_i64 = nsample;
// transpose new_xyz from [B, M, 3] to [M, B, 3]
at::Tensor new_xyz_transpose = new_xyz.transpose(0, 1);
// transpose xyz from [B, N, 3] to [B, 3, N]
at::Tensor xyz_transpose = xyz.transpose(1, 2);
// transpose idx from [B, M, nsample] to [M, B, nsample]
at::Tensor idx_transpose = idx.transpose(0, 1).contiguous();
OpCommand cmd;
cmd.Name("BallQuery")
.Input(xyz_transpose)
.Input(new_xyz_transpose)
.Output(idx_transpose)
.Attr("min_radius", min_radius)
.Attr("max_radius", max_radius)
.Attr("sample_num", nsample_i64)
.Run();
idx_transpose = idx_transpose.transpose(0, 1).contiguous();
idx.copy_(idx_transpose);
}
void ball_query_forward_impl(int b, int n, int m, float min_radius,
float max_radius, int nsample,
const Tensor new_xyz, const Tensor xyz,
Tensor idx);
REGISTER_NPU_IMPL(ball_query_forward_impl, ball_query_forward_npu);
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1,
Tensor dist2, Tensor idx1, Tensor idx2) {
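// Rearrange both point sets from an assumed (B, N, 3) layout to (3, B, N)
// before feeding them to the ChamferDistance op.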
at::Tensor xyz1 = at::ones_like(XYZ1);
at::Tensor xyz2 = at::ones_like(XYZ2);
xyz1 = XYZ1.transpose(1, 2).transpose(0, 1);
xyz2 = XYZ2.transpose(1, 2).transpose(0, 1);
OpCommand cmd;
cmd.Name("ChamferDistance")
.Input(xyz1)
.Input(xyz2)
.Output(dist1)
.Output(dist2)
.Output(idx1)
.Output(idx2)
.Run();
}
void chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1,
Tensor idx2, Tensor grad_dist1,
Tensor grad_dist2, Tensor grad_xyz1,
Tensor grad_xyz2) {
EXEC_NPU_CMD(aclnnChamferDistanceBackward, xyz1, xyz2, idx1, idx2, grad_dist1,
grad_dist2, grad_xyz1, grad_xyz2);
}
void chamfer_distance_forward_impl(Tensor XYZ1, Tensor XYZ2, Tensor dist1,
Tensor dist2, Tensor idx1, Tensor idx2);
REGISTER_NPU_IMPL(chamfer_distance_forward_impl, chamfer_distance_forward_npu);
void chamfer_distance_backward_impl(Tensor xyz1, Tensor xyz2, Tensor idx1,
Tensor idx2, Tensor grad_dist1,
Tensor grad_dist2, Tensor grad_xyz1,
Tensor grad_xyz2);
REGISTER_NPU_IMPL(chamfer_distance_backward_impl,
chamfer_distance_backward_npu);
#ifndef MMCV_OPS_CSRC_COMMON__UTIL_HPP_
#define MMCV_OPS_CSRC_COMMON__UTIL_HPP_
#include <c10/util/ArrayRef.h>
#include <c10/util/SmallVector.h>
const int SIZE = 8;
// Copy an IntArrayRef shape into a fixed-capacity SmallVector.
inline c10::SmallVector<int64_t, SIZE> array_to_vector(c10::IntArrayRef shape) {
  c10::SmallVector<int64_t, SIZE> shape_small_vec;
  for (uint64_t i = 0; i < shape.size(); i++) {
    shape_small_vec.emplace_back(shape[i]);
  }
  return shape_small_vec;
}
#endif  // MMCV_OPS_CSRC_COMMON__UTIL_HPP_
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
......@@ -100,7 +99,22 @@ void softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
c10::SmallVector<int64_t, 2> sizes = {n_batch, 1};
at::IntArrayRef offset = at::IntArrayRef(offsets);
at::IntArrayRef size = at::IntArrayRef(sizes);
at_npu::native::custom_ops::npu_slice_out(op_output, offset, size, output);
at::IntArrayRef size_array = at::IntArrayRef(sizes);
c10::SmallVector<int64_t, 8> offsetVec;
for (uint64_t i = 0; i < offset.size(); i++) {
offsetVec.emplace_back(offset[i]);
}
c10::SmallVector<int64_t, 8> sizeVec;
for (uint64_t i = 0; i < size_array.size(); i++) {
sizeVec.emplace_back(size_array[i]);
}
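// The NPU Slice op takes begin offsets and sizes as host-side integer vectors;
// the loops above copy them out of the IntArrayRefs.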
OpCommand cmd2;
cmd2.Name("Slice")
.Input(op_output)
.Input(offsetVec)
.Input(sizeVec)
.Output(output)
.Run();
}
void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
......
......@@ -16,7 +16,9 @@ Tensor fused_bias_leakyrelu_npu(const Tensor &input, const Tensor &bias,
auto input_size = input.sizes();
int input_length = input_size.size();
c10::SmallVector<int64_t, SIZE> input_size_tmp;
input_size_tmp = array_to_small_vector(input_size);
for (uint64_t i = 0; i < input_size.size(); i++) {
input_size_tmp.emplace_back(input_size[i]);
}
if (input_length > 1) {
for (int i = 0; i < input_length; i++) {
if (i != 1) {
......
......@@ -32,7 +32,11 @@ void gather_points_backward_npu(int b, int c, int n, int npoints,
indices.unsqueeze_(0);
}
int64_t dim = 0;
at::SmallVector<int64_t, N> pad_size = array_to_small_vector(idx.sizes());
auto shape = idx.sizes();
c10::SmallVector<int64_t, 8> pad_size;
for (uint64_t i = 0; i < shape.size(); i++) {
pad_size.emplace_back(shape[i]);
}
at::Tensor trans_grad_points = grad_points.transpose(1, 2).contiguous();
at::Tensor grad_points_view = trans_grad_points.view(
{trans_grad_points.sizes()[0] * trans_grad_points.sizes()[1],
......
......@@ -20,7 +20,7 @@ void group_points_forward_npu(int b, int c, int n, int npoints, int nsample,
indices = indices.view({-1});
at::Tensor trans_features = points.transpose(1, 2);
at::Tensor features = NpuUtils::format_contiguous(trans_features);
at::Tensor features = trans_features.contiguous();
features = features.view({b * n, c});
OpCommand cmd;
......@@ -34,7 +34,7 @@ void group_points_forward_npu(int b, int c, int n, int npoints, int nsample,
at::Tensor output =
out.view({b, npoints, nsample, c}).transpose(1, 3).transpose(2, 3);
at::Tensor res = NpuUtils::format_contiguous(output);
at::Tensor res = output.contiguous();
out.copy_(res);
}
......
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
Tensor ms_deform_attn_impl_forward(const Tensor &value,
const Tensor &value_spatial_shapes,
const Tensor &value_level_start_index,
const Tensor &sampling_locations,
const Tensor &attention_weights,
const int im2col_step);
void check_support(const Tensor &value, const Tensor &attention_weights) {
TORCH_CHECK(
(value.scalar_type() == at::kFloat || value.scalar_type() == at::kHalf),
"Dtype of value should be float32 or float16.");
int64_t num_heads = value.size(2);
int64_t embed_dims = value.size(3);
int64_t num_points = attention_weights.size(4);
TORCH_CHECK((num_heads >= 4 && num_heads <= 8),
"num_heads should be in the range of [4, 8]");
TORCH_CHECK((embed_dims >= 32 && embed_dims <= 256),
"embed_dims should be in the range of [32, 256]");
TORCH_CHECK((num_points >= 4 && num_points <= 8),
"num_points should be in the range of [4, 8]");
}
Tensor ms_deform_attn_forward_npu(const Tensor &value,
const Tensor &value_spatial_shapes,
const Tensor &value_level_start_index,
const Tensor &sampling_locations,
const Tensor &attention_weights,
const int im2col_step) {
check_support(value, attention_weights);
at::Tensor value_fp32 = value;
at::Tensor value_spatial_shapes_int32 = value_spatial_shapes;
at::Tensor value_level_start_index_int32 = value_level_start_index;
at::Tensor sampling_locations_fp32 = sampling_locations;
at::Tensor attention_weights_fp32 = attention_weights;
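// The MultiScaleDeformableAttnFunction kernel is fed float32 value/location/weight
// tensors and int32 index tensors, so cast any other dtypes first.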
if (value.scalar_type() != at::kFloat) {
value_fp32 = value.to(at::kFloat);
}
if (value_spatial_shapes.scalar_type() != at::kInt) {
value_spatial_shapes_int32 = value_spatial_shapes.to(at::kInt);
}
if (value_level_start_index.scalar_type() != at::kInt) {
value_level_start_index_int32 = value_level_start_index.to(at::kInt);
}
if (sampling_locations.scalar_type() != at::kFloat) {
sampling_locations_fp32 = sampling_locations.to(at::kFloat);
}
if (attention_weights.scalar_type() != at::kFloat) {
attention_weights_fp32 = attention_weights.to(at::kFloat);
}
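// Output is (batch, num_queries, num_heads * embed_dims): value is laid out as
// (B, num_keys, num_heads, embed_dims) and sampling_locations.size(1) is the query count.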
c10::SmallVector<int64_t, 3> output_size = {
value.size(0), sampling_locations.size(1), value.size(2) * value.size(3)};
at::Tensor output = at::zeros(output_size, value_fp32.options());
OpCommand cmd;
cmd.Name("MultiScaleDeformableAttnFunction")
.Input(value_fp32)
.Input(value_spatial_shapes_int32)
.Input(value_level_start_index_int32)
.Input(sampling_locations_fp32)
.Input(attention_weights_fp32)
.Output(output)
.Run();
at::Tensor real_output = output;
if (value.scalar_type() != at::kFloat) {
real_output = output.to(value.scalar_type());
}
return real_output;
}
REGISTER_NPU_IMPL(ms_deform_attn_impl_forward, ms_deform_attn_forward_npu);
void ms_deform_attn_impl_backward(
const Tensor &value, const Tensor &spatial_shapes,
const Tensor &level_start_index, const Tensor &sampling_loc,
const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
const int im2col_step);
void ms_deform_attn_backward_npu(const Tensor &value, const Tensor &spatial_shapes,
const Tensor &level_start_index,
const Tensor &sampling_loc,
const Tensor &attn_weight,
const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc,
Tensor &grad_attn_weight, const int im2col_step) {
check_support(value, attn_weight);
at::Tensor value_fp32 = value;
at::Tensor spatial_shapes_int32 = spatial_shapes;
at::Tensor level_start_index_int32 = level_start_index;
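// The grad kernel consumes sampling_loc with its last two dims swapped; transpose it here
// and transpose the resulting gradient back after the run (see the end of this function).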
at::Tensor sampling_loc_fp32 = sampling_loc.transpose(4, 5).contiguous();
at::Tensor attn_weight_fp32 = attn_weight;
at::Tensor grad_output_fp32 = grad_output;
if (value.scalar_type() != at::kFloat) {
value_fp32 = value.to(at::kFloat);
}
if (spatial_shapes.scalar_type() != at::kInt) {
spatial_shapes_int32 = spatial_shapes.to(at::kInt);
}
if (level_start_index.scalar_type() != at::kInt) {
level_start_index_int32 = level_start_index.to(at::kInt);
}
if (sampling_loc.scalar_type() != at::kFloat) {
sampling_loc_fp32 = sampling_loc_fp32.to(at::kFloat);
}
if (attn_weight.scalar_type() != at::kFloat) {
attn_weight_fp32 = attn_weight.to(at::kFloat);
}
if (grad_output.scalar_type() != at::kFloat) {
grad_output_fp32 = grad_output.to(at::kFloat);
}
OpCommand cmd;
cmd.Name("MultiScaleDeformableAttentionGrad")
.Input(value_fp32)
.Input(spatial_shapes_int32)
.Input(level_start_index_int32)
.Input(sampling_loc_fp32)
.Input(attn_weight_fp32)
.Input(grad_output_fp32)
.Output(grad_value)
.Output(grad_sampling_loc)
.Output(grad_attn_weight)
.Run();
grad_sampling_loc = grad_sampling_loc.transpose(4, 5).contiguous();
}
REGISTER_NPU_IMPL(ms_deform_attn_impl_backward, ms_deform_attn_backward_npu);
......@@ -12,7 +12,7 @@ void points_in_polygons_npu(const Tensor points, Tensor polygons, Tensor output,
"The batch of polygons tensor must be less than MAX_POLYGONS_BATCH");
at::Tensor trans_polygons = polygons.transpose(0, 1);
OpCommand cmd;
at::Tensor new_trans_polygons = NpuUtils::format_contiguous(trans_polygons);
at::Tensor new_trans_polygons = trans_polygons.contiguous();
cmd.Name("PointsInPolygons")
.Input(points, (string) "points")
.Input(new_trans_polygons, (string) "polygons")
......
......@@ -41,8 +41,11 @@ void roi_align_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax_y,
LOG(WARNING) << "The [aligned] attr in roi_align_grad op is false";
roi_end_mode = 0;
}
c10::SmallVector<int64_t, SIZE> xdiff_shape =
array_to_small_vector(grad_input.sizes());
auto shape = grad_input.sizes();
c10::SmallVector<int64_t, SIZE> xdiff_shape;
for (uint64_t i = 0; i < shape.size(); i++) {
xdiff_shape.emplace_back(shape[i]);
}
OpCommand cmd;
cmd.Name("ROIAlignGrad")
.Input(grad_output)
......
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
void roi_align_rotated_forward_npu(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
int64_t aligned_height_64 = aligned_height;
int64_t aligned_width_64 = aligned_width;
int64_t sampling_ratio_64 = sampling_ratio;
OpCommand cmd;
cmd.Name("RoiAlignRotated")
.Input(input)
.Input(rois)
.Output(output)
.Attr("pooled_h", aligned_height_64)
.Attr("pooled_w", aligned_width_64)
.Attr("spatial_scale", spatial_scale)
.Attr("sampling_ratio", sampling_ratio_64)
.Attr("aligned", aligned)
.Attr("clockwise", clockwise)
.Run();
}
void roi_align_rotated_backward_npu(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
int64_t aligned_height_64 = aligned_height;
int64_t aligned_width_64 = aligned_width;
int64_t sampling_ratio_64 = sampling_ratio;
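// Record bottom_grad's full shape; it is passed to RoiAlignRotatedGrad as the
// y_grad_shape attribute.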
c10::SmallVector<int64_t, SIZE> y_grad_shape;
auto shape = bottom_grad.sizes();
for (uint64_t i = 0; i < shape.size(); i++) {
y_grad_shape.emplace_back(shape[i]);
}
OpCommand cmd;
cmd.Name("RoiAlignRotatedGrad")
.Input(top_grad)
.Input(rois)
.Output(bottom_grad)
.Attr("y_grad_shape", y_grad_shape)
.Attr("pooled_h", aligned_width_64)
.Attr("pooled_w", aligned_height_64)
.Attr("spatial_scale", spatial_scale)
.Attr("sampling_ratio", sampling_ratio_64)
.Attr("aligned", aligned)
.Attr("clockwise", clockwise)
.Run();
}
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise);
REGISTER_NPU_IMPL(roi_align_rotated_forward_impl,
roi_align_rotated_forward_npu);
REGISTER_NPU_IMPL(roi_align_rotated_backward_impl,
roi_align_rotated_backward_npu);
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
void rotated_feature_align_forward_impl(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor output);
void rotated_feature_align_backward_impl(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor bottom_grad);
void rotated_feature_align_forward_npu(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor output) {
int64_t points_ = (int64_t)points;
at::Tensor best_bboxes_ = best_bboxes.transpose(2, 3).transpose(1, 2);
OpCommand cmd;
cmd.Name("RotatedFeatureAlign")
.Input(features)
.Input(best_bboxes_)
.Output(output)
.Attr("spatial_scale", spatial_scale)
.Attr("points", points_)
.Run();
}
void rotated_feature_align_backward_npu(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor bottom_grad) {
int64_t points_ = (int64_t)points;
at::Tensor best_bboxes_ = best_bboxes.transpose(2, 3).transpose(1, 2);
OpCommand cmd;
cmd.Name("RotatedFeatureAlignGrad")
.Input(top_grad)
.Input(best_bboxes_)
.Output(bottom_grad)
.Attr("spatial_scale", spatial_scale)
.Attr("points", points_)
.Run();
}
REGISTER_NPU_IMPL(rotated_feature_align_forward_impl,
rotated_feature_align_forward_npu);
REGISTER_NPU_IMPL(rotated_feature_align_backward_impl,
rotated_feature_align_backward_npu);
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
void stack_ball_query_forward_npu(float max_radius, int nsample,
const Tensor new_xyz,
const Tensor new_xyz_batch_cnt,
const Tensor xyz, const Tensor xyz_batch_cnt,
Tensor idx) {
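// The stacked xyz tensor (assumed (total_points, 3)) is handed to the aclnn kernel
// transposed to (3, total_points), and max_radius is widened to double.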
at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous();
double max_radius_double = double(max_radius);
EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz, xyz_batch_cnt,
new_xyz_batch_cnt, max_radius_double, nsample, idx);
}
void stack_ball_query_forward_impl(float max_radius, int nsample,
const Tensor new_xyz,
const Tensor new_xyz_batch_cnt,
const Tensor xyz, const Tensor xyz_batch_cnt,
Tensor idx);
REGISTER_NPU_IMPL(stack_ball_query_forward_impl, stack_ball_query_forward_npu);
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
void stack_group_points_forward_npu(int b, int c, int n, int nsample,
const Tensor features_tensor,
const Tensor features_batch_cnt_tensor,
const Tensor idx_tensor,
const Tensor idx_batch_cnt_tensor,
Tensor out_tensor) {
EXEC_NPU_CMD(aclnnStackGroupPoints, features_tensor,
features_batch_cnt_tensor, idx_tensor, idx_batch_cnt_tensor,
out_tensor);
}
void stack_group_points_forward_impl(int b, int c, int n, int nsample,
const Tensor features_tensor,
const Tensor features_batch_cnt_tensor,
const Tensor idx_tensor,
const Tensor idx_batch_cnt_tensor,
Tensor out_tensor);
REGISTER_NPU_IMPL(stack_group_points_forward_impl,
stack_group_points_forward_npu);
#include "pytorch_npu_helper.hpp"
#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
#include "torch_npu/csrc/framework/utils/OpAdapter.h"
using namespace NPU_NAME_SPACE;
using namespace std;
void three_interpolate_forward_npu(int b, int c, int m, int n,
const Tensor points, const Tensor idx,
const Tensor weight, Tensor out) {
auto originDtype = points.scalar_type();
TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf),
"three_interpolate_forward ascend only support fp32 and fp16.");
auto point_c_trans = points.transpose(1, 2);
OpCommand cmd;
cmd.Name("ThreeInterpolate")
.Input(point_c_trans)
.Input(idx)
.Input(weight)
.Output(out)
.Run();
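// The kernel appears to write (b, n, c)-ordered data; view it as such,
// transpose back to (b, c, n), and copy into out.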
auto output = out.view({b, n, c}).transpose(1, 2);
auto res = output.contiguous();
out.copy_(res);
}
void three_interpolate_backward_npu(int b, int c, int n, int m,
const Tensor grad_out, const Tensor idx,
const Tensor weight, Tensor grad_points) {
auto originDtype = grad_out.scalar_type();
TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf),
"three_interpolate_backward ascend only support fp32 and fp16.");
auto grad_x = at::unsqueeze(grad_out, 3);
auto grad_y = at::unsqueeze(grad_points, 3);
EXEC_NPU_CMD(aclnnThreeInterpolateBackward, grad_x, idx, weight, m, grad_y);
auto output = at::squeeze(grad_y, 3);
auto res = output.contiguous();
grad_points.copy_(res);
}
void three_interpolate_forward_impl(int b, int c, int m, int n,
const Tensor points, const Tensor idx,
const Tensor weight, Tensor out);
void three_interpolate_backward_impl(int b, int c, int n, int m,
const Tensor grad_out, const Tensor idx,
const Tensor weight, Tensor grad_points);
REGISTER_NPU_IMPL(three_interpolate_forward_impl,
three_interpolate_forward_npu);
REGISTER_NPU_IMPL(three_interpolate_backward_impl,
three_interpolate_backward_npu);
......@@ -5,10 +5,14 @@
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include <torch/csrc/utils/pybind.h>
#include "csrc_dipu/base/basedef.h"
#include "csrc_dipu/diopirt/diopirt_impl.h"
#include "csrc_dipu/runtime/device/deviceapis.h"
#include "csrc_dipu/utils/helpfunc.hpp"
using dipu::VENDOR_TYPE;
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
......@@ -56,10 +60,18 @@ void roi_align_forward_diopi(Tensor input, Tensor rois, Tensor output,
auto argmax_x_p = toDiopiTensorHandle(argmax_x);
bool is_mock_cuda = input.device().type() == dipu::DIPU_DEVICE_TYPE;
if (is_mock_cuda && reinterpret_cast<void *>(diopiRoiAlignMmcv) != nullptr) {
if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
pybind11::gil_scoped_release no_gil;
auto ret = diopiRoiAlignMmcv(
ch, out_p, argmax_y_p, argmax_x_p, input_p, rois_p, aligned_height,
aligned_width, sampling_ratio, pool_mode, spatial_scale, aligned);
if (ret == diopiSuccess) return;
} else {
auto ret = diopiRoiAlignMmcv(
ch, out_p, argmax_y_p, argmax_x_p, input_p, rois_p, aligned_height,
aligned_width, sampling_ratio, pool_mode, spatial_scale, aligned);
if (ret == diopiSuccess) return;
}
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op roi_align_forward";
auto input_cpu = input.cpu();
......@@ -96,11 +108,20 @@ void roi_align_backward_diopi(Tensor grad_output, Tensor rois, Tensor argmax_y,
bool is_mock_cuda = grad_output.device().type() == dipu::DIPU_DEVICE_TYPE;
if (is_mock_cuda &&
reinterpret_cast<void *>(diopiRoiAlignBackwardMmcv) != nullptr) {
if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
pybind11::gil_scoped_release no_gil;
auto ret = diopiRoiAlignBackwardMmcv(ch, grad_input_, grad_output_, rois_,
argmax_y_, argmax_x_, aligned_height,
aligned_width, sampling_ratio,
pool_mode, spatial_scale, aligned);
if (ret == diopiSuccess) return;
} else {
auto ret = diopiRoiAlignBackwardMmcv(ch, grad_input_, grad_output_, rois_,
argmax_y_, argmax_x_, aligned_height,
aligned_width, sampling_ratio,
pool_mode, spatial_scale, aligned);
if (ret == diopiSuccess) return;
}
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op roi_align_backward";
auto grad_output_cpu = grad_output.cpu();
......
......@@ -5,9 +5,13 @@
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include <torch/csrc/utils/pybind.h>
#include "csrc_dipu/diopirt/diopirt_impl.h"
#include "csrc_dipu/runtime/device/deviceapis.h"
#include "csrc_dipu/utils/helpfunc.hpp"
using dipu::VENDOR_TYPE;
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
......@@ -84,12 +88,21 @@ void hard_voxelize_forward_diopi(const at::Tensor &points,
auto num_points_per_voxel_p = toDiopiTensorHandle(num_points_per_voxel);
auto voxel_num_p = toDiopiTensorHandle(voxel_num);
if (reinterpret_cast<void *>(diopiHardVoxelizeMmcv) != nullptr) {
if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
pybind11::gil_scoped_release no_gil;
auto ret = diopiHardVoxelizeMmcv(
ch, voxels_p, coors_p, num_points_per_voxel_p, voxel_num_p, points_p,
voxel_size_p, coors_range_p, max_points, max_voxels, NDim,
deterministic);
if (ret == diopiSuccess) return;
} else {
auto ret = diopiHardVoxelizeMmcv(
ch, voxels_p, coors_p, num_points_per_voxel_p, voxel_num_p, points_p,
voxel_size_p, coors_range_p, max_points, max_voxels, NDim,
deterministic);
if (ret == diopiSuccess) return;
}
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op hard_voxelize_forward";
auto points_cpu = points.cpu();
auto voxel_size_cpu = voxel_size.cpu();
......@@ -146,10 +159,17 @@ void dynamic_voxelize_forward_diopi(const at::Tensor &points,
auto coors_range_p = toDiopiTensorHandle(coors_range);
auto coors_p = toDiopiTensorHandle(coors);
if (reinterpret_cast<void *>(diopiDynamicVoxelizeMmcv) != nullptr) {
if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), "NPU") == 0) {
pybind11::gil_scoped_release no_gil;
auto ret = diopiDynamicVoxelizeMmcv(ch, coors_p, points_p, voxel_size_p,
coors_range_p, NDim);
if (ret == diopiSuccess) return;
} else {
auto ret = diopiDynamicVoxelizeMmcv(ch, coors_p, points_p, voxel_size_p,
coors_range_p, NDim);
if (ret == diopiSuccess) return;
}
}
LOG(WARNING) << "Fallback to cpu: mmcv ext op dynamic_voxelize_forward";
auto points_cpu = points.cpu();
auto voxel_size_cpu = voxel_size.cpu();
......