Unverified Commit 34bdf448 authored by ckirchhoff's avatar ckirchhoff Committed by GitHub
Browse files

[Feature] Pick npu ops from master to 2.x (#2501)

* merge npu ops from master to 2.x

* BugFix: fix merge bugs

* {[Feature]: add psamask, roipool to 2.x, and fix the SigmoidFocalLoss assert condition

* merge conflicts in ops.md

* [fix]: fix merge bug
parent 7156604e
......@@ -2,62 +2,62 @@
We implement common ops used in detection, segmentation, etc.
| Device | CPU | CUDA | MLU | MPS |
| ---------------------------- | --- | ---- | --- | --- |
| ActiveRotatedFilter | √ | √ | | |
| AssignScoreWithK | | √ | | |
| BallQuery | | √ | | |
| BBoxOverlaps | | √ | √ | √ |
| BorderAlign | | √ | | |
| BoxIouRotated | √ | √ | | |
| BoxIouQuadri | √ | √ | | |
| CARAFE | | √ | √ | |
| ChamferDistance | | √ | | |
| CrissCrossAttention | | √ | | |
| ContourExpand | √ | | | |
| ConvexIoU | | √ | | |
| CornerPool | | √ | | |
| Correlation | | √ | | |
| Deformable Convolution v1/v2 | √ | √ | | |
| Deformable RoIPool | | √ | √ | |
| DiffIoURotated | | √ | | |
| DynamicScatter | | √ | | |
| FurthestPointSample | | √ | | |
| FurthestPointSampleWithDist | | √ | | |
| FusedBiasLeakyrelu | | √ | | |
| GatherPoints | | √ | | |
| GroupPoints | | √ | | |
| Iou3d | | √ | √ | |
| KNN | | √ | | |
| MaskedConv | | √ | √ | |
| MergeCells | | √ | | |
| MinAreaPolygon | | √ | | |
| ModulatedDeformConv2d | √ | √ | | |
| MultiScaleDeformableAttn | | √ | √ | |
| NMS | √ | √ | √ | |
| NMSRotated | √ | √ | | |
| NMSQuadri | √ | √ | | |
| PixelGroup | √ | | | |
| PointsInBoxes | √ | √ | | |
| PointsInPolygons | | √ | | |
| PSAMask | √ | √ | √ | |
| RotatedFeatureAlign | √ | √ | | |
| RoIPointPool3d | | √ | √ | |
| RoIPool | | √ | √ | |
| RoIAlignRotated | √ | √ | √ | |
| RiRoIAlignRotated | | √ | | |
| RoIAlign | √ | √ | √ | |
| RoIAwarePool3d | | √ | √ | |
| SAConv2d | | √ | | |
| SigmoidFocalLoss | | √ | √ | |
| SoftmaxFocalLoss | | √ | | |
| SoftNMS | | √ | | |
| Sparse Convolution | | √ | | |
| Synchronized BatchNorm | | √ | | |
| ThreeInterpolate | | √ | | |
| ThreeNN | | √ | √ | |
| TINShift | | √ | √ | |
| UpFirDn2d | | √ | | |
| Voxelization | √ | √ | | |
| PrRoIPool | | √ | | |
| BezierAlign | √ | √ | | |
| Device | CPU | CUDA | MLU | MPS | Ascend |
| ---------------------------- | --- | ---- | --- | --- | ------ |
| ActiveRotatedFilter | √ | √ | | | |
| AssignScoreWithK | | √ | | | |
| BallQuery | | √ | | | |
| BBoxOverlaps | | √ | √ | √ | |
| BorderAlign | | √ | | | |
| BoxIouRotated | √ | √ | | | |
| BoxIouQuadri | √ | √ | | | |
| CARAFE | | √ | √ | | |
| ChamferDistance | | √ | | | |
| CrissCrossAttention | | √ | | | |
| ContourExpand | √ | | | | |
| ConvexIoU | | √ | | | |
| CornerPool | | √ | | | |
| Correlation | | √ | | | |
| Deformable Convolution v1/v2 | √ | √ | | | √ |
| Deformable RoIPool | | √ | √ | | √ |
| DiffIoURotated | | √ | | | |
| DynamicScatter | | √ | | | |
| FurthestPointSample | | √ | | | |
| FurthestPointSampleWithDist | | √ | | | |
| FusedBiasLeakyrelu | | √ | | | √ |
| GatherPoints | | √ | | | |
| GroupPoints | | √ | | | |
| Iou3d | | √ | √ | | |
| KNN | | √ | | | |
| MaskedConv | | √ | √ | | √ |
| MergeCells | | √ | | | |
| MinAreaPolygon | | √ | | | |
| ModulatedDeformConv2d | √ | √ | | | √ |
| MultiScaleDeformableAttn | | √ | √ | | |
| NMS | √ | √ | √ | | √ |
| NMSRotated | √ | √ | | | |
| NMSQuadri | √ | √ | | | |
| PixelGroup | √ | | | | |
| PointsInBoxes | √ | √ | | | |
| PointsInPolygons | | √ | | | |
| PSAMask | √ | √ | √ | | √ |
| RotatedFeatureAlign | √ | √ | | | |
| RoIPointPool3d | | √ | √ | | |
| RoIPool | | √ | √ | | √ |
| RoIAlignRotated | √ | √ | √ | | |
| RiRoIAlignRotated | | √ | | | |
| RoIAlign | √ | √ | √ | | |
| RoIAwarePool3d | | √ | √ | | |
| SAConv2d | | √ | | | |
| SigmoidFocalLoss | | √ | √ | | √ |
| SoftmaxFocalLoss | | √ | | | √ |
| SoftNMS | | √ | | | |
| Sparse Convolution | | √ | | | |
| Synchronized BatchNorm | | √ | | | |
| ThreeInterpolate | | √ | | | |
| ThreeNN | | √ | √ | | |
| TINShift | | √ | √ | | |
| UpFirDn2d | | √ | | | |
| Voxelization | √ | √ | | | |
| PrRoIPool | | √ | | | |
| BezierAlign | √ | √ | | | |
......@@ -2,62 +2,62 @@
MMCV 提供了检测、分割等任务中常用的算子
| Device | CPU | CUDA | MLU | MPS |
| ---------------------------- | --- | ---- | --- | --- |
| ActiveRotatedFilter | √ | √ | | |
| AssignScoreWithK | | √ | | |
| BallQuery | | √ | | |
| BBoxOverlaps | | √ | √ | √ |
| BorderAlign | | √ | | |
| BoxIouRotated | √ | √ | | |
| BoxIouQuadri | √ | √ | | |
| CARAFE | | √ | √ | |
| ChamferDistance | | √ | | |
| CrissCrossAttention | | √ | | |
| ContourExpand | √ | | | |
| ConvexIoU | | √ | | |
| CornerPool | | √ | | |
| Correlation | | √ | | |
| Deformable Convolution v1/v2 | √ | √ | | |
| Deformable RoIPool | | √ | √ | |
| DiffIoURotated | | √ | | |
| DynamicScatter | | √ | | |
| FurthestPointSample | | √ | | |
| FurthestPointSampleWithDist | | √ | | |
| FusedBiasLeakyrelu | | √ | | |
| GatherPoints | | √ | | |
| GroupPoints | | √ | | |
| Iou3d | | √ | √ | |
| KNN | | √ | | |
| MaskedConv | | √ | √ | |
| MergeCells | | √ | | |
| MinAreaPolygon | | √ | | |
| ModulatedDeformConv2d | √ | √ | | |
| MultiScaleDeformableAttn | | √ | √ | |
| NMS | √ | √ | √ | |
| NMSRotated | √ | √ | | |
| NMSQuadri | √ | √ | | |
| PixelGroup | √ | | | |
| PointsInBoxes | √ | √ | | |
| PointsInPolygons | | √ | | |
| PSAMask | √ | √ | √ | |
| RotatedFeatureAlign | √ | √ | | |
| RoIPointPool3d | | √ | √ | |
| RoIPool | | √ | √ | |
| RoIAlignRotated | √ | √ | √ | |
| RiRoIAlignRotated | | √ | | |
| RoIAlign | √ | √ | √ | |
| RoIAwarePool3d | | √ | √ | |
| SAConv2d | | √ | | |
| SigmoidFocalLoss | | √ | √ | |
| SoftmaxFocalLoss | | √ | | |
| SoftNMS | | √ | | |
| Sparse Convolution | | √ | | |
| Synchronized BatchNorm | | √ | | |
| ThreeInterpolate | | √ | | |
| ThreeNN | | √ | √ | |
| TINShift | | √ | √ | |
| UpFirDn2d | | √ | | |
| Voxelization | √ | √ | | |
| PrRoIPool | | √ | | |
| BezierAlign | √ | √ | | |
| Device | CPU | CUDA | MLU | MPS | Ascend |
| ---------------------------- | --- | ---- | --- | --- | ------ |
| ActiveRotatedFilter | √ | √ | | | |
| AssignScoreWithK | | √ | | | |
| BallQuery | | √ | | | |
| BBoxOverlaps | | √ | √ | √ | |
| BorderAlign | | √ | | | |
| BoxIouRotated | √ | √ | | | |
| BoxIouQuadri | √ | √ | | | |
| CARAFE | | √ | √ | | |
| ChamferDistance | | √ | | | |
| CrissCrossAttention | | √ | | | |
| ContourExpand | √ | | | | |
| ConvexIoU | | √ | | | |
| CornerPool | | √ | | | |
| Correlation | | √ | | | |
| Deformable Convolution v1/v2 | √ | √ | | | √ |
| Deformable RoIPool | | √ | √ | | √ |
| DiffIoURotated | | √ | | | |
| DynamicScatter | | √ | | | |
| FurthestPointSample | | √ | | | |
| FurthestPointSampleWithDist | | √ | | | |
| FusedBiasLeakyrelu | | √ | | | √ |
| GatherPoints | | √ | | | |
| GroupPoints | | √ | | | |
| Iou3d | | √ | √ | | |
| KNN | | √ | | | |
| MaskedConv | | √ | √ | | √ |
| MergeCells | | √ | | | |
| MinAreaPolygon | | √ | | | |
| ModulatedDeformConv2d | √ | √ | | | √ |
| MultiScaleDeformableAttn | | √ | √ | | |
| NMS | √ | √ | √ | | √ |
| NMSRotated | √ | √ | | | |
| NMSQuadri | √ | √ | | | |
| PixelGroup | √ | | | | |
| PointsInBoxes | √ | √ | | | |
| PointsInPolygons | | √ | | | |
| PSAMask | √ | √ | √ | | √ |
| RotatedFeatureAlign | √ | √ | | | |
| RoIPointPool3d | | √ | √ | | |
| RoIPool | | √ | √ | | √ |
| RoIAlignRotated | √ | √ | √ | | |
| RiRoIAlignRotated | | √ | | | |
| RoIAlign | √ | √ | √ | | |
| RoIAwarePool3d | | √ | √ | | |
| SAConv2d | | √ | | | |
| SigmoidFocalLoss | | √ | √ | | √ |
| SoftmaxFocalLoss | | √ | | | √ |
| SoftNMS | | √ | | | |
| Sparse Convolution | | √ | | | |
| Synchronized BatchNorm | | √ | | | |
| ThreeInterpolate | | √ | | | |
| ThreeNN | | √ | √ | | |
| TINShift | | √ | √ | | |
| UpFirDn2d | | √ | | | |
| Voxelization | √ | √ | | | |
| PrRoIPool | | √ | | | |
| BezierAlign | √ | √ | | | |
/******************************************************************************
* Copyright (c) 2022 Huawei Technologies Co., Ltd
* All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://opensource.org/licenses/BSD-3-Clause
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef PYTORCH_NPU_HELPER_HPP_
#define PYTORCH_NPU_HELPER_HPP_
#include <torch_npu/csrc/aten/NPUNativeFunctions.h>
#include <torch_npu/csrc/framework/utils/CalcuOpUtil.h>
#include <torch_npu/csrc/framework/utils/OpAdapter.h>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#define NPU_NAME_SPACE at_npu::native
#define REGISTER_NPU_IMPL(key, value) REGISTER_DEVICE_IMPL(key, XLA, value)
#define CHECK_NPU(x) \
TORCH_CHECK(x.device().type() == at::kXLA, #x " must be a NPU tensor")
#endif // PYTORCH_NPU_HELPER_HPP_
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma);
void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset,
int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma);
void deform_roi_pool_forward_npu(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) {
c10::SmallVector<int64_t, 2> output_sizes = {pooled_height, pooled_width};
at::IntArrayRef output_size = at::IntArrayRef(output_sizes);
int64_t sampling_ratio_ = (int64_t)sampling_ratio;
OpCommand cmd;
cmd.Name("DeformableRoiPool")
.Input(input)
.Input(rois)
.Input(offset)
.Output(output)
.Attr("spatial_scale", spatial_scale)
.Attr("output_size", output_size)
.Attr("sampling_ratio", sampling_ratio_)
.Attr("gamma", gamma)
.Run();
}
void deform_roi_pool_backward_npu(Tensor grad_output, Tensor input, Tensor rois,
Tensor offset, Tensor grad_input,
Tensor grad_offset, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) {
c10::SmallVector<int64_t, 2> output_sizes = {pooled_height, pooled_width};
at::IntArrayRef output_size = at::IntArrayRef(output_sizes);
int64_t sampling_ratio_ = (int64_t)sampling_ratio;
OpCommand cmd;
cmd.Name("DeformableRoiPoolGrad")
.Input(grad_input)
.Input(input)
.Input(rois)
.Input(offset)
.Output(grad_output)
.Output(grad_offset)
.Attr("output_size", output_size)
.Attr("spatial_scale", spatial_scale)
.Attr("sample_ratio", sampling_ratio_)
.Attr("gamma", gamma)
.Run();
}
REGISTER_NPU_IMPL(deform_roi_pool_forward_impl, deform_roi_pool_forward_npu);
REGISTER_NPU_IMPL(deform_roi_pool_backward_impl, deform_roi_pool_backward_npu);
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
int64_t n_class = input.size(1);
at::Tensor target_y = at::ones_like(input);
if (n_class == 1) {
target_y = at::reshape(target, input.sizes());
target_y = at::mul(target_y, -1.0);
target_y = at::add(target_y, 1.0);
} else {
target_y = at_npu::native::NPUNativeFunctions::one_hot(target, n_class);
}
target_y =
at_npu::native::NPUNativeFunctions::npu_dtype_cast(target_y, at::kInt);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at_npu::native::NPUNativeFunctions::npu_broadcast(weight,
input.sizes());
}
OpCommand cmd;
string reduction = "none";
cmd.Name("SigmoidFocalLoss")
.Input(input)
.Input(target_y)
.Input(weight_y)
.Output(output)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
}
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha);
void sigmoid_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma,
float alpha) {
int64_t n_class = input.size(1);
at::Tensor target_y = at::ones_like(input);
if (n_class == 1) {
target_y = at::reshape(target, input.sizes());
} else {
target_y = at_npu::native::NPUNativeFunctions::one_hot(target, n_class);
target_y = at::mul(target_y, -1.0);
target_y = at::add(target_y, 1.0);
}
target_y =
at_npu::native::NPUNativeFunctions::npu_dtype_cast(target_y, at::kInt);
at::Tensor grad_up = at::ones_like(input);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at_npu::native::NPUNativeFunctions::npu_broadcast(weight,
input.sizes());
}
OpCommand cmd;
string reduction = "none";
cmd.Name("SigmoidFocalLossGrad")
.Input(input)
.Input(target_y)
.Input(grad_up)
.Input(weight_y)
.Output(grad_input)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
}
void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor grad_input,
float gamma, float alpha);
void softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
int64_t n_class = input.size(1);
at::Tensor target_y =
at_npu::native::NPUNativeFunctions::one_hot(target, n_class);
target_y =
at_npu::native::NPUNativeFunctions::npu_dtype_cast(target_y, at::kInt);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at_npu::native::NPUNativeFunctions::npu_broadcast(weight,
input.sizes());
}
at::Tensor op_output = at::ones_like(input);
OpCommand cmd;
string reduction = "none";
cmd.Name("SoftmaxFocalLoss")
.Input(input)
.Input(target_y)
.Input(weight_y)
.Output(op_output)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
int64_t n_batch = input.size(0);
c10::SmallVector<int64_t, 2> offsets = {0, 0};
c10::SmallVector<int64_t, 2> sizes = {n_batch, 1};
at::IntArrayRef offset = at::IntArrayRef(offsets);
at::IntArrayRef size = at::IntArrayRef(sizes);
at_npu::native::NPUNativeFunctions::npu_slice_out(op_output, offset, size,
output);
}
void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma,
float alpha);
void softmax_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input,
float gamma, float alpha) {
int64_t n_class = input.size(1);
at::Tensor target_y =
at_npu::native::NPUNativeFunctions::one_hot(target, n_class);
target_y =
at_npu::native::NPUNativeFunctions::npu_dtype_cast(target_y, at::kInt);
at::Tensor grad_up = at::ones_like(input);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at_npu::native::NPUNativeFunctions::npu_broadcast(weight,
input.sizes());
}
OpCommand cmd;
string reduction = "none";
cmd.Name("SoftmaxFocalLossGrad")
.Input(input)
.Input(target_y)
.Input(grad_up)
.Input(weight_y)
.Output(grad_input)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
}
void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input, float gamma,
float alpha);
REGISTER_NPU_IMPL(sigmoid_focal_loss_forward_impl,
sigmoid_focal_loss_forward_npu);
REGISTER_NPU_IMPL(sigmoid_focal_loss_backward_impl,
sigmoid_focal_loss_backward_npu);
REGISTER_NPU_IMPL(softmax_focal_loss_forward_impl,
softmax_focal_loss_forward_npu);
REGISTER_NPU_IMPL(softmax_focal_loss_backward_impl,
softmax_focal_loss_backward_npu);
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
Tensor fused_bias_leakyrelu_op_impl(const Tensor &input, const Tensor &bias,
const Tensor &refer, int act, int grad,
float alpha, float scale);
Tensor fused_bias_leakyrelu_npu(const Tensor &input, const Tensor &bias,
const Tensor &refer, int act, int grad,
float alpha, float scale) {
at::Tensor py = at::empty_like(input);
// forward
if (grad == 0) {
auto input_size = input.sizes();
int input_length = input_size.size();
c10::SmallVector<int64_t, SIZE> input_size_tmp;
input_size_tmp = array_to_small_vector(input_size);
if (input_length > 1) {
for (int i = 0; i < input_length; i++) {
if (i != 1) {
input_size_tmp[i] = 1;
}
}
}
at::Tensor bias_tmp = at::reshape(bias, input_size_tmp);
at::Tensor bias_ = at_npu::native::NPUNativeFunctions::npu_broadcast(
bias_tmp, input.sizes());
OpCommand cmd;
cmd.Name("FusedBiasLeakyRelu")
.Input(input)
.Input(bias_)
.Output(py)
.Attr("scale", scale)
.Attr("negative_slope", alpha)
.Run();
}
// backward
if (grad == 1) {
OpCommand cmd;
cmd.Name("FusedBiasLeakyReluGrad")
.Input(input)
.Input(refer)
.Output(py)
.Attr("scale", scale)
.Attr("negative_slope", alpha)
.Run();
}
return py;
}
REGISTER_NPU_IMPL(fused_bias_leakyrelu_op_impl, fused_bias_leakyrelu_npu);
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
Tensor nms_npu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
at::Tensor boxed_offest = at_npu::native::OpPreparation::ApplyTensor(boxes);
at::Tensor ones_tensor =
at_npu::native::OpPreparation::ApplyTensor(boxes).fill_(1);
at::add_out(boxed_offest, boxes, ones_tensor, offset);
at::Tensor iou_threshold_y = at_npu::native::OpPreparation::ApplyTensor(
{}, boxes.options().dtype(at::kFloat), boxes)
.fill_(iou_threshold);
at::Tensor scores_threshold_y =
at_npu::native::OpPreparation::ApplyTensor(
{}, boxes.options().dtype(at::kFloat), boxes)
.fill_(0);
at::Tensor max_outputsize_y = at_npu::native::OpPreparation::ApplyTensor(
{}, boxes.options().dtype(at::kInt), boxes)
.fill_(boxes.size(0));
c10::SmallVector<int64_t, SIZE> outputsize = {boxes.size(0)};
at::Tensor output = at_npu::native::OpPreparation::ApplyTensor(
outputsize, boxes.options().dtype(at::kInt), boxes)
.fill_(-1);
OpCommand cmd;
cmd.Name("NonMaxSuppressionV3")
.Input(boxes)
.Input(scores)
.Input(max_outputsize_y)
.Input(iou_threshold_y)
.Input(scores_threshold_y)
.Output(output)
.Run();
auto outputsizeBool = at::gt(output, -1);
auto outputsizeInt = outputsizeBool.to(at::ScalarType::Int);
auto countLen = at::sum(outputsizeInt, at::ScalarType::Int);
at::Tensor actual_output = output.slice(0, 0, countLen.item().toLong());
actual_output = at_npu::native::NPUNativeFunctions::npu_dtype_cast(
actual_output, at::kLong);
return actual_output;
}
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);
REGISTER_NPU_IMPL(nms_impl, nms_npu);
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
void psamask_forward_npu(const int psa_type, const Tensor x, Tensor y,
const int num, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
int64_t psa_type_i64 = psa_type;
int64_t num_i64 = num;
int64_t h_feature_i64 = h_feature;
int64_t w_feature_i64 = w_feature;
int64_t h_mask_i64 = h_mask;
int64_t w_mask_i64 = w_mask;
int64_t half_h_mask_i64 = half_h_mask;
int64_t half_w_mask_i64 = half_w_mask;
OpCommand cmd;
cmd.Name("PSAMask")
.Input(x)
.Output(y)
.Attr("psa_type", psa_type_i64)
.Attr("num", num_i64)
.Attr("h_feature", h_feature_i64)
.Attr("w_feature", w_feature_i64)
.Attr("h_mask", h_mask_i64)
.Attr("w_mask", w_mask_i64)
.Attr("half_h_mask", half_h_mask_i64)
.Attr("half_w_mask", half_w_mask_i64)
.Run();
}
void psamask_forward_impl(const int psa_type, const Tensor x, Tensor y,
const int num, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_npu(const int psa_type, const Tensor y_grad,
Tensor x_grad, const int num, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
int64_t psa_type_i64 = psa_type;
int64_t num_i64 = num;
int64_t h_feature_i64 = h_feature;
int64_t w_feature_i64 = w_feature;
int64_t h_mask_i64 = h_mask;
int64_t w_mask_i64 = w_mask;
int64_t half_h_mask_i64 = half_h_mask;
int64_t half_w_mask_i64 = half_w_mask;
OpCommand cmd;
cmd.Name("PSAMaskGrad")
.Input(y_grad)
.Output(x_grad)
.Attr("psa_type", psa_type_i64)
.Attr("num", num_i64)
.Attr("h_feature", h_feature_i64)
.Attr("w_feature", w_feature_i64)
.Attr("h_mask", h_mask_i64)
.Attr("w_mask", w_mask_i64)
.Attr("half_h_mask", half_h_mask_i64)
.Attr("half_w_mask", half_w_mask_i64)
.Run();
}
void psamask_backward_impl(const int psa_type, const Tensor y_grad,
Tensor x_grad, const int num, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
REGISTER_NPU_IMPL(psamask_forward_impl, psamask_forward_npu);
REGISTER_NPU_IMPL(psamask_backward_impl, psamask_backward_npu);
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
void roi_pool_forward_npu(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale) {
int64_t pooled_height_64 = pooled_height;
int64_t pooled_width_64 = pooled_width;
int64_t pooled_channel = 1;
at::Tensor roi_actual_num = at_npu::native::OpPreparation::ApplyTensor(
{}, rois.options().dtype(at::kInt), rois);
OpCommand cmd;
cmd.Name("RoiPoolingWithArgMax")
.Input(input)
.Input(rois)
.Input(roi_actual_num)
.Output(output)
.Output(argmax)
.Attr("pooled_h", pooled_height_64)
.Attr("pooled_w", pooled_width_64)
.Attr("spatial_scale_h", spatial_scale)
.Attr("spatial_scale_w", spatial_scale)
.Attr("pool_channel", pooled_channel)
.Run();
}
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale);
REGISTER_NPU_IMPL(roi_pool_forward_impl, roi_pool_forward_npu);
......@@ -13,6 +13,7 @@ from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair, _single
from ..utils import ext_loader
from .modulated_deform_conv import ModulatedDeformConv2dFunction
ext_module = ext_loader.load_ext('_ext', [
'deform_conv_forward', 'deform_conv_backward_input',
......@@ -47,6 +48,23 @@ class DeformConv2dFunction(Function):
bias_i=bias,
im2col_step_i=im2col_step)
@staticmethod
def _npu_backward(ctx, grad_output):
input_tensor, weight, offset_out, offset_all, sort_index_for_npu_bp = \
ctx.saved_tensors
grad_input, grad_weight, grad_offset_all, grad_bias = \
torch.npu_deformable_conv2dbk(
input_tensor, grad_output, offset_out, weight, offset_all,
kernel_size=[weight.shape[3], weight.shape[2]],
stride=[1, 1, ctx.stride[0], ctx.stride[1]],
padding=[1, 1, ctx.padding[0], ctx.padding[1]],
dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]],
groups=ctx.groups, deformable_groups=ctx.deform_groups,
modulated=True)
grad_offset = grad_offset_all.index_select(1, sort_index_for_npu_bp)
return grad_input, grad_offset, grad_weight, \
None, None, None, None, None, None, None
@staticmethod
def forward(ctx,
input: Tensor,
......@@ -70,6 +88,7 @@ class DeformConv2dFunction(Function):
ctx.groups = groups
ctx.deform_groups = deform_groups
ctx.im2col_step = im2col_step
ctx.device = input.device.type
# When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
# amp won't cast the type of model (float32), but "offset" is cast
......@@ -80,6 +99,13 @@ class DeformConv2dFunction(Function):
# whatever the pytorch version is.
input = input.type_as(offset)
weight = weight.type_as(input)
if ctx.device == 'npu':
mask_shape, _ = torch.chunk(offset, 2, dim=1)
mask = torch.ones_like(mask_shape).to(input.device)
bias = input.new_empty(0)
output = ModulatedDeformConv2dFunction._npu_forward(
ctx, input, offset, mask, weight, bias)
return output
ctx.save_for_backward(input, offset, weight)
output = input.new_empty(
......@@ -116,6 +142,8 @@ class DeformConv2dFunction(Function):
ctx, grad_output: Tensor
) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], None,
None, None, None, None, None, None]:
if ctx.device == 'npu':
return DeformConv2dFunction._npu_backward(ctx, grad_output)
input, offset, weight = ctx.saved_tensors
grad_input = grad_offset = grad_weight = None
......
......@@ -25,8 +25,7 @@ class SigmoidFocalLossFunction(Function):
weight: Optional[torch.Tensor] = None,
reduction: str = 'mean') -> torch.Tensor:
assert isinstance(
target, (torch.Tensor, torch.LongTensor, torch.cuda.LongTensor))
assert target.dtype == torch.long
assert input.dim() == 2
assert target.dim() == 1
assert input.size(0) == target.size(0)
......@@ -117,7 +116,7 @@ class SoftmaxFocalLossFunction(Function):
weight: Optional[torch.Tensor] = None,
reduction='mean') -> torch.Tensor:
assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor))
assert target.dtype == torch.long
assert input.dim() == 2
assert target.dim() == 1
assert input.size(0) == target.size(0)
......
......@@ -45,6 +45,24 @@ class MaskedConv2dFunction(Function):
'Stride could not only be 1 in masked_conv2d currently.')
out_channel, in_channel, kernel_h, kernel_w = weight.size()
if features.device.type == 'npu':
import torch_npu
output = torch_npu.npu_conv2d(
features,
weight,
bias,
stride=(stride_h, stride_w),
padding=(pad_h, pad_w),
dilation=(1, 1),
groups=1)
if mask.size()[1:] != output.size()[2:]:
raise ValueError(
'The mask is inconsistent with the shape of output_conv.')
mask = mask > 0
mask = mask.type(output.dtype)
output = output * mask
return output
batch_size = features.size(0)
out_h = int(
math.floor(
......
......@@ -35,6 +35,66 @@ class ModulatedDeformConv2dFunction(Function):
groups_i=groups,
deform_groups_i=deform_groups)
@staticmethod
def _calculate_sort_index(kernel_h, kernel_w, deformable_group):
split_num = deformable_group * 2 * kernel_h * kernel_w
sort_index = list(range(split_num))
sort_index_fp = (sort_index[1::2] + sort_index[::2])
sort_index_bp_dict = {i: idx for idx, i in enumerate(sort_index_fp)}
sort_index_bp = [sort_index_bp_dict[i] for i in sort_index]
sort_index_fp = torch.IntTensor(sort_index_fp)
sort_index_bp = torch.IntTensor(sort_index_bp)
sort_index_fp = sort_index_fp.npu()
sort_index_bp = sort_index_bp.npu()
return sort_index_fp, sort_index_bp
@staticmethod
def _npu_forward(ctx, input_tensor, offset, mask, weight, bias):
_, _, kernel_h, kernel_w = weight.shape
conv2d_bias = bias if len(bias) > 0 else None
sort_index_fp, sort_index_bp = \
ModulatedDeformConv2dFunction._calculate_sort_index(
kernel_w, kernel_h, ctx.deform_groups)
select_offset = offset.index_select(1, sort_index_fp)
offset_all = torch.cat([select_offset, mask], dim=1)
output, offset_out = torch.npu_deformable_conv2d(
input_tensor,
weight,
offset_all,
conv2d_bias,
kernel_size=[kernel_w, kernel_h],
stride=[1, 1, ctx.stride[0], ctx.stride[1]],
padding=[1, 1, ctx.padding[0], ctx.padding[1]],
dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]],
groups=ctx.groups,
deformable_groups=ctx.deform_groups,
modulated=True)
if weight.requires_grad or mask.requires_grad or offset.requires_grad \
or input_tensor.requires_grad:
ctx.save_for_backward(input_tensor, weight, offset_out, offset_all,
sort_index_bp)
return output
@staticmethod
def _npu_backward(ctx, grad_output):
input_tensor, weight, offset_out, offset_all, sort_index_bp = \
ctx.saved_tensors
grad_input, grad_weight, grad_offset_all, grad_bias = \
torch.npu_deformable_conv2dbk(
input_tensor, grad_output, offset_out, weight, offset_all,
kernel_size=[weight.shape[3], weight.shape[2]],
stride=[1, 1, ctx.stride[0], ctx.stride[1]],
padding=[1, 1, ctx.padding[0], ctx.padding[1]],
dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]],
groups=ctx.groups, deformable_groups=ctx.deform_groups,
modulated=True)
grad_offset = grad_offset_all.index_select(1, sort_index_bp)
grad_mask = grad_offset_all[:, grad_offset.shape[1]:, :, :]
if not ctx.with_bias:
grad_bias = None
return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
None, None, None, None, None, None, None, None)
@staticmethod
def forward(ctx,
input: torch.Tensor,
......@@ -57,6 +117,7 @@ class ModulatedDeformConv2dFunction(Function):
ctx.groups = groups
ctx.deform_groups = deform_groups
ctx.with_bias = bias is not None
ctx.device = input.device.type
if not ctx.with_bias:
bias = input.new_empty(0) # fake tensor
# When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
......@@ -70,6 +131,10 @@ class ModulatedDeformConv2dFunction(Function):
weight = weight.type_as(input)
bias = bias.type_as(input) # type: ignore
mask = mask.type_as(input)
if ctx.device == 'npu':
output = ModulatedDeformConv2dFunction._npu_forward(
ctx, input, offset, mask, weight, bias)
return output
ctx.save_for_backward(input, offset, mask, weight, bias)
output = input.new_empty(
ModulatedDeformConv2dFunction._output_size(ctx, input, weight))
......@@ -99,6 +164,9 @@ class ModulatedDeformConv2dFunction(Function):
@staticmethod
@once_differentiable
def backward(ctx, grad_output: torch.Tensor) -> tuple:
if ctx.device == 'npu':
return ModulatedDeformConv2dFunction._npu_backward(
ctx, grad_output)
input, offset, mask, weight, bias = ctx.saved_tensors
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
......
# Copyright (c) OpenMMLab. All rights reserved.
from .device_type import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE
from .device_type import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE,
IS_MPS_AVAILABLE, IS_NPU_AVAILABLE)
from .env import collect_env
from .parrots_jit import jit, skip_no_elena
__all__ = [
'IS_MLU_AVAILABLE', 'IS_MPS_AVAILABLE', 'IS_CUDA_AVAILABLE', 'collect_env',
'jit', 'skip_no_elena'
'IS_MLU_AVAILABLE', 'IS_MPS_AVAILABLE', 'IS_CUDA_AVAILABLE',
'IS_NPU_AVAILABLE', 'collect_env', 'jit', 'skip_no_elena'
]
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.device import (is_cuda_available, is_mlu_available,
is_mps_available)
is_mps_available, is_npu_available)
IS_MLU_AVAILABLE = is_mlu_available()
IS_MPS_AVAILABLE = is_mps_available()
IS_CUDA_AVAILABLE = is_cuda_available()
IS_NPU_AVAILABLE = is_npu_available()
......@@ -270,6 +270,21 @@ def get_extensions():
extension = CppExtension
include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/mps'))
elif (os.getenv('FORCE_NPU', '0') == '1'):
print(f'Compiling {ext_name} only with CPU and NPU')
try:
from torch_npu.utils.cpp_extension import NpuExtension
define_macros += [('MMCV_WITH_NPU', None)]
extension = NpuExtension
except Exception:
raise ImportError('can not find any torch_npu')
# src
op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \
glob.glob('./mmcv/ops/csrc/common/npu/*.cpp') + \
glob.glob('./mmcv/ops/csrc/pytorch/npu/*.cpp')
include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/npu'))
else:
print(f'Compiling {ext_name} only with CPU')
op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \
......
......@@ -5,7 +5,7 @@ import numpy as np
import pytest
import torch
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE
_USING_PARROTS = True
try:
......@@ -126,6 +126,10 @@ class TestDeformRoIPool:
assert np.allclose(x.grad.data.cpu().numpy(), np_grad, 1e-3)
@pytest.mark.parametrize('device', [
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support')),
pytest.param(
'cuda',
marks=pytest.mark.skipif(
......
......@@ -3,7 +3,7 @@ import numpy as np
import pytest
import torch
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE
_USING_PARROTS = True
try:
......@@ -130,6 +130,10 @@ class Testfocalloss:
self._test_softmax(dtype=torch.half)
@pytest.mark.parametrize('device', [
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support')),
pytest.param(
'cuda',
marks=pytest.mark.skipif(
......@@ -143,6 +147,10 @@ class Testfocalloss:
self._test_sigmoid(device=device, dtype=torch.float)
@pytest.mark.parametrize('device', [
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support')),
pytest.param(
'cuda',
marks=pytest.mark.skipif(
......
......@@ -2,6 +2,8 @@
import pytest
import torch
from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE
_USING_PARROTS = True
try:
from parrots.autograd import gradcheck
......@@ -14,16 +16,32 @@ class TestFusedBiasLeakyReLU:
@classmethod
def setup_class(cls):
if not torch.cuda.is_available():
if not IS_CUDA_AVAILABLE and not IS_NPU_AVAILABLE:
return
cls.input_tensor = torch.randn((2, 2, 2, 2), requires_grad=True).cuda()
if IS_CUDA_AVAILABLE:
cls.input_tensor = torch.randn((2, 2, 2, 2),
requires_grad=True).cuda()
cls.bias = torch.zeros(2, requires_grad=True).cuda()
elif IS_NPU_AVAILABLE:
cls.input_tensor = torch.randn((2, 2, 2, 2),
requires_grad=True).npu()
cls.bias = torch.zeros(2, requires_grad=True).npu()
@pytest.mark.skipif(not torch.cuda.is_available(), reason='requires cuda')
def test_gradient(self):
@pytest.mark.parametrize('device', [
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_gradient(self, device):
from mmcv.ops import FusedBiasLeakyReLU
if _USING_PARROTS:
if IS_CUDA_AVAILABLE:
gradcheck(
FusedBiasLeakyReLU(2).cuda(),
self.input_tensor,
......@@ -31,19 +49,26 @@ class TestFusedBiasLeakyReLU:
pt_atol=1e-3)
else:
gradcheck(
FusedBiasLeakyReLU(2).cuda(),
FusedBiasLeakyReLU(2).to(device),
self.input_tensor,
eps=1e-4,
atol=1e-3)
@pytest.mark.skipif(
not torch.cuda.is_available() or _USING_PARROTS,
reason='requires cuda')
def test_gradgradient(self):
@pytest.mark.parametrize('device', [
pytest.param(
'cuda',
marks=pytest.mark.skipif(
not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_gradgradient(self, device):
from mmcv.ops import FusedBiasLeakyReLU
gradgradcheck(
FusedBiasLeakyReLU(2).cuda(),
FusedBiasLeakyReLU(2).to(device),
self.input_tensor,
eps=1e-4,
atol=1e-3)
......@@ -4,7 +4,7 @@ import pytest
import torch
import torch.nn as nn
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE
class Loss(nn.Module):
......@@ -28,7 +28,11 @@ class TestPSAMask:
pytest.param(
'mlu',
marks=pytest.mark.skipif(
not IS_MLU_AVAILABLE, reason='requires MLU support'))
not IS_MLU_AVAILABLE, reason='requires MLU support')),
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_psa_mask_collect(self, device):
from mmcv.ops import PSAMask
......@@ -76,7 +80,11 @@ class TestPSAMask:
pytest.param(
'mlu',
marks=pytest.mark.skipif(
not IS_MLU_AVAILABLE, reason='requires MLU support'))
not IS_MLU_AVAILABLE, reason='requires MLU support')),
pytest.param(
'npu',
marks=pytest.mark.skipif(
not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
def test_psa_mask_distribute(self, device):
from mmcv.ops import PSAMask
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment