Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/spconv/reordering.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <spconv/reordering.cuh>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.cuh>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
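// Vectorized load type: int2 (8 bytes) for half, int4 (16 bytes) otherwise,
// so vecloadFactor below is 4 scalars per load for half/float and 2 for double.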
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
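// nHotBlock is the largest multiple of NumTLP not exceeding size; the tiled
// block kernel covers these rows and gatherVecKernel below handles the
// remaining (size - nHotBlock) rows.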
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
gatherVecKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
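// Fallback when numPlanes is not a multiple of any tile width tried above:
// launch the generic, non-vectorized gather kernel.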
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0) return;
int numPlanes = outFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(scalar_t); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
scatterAddGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(
outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
indices.data() + nHotBlock, size - nHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index) \
template struct functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, Index>; \
template struct functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, \
Index>;
#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
at::cuda::CUDAGuard device_guard(indices.device());
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1;
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor gridOut =
torch::full({batchSize * outputVolume}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
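// Layout: indicePairs[k][0][j] / indicePairs[k][1][j] hold the input/output
// indices of the j-th active pair for kernel offset k, indiceNum[k] counts
// the valid pairs per offset, and gridOut is a dense lookup table over
// batchSize * outputVolume output locations.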
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
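// Convert the shape parameters to 32-bit SimpleVectors. For submanifold
// (subM) convolution, stride is forced to 1 and padding to kernelSize / 2
// ("same" padding) so that output sites coincide with the input sites.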
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
at::cuda::CUDAGuard device_guard(indices.device());
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1;
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
gridOut.fill_(-1);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, true);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose, true);
gridOut.fill_(-1);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
true);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
torch::Tensor IndiceConvForwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
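// inputBuffer / outputBuffer are reused for every kernel offset, so they are
// sized for the offset with the largest number of active index pairs.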
filters = filters.view({-1, numInPlanes, numOutPlanes});
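// For submanifold convolution the offset with the most index pairs is the
// center offset (every active site pairs with itself), so its contribution
// is computed as one dense GEMM here and skipped in the loop below.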
if (subM) {
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
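// Gather -> GEMM -> scatter-add: for each kernel offset, gather the active
// input rows, multiply them by that offset's filter, and scatter-add the
// result into the corresponding output rows.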
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceConvForwardKernel", [&] {
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, possibly due to the int->long conversion:
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
{nHot}, indicePairOptions);
torch::index_select_out(inputBufferBlob, features, 0, indicePairBlob); */
}
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
});
}
return output;
}
std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
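// Dense gradients for the center offset of a submanifold convolution:
// dW = X^T * dY and dX = dY * W^T.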
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceConvBackwardKernel", [&] {
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(
tv::CPU(), tv::torch2tv<scalar_t>(outputBuffer),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(
tv::TorchGPU(), tv::torch2tv<scalar_t>(outputBuffer),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const scalar_t>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const scalar_t>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
TV_CHECK_CUDA_ERR();
}
});
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<2>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<3>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<4>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<2>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<3>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
@@ -23,7 +23,7 @@ void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -51,7 +51,7 @@ void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -21,7 +21,7 @@ void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -145,6 +145,104 @@ int HardVoxelizeForwardCUDAKernelLauncher(
return voxel_num_int;
}
int NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim = 3) {
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int num_points = points.size(0);
const int num_features = points.size(1);
if (num_points == 0) return 0;
dim3 blocks(
std::min(at::cuda::ATenCeilDiv(num_points, THREADS_PER_BLOCK), 4096));
dim3 threads(THREADS_PER_BLOCK);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
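// Number of voxels along each axis, derived from the point cloud range and
// the voxel size.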
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
points.contiguous().data_ptr<scalar_t>(),
temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,
NDim);
}));
at::Tensor coors_map;
at::Tensor reduce_count;
auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);
std::tie(temp_coors, coors_map, reduce_count) =
at::unique_dim(coors_clean, 0, true, true, false);
if (temp_coors[0][0].lt(0).item<bool>()) {
// the first element of temp_coors is (-1,-1,-1) and should be removed
temp_coors = temp_coors.slice(0, 1);
coors_map = coors_map - 1;
}
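// temp_coors now holds the unique voxel coordinates; coors_map maps each
// point to its voxel index, with -1 for points that fell outside the range.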
int num_coors = temp_coors.size(0);
temp_coors = temp_coors.to(at::kInt);
coors_map = coors_map.to(at::kInt);
at::Tensor coors_count = at::zeros({1}, coors_map.options());
at::Tensor coors_order = at::empty({num_coors}, coors_map.options());
at::Tensor pts_id = at::zeros({num_points}, coors_map.options());
reduce_count = at::zeros({num_coors}, coors_map.options());
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "get_assign_pos", ([&] {
nondeterministic_get_assign_pos<<<blocks, threads, 0, stream>>>(
num_points, coors_map.contiguous().data_ptr<int32_t>(),
pts_id.contiguous().data_ptr<int32_t>(),
coors_count.contiguous().data_ptr<int32_t>(),
reduce_count.contiguous().data_ptr<int32_t>(),
coors_order.contiguous().data_ptr<int32_t>());
}));
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
nondeterministic_assign_point_voxel<scalar_t>
<<<blocks, threads, 0, stream>>>(
num_points, points.contiguous().data_ptr<scalar_t>(),
coors_map.contiguous().data_ptr<int32_t>(),
pts_id.contiguous().data_ptr<int32_t>(),
temp_coors.contiguous().data_ptr<int32_t>(),
reduce_count.contiguous().data_ptr<int32_t>(),
coors_order.contiguous().data_ptr<int32_t>(),
voxels.contiguous().data_ptr<scalar_t>(),
coors.contiguous().data_ptr<int32_t>(),
num_points_per_voxel.contiguous().data_ptr<int32_t>(),
max_voxels, max_points, num_features, NDim);
}));
AT_CUDA_CHECK(cudaGetLastError());
return max_voxels < num_coors ? max_voxels : num_coors;
}
void DynamicVoxelizeForwardCUDAKernelLauncher(
const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size, const std::vector<float> coors_range,
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
Tensor num_valid) {
return DISPATCH_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl,
vertices, mask, num_valid);
}
Tensor diff_iou_rotated_sort_vertices_forward(Tensor vertices, Tensor mask,
Tensor num_valid) {
return diff_iou_rotated_sort_vertices_forward_impl(vertices, mask, num_valid);
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor fused_indice_conv_batchnorm_forward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return DISPATCH_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl,
features, filters, bias, indicePairs, indiceNum,
numActOut, _inverse, _subM);
}
torch::Tensor fused_indice_conv_batchnorm_forward(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return fused_indice_conv_batchnorm_forward_impl(features, filters, bias,
indicePairs, indiceNum,
numActOut, _inverse, _subM);
}
@@ -19,31 +19,24 @@ void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
num_b, boxes_b, ans_overlap);
}
- void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
- const int num_b, const Tensor boxes_b,
- Tensor ans_iou) {
- DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b,
- boxes_b, ans_iou);
- }
- void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
- int boxes_num, float nms_overlap_thresh) {
- DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num,
+ void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long *mask,
+ int boxes_num, float nms_overlap_thresh) {
+ DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
- void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask,
- int boxes_num, float nms_overlap_thresh) {
- DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num,
+ void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
+ unsigned long long *mask, int boxes_num,
+ float nms_overlap_thresh) {
+ DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap) {
- // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
+ // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
@@ -51,20 +44,9 @@ void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
ans_overlap);
}
- void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
- Tensor ans_iou) {
- // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
- // params boxes_b: (M, 5)
- // params ans_overlap: (N, M)
- int num_a = boxes_a.size(0);
- int num_b = boxes_b.size(0);
- iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou);
- }
- void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
- float nms_overlap_thresh) {
- // params boxes: (N, 5) [x1, y1, x2, y2, ry]
+ void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
+ float nms_overlap_thresh) {
+ // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params keep: (N)
CHECK_CONTIGUOUS(boxes);
CHECK_CONTIGUOUS(keep);
@@ -73,13 +55,14 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
- const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+ const int col_blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
- iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);
+ iou3d_nms3d_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
@@ -105,9 +88,9 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
}
}
- void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
- float nms_overlap_thresh) {
- // params boxes: (N, 5) [x1, y1, x2, y2, ry]
+ void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
+ float nms_overlap_thresh) {
+ // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params keep: (N)
CHECK_CONTIGUOUS(boxes);
@@ -117,14 +100,15 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
- const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+ const int col_blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
- iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num,
- nms_overlap_thresh);
+ iou3d_nms3d_normal_forward_impl(boxes, mask_data, boxes_num,
+ nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons) {
DISPATCH_DEVICE_IMPL(min_area_polygons_impl, pointsets, polygons);
}
void min_area_polygons(const Tensor pointsets, Tensor polygons) {
min_area_polygons_impl(pointsets, polygons);
}
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *bbox1, const void *bbox2, void *ious,
const int32_t num_bbox1, const int32_t num_bbox2,
const int32_t mode, const bool aligned,
const int32_t offset);
static void policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
const int32_t batch_num_all) {
auto union_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto core_num = union_num * core_dim;
// Union1 policyFunc
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_dim;
auto need_core_num = PAD_UP(batch_num_all, core_dim);
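// Use only as many clusters as needed to cover batch_num_all, capped at the
// number of clusters available on the device.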
k_dim->y =
(need_core_num < core_num) ? (need_core_num / core_dim) : union_num;
k_dim->z = 1;
return;
}
void BBoxOverlapsMLUKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int32_t mode,
const bool aligned, const int32_t offset) {
// check dtype
TORCH_CHECK(
bboxes1.scalar_type() == at::kFloat || bboxes1.scalar_type() == at::kHalf,
"Data type of input should be Float or Half. But now input type is ",
bboxes1.scalar_type(), ".");
TORCH_CHECK(bboxes1.scalar_type() == bboxes2.scalar_type(),
"bboxes1's dtype should be the same with bboxes2's dtype.");
// params check
TORCH_CHECK(bboxes1.dim() == 2, "bboxes1 should be a 2d tensor, got ",
bboxes1.dim(), "D");
TORCH_CHECK(bboxes2.dim() == 2, "bboxes2 should be a 2d tensor, got ",
bboxes2.dim(), "D");
auto rows = bboxes1.size(0);
auto cols = bboxes2.size(0);
auto batch_num_all = rows;
if (rows * cols == 0) {
// return if zero element
return;
}
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(&k_dim, &k_type, batch_num_all);
// get compute queue
cnrtQueue_t queue = torch_mlu::getCurQueue();
// get dtype of input
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(bboxes1.dtype());
// get ptr of tensors
auto bboxes1_impl = torch_mlu::getMluTensorImpl(bboxes1);
auto bboxes1_ptr = bboxes1_impl->cnnlMalloc();
auto bboxes2_impl = torch_mlu::getMluTensorImpl(bboxes2);
auto bboxes2_ptr = bboxes2_impl->cnnlMalloc();
auto ious_impl = torch_mlu::getMluTensorImpl(ious);
auto ious_ptr = ious_impl->cnnlMalloc();
// launch kernel
CNLOG(INFO) << "Launch Kernel MLUUnion1BboxOverlapsKernel";
CNLOG(INFO) << "kDim :[ " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z
<< " ]";
KernelBBoxOverlaps(k_dim, k_type, queue, d_type, bboxes1_ptr, bboxes2_ptr,
ious_ptr, rows, cols, mode, aligned, offset);
}
void bbox_overlaps_mlu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
BBoxOverlapsMLUKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MLU, bbox_overlaps_mlu);
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <string>
#include <vector>
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const int32_t N,
const int32_t C, const float alpha,
const float gamma, void *output);
void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const float gamma,
const float alpha, const int32_t dim_n,
const int32_t deal_n, const int32_t dim_c,
void *output);
// Policy Function for Forward
static void policyFuncForward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
const Tensor &input, const Tensor &target,
const Tensor &weight) {
auto N = input.size(0);
auto C = input.size(1);
const size_t nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
const size_t c_align_size = PAD_UP((C * input.itemsize()), NFU_ALIGN_SIZE);
const int split_target_num = 2;
const int split_pipeline_num = 6;
const int has_weight = weight.data_ptr() != nullptr;
const int target_data_width = target.scalar_type() == at::kLong
? target.itemsize() / 2
: target.itemsize();
const int threshold_c =
PAD_DOWN((nram_size - split_target_num * sizeof(int)) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
input.itemsize();
int n_seg = 1;
if (C <= threshold_c) {
int c_size = C * input.itemsize();
int reserved_align_size =
(split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE;
int weight_size = 0;
if (has_weight) {
c_size = c_align_size;
reserved_align_size = split_target_num * NFU_ALIGN_SIZE;
weight_size = c_align_size;
}
// n_seg * c_size * split_pipeline_num + n_seg * target.itemsize() *
// split_target_num
// + weight_size + reserved_align_size <= nram_size
n_seg = (nram_size - weight_size - reserved_align_size) /
(split_pipeline_num * c_size + split_target_num * sizeof(int32_t));
}
auto seg_num = n_seg == 0 ? N : (N + n_seg - 1) / n_seg;
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
auto core_num = core_dim * cluster_num;
k_dim->x = *k_type;
k_dim->y =
seg_num > core_num ? cluster_num : (seg_num + core_dim - 1) / core_dim;
k_dim->z = 1;
}
// Policy Function for Backward
static void policyFuncBackward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
// set Union1 Job
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim->y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
k_dim->z = 1;
}
void SigmoidFocalLossForwardMLUKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha) {
// params check
TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ",
"But now gamma is ", gamma, ".");
// check dtype
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"Data type of input should be Float or Half. But now input type is ",
input.scalar_type(), ".");
TORCH_CHECK(
(target.scalar_type() == at::kInt || target.scalar_type() == at::kLong),
"target type should be Int or Long. ", "But now target type is ",
target.scalar_type(), ".");
if (weight.data_ptr() != nullptr) {
TORCH_CHECK(weight.scalar_type() == input.scalar_type(),
"Data types of input and weight should be the same. But now "
"input type is ",
input.scalar_type(), ", weight type is ", weight.scalar_type(),
".");
} else {
CNLOG(INFO) << "weight is a empty tensor.";
}
// return if zero-element
if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {
return;
}
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
policyFuncForward(&k_dim, &k_type, input, target, weight);
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input);
auto input_ptr = input_impl->cnnlMalloc();
auto target_impl = torch_mlu::getMluTensorImpl(target);
auto target_ptr = target_impl->cnnlMalloc();
auto weight_impl = torch_mlu::getMluTensorImpl(weight);
auto weight_ptr = weight_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output);
auto output_ptr = output_impl->cnnlMalloc();
// get dtype of input
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
CNLOG(INFO) << "Launch Kernel KernelFocalLossSigmoidForward<<<Union"
<< k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y << ", "
<< k_dim.z << ">>>";
// launch kernel
KernelFocalLossSigmoidForward(k_dim, k_type, queue, d_type, input_ptr,
target_ptr, weight_ptr, input.size(0),
input.size(1), alpha, gamma, output_ptr);
}
void getDealNAndThresholdC(const int compute_data_bytes,
const int target_data_bytes, const int total_c,
int *deal_n_ptr, int *threshold_c_ptr,
const bool has_weight, const bool is_half) {
/* NRAM partition:
*
* |-----------------ping pong--------------------|
* |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight|
*
* split_pipeline_num is 5: including input, pt, alpha_t, temp, output.
*/
const int nram_split_num = 5;
const int nram_split_pingpong = 2;
const int max_nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
int32_t compute_align_size = NFU_ALIGN_SIZE;
if (is_half) {
compute_align_size += NFU_ALIGN_SIZE;
}
const int32_t compute_align_num = compute_align_size / compute_data_bytes;
// reserved_align_size: including input(ping pong), pt(ping pong),
// alpha_t(ping pong), temp(ping pong),
// output(ping pong), target(ping pong),
// flt_min and gamma.
const int reserved_align_size =
((nram_split_num + 1) * nram_split_pingpong + 2) * compute_align_size;
int nram_pingpong_size = max_nram_size - reserved_align_size;
int compute_c = total_c;
int threshold_c = 0;
if (has_weight) {
// reserved space for weight to align
nram_pingpong_size -= NFU_ALIGN_SIZE;
// threshold_c * nram_split_pingpong * compute_data_bytes * nram_split_num +
// nram_split_pingpong * target_data_bytes +
// threshold_c * compute_data_bytes <= nram_pingpong_size
threshold_c =
(nram_pingpong_size - nram_split_pingpong * target_data_bytes) /
(compute_data_bytes * (nram_split_num * nram_split_pingpong + 1));
threshold_c = PAD_DOWN(threshold_c, compute_align_num);
int weight_space = PAD_UP(total_c * compute_data_bytes, NFU_ALIGN_SIZE);
// reserved space for weight
nram_pingpong_size -= weight_space;
compute_c = PAD_UP(total_c, compute_align_num);
} else {
// threshold_c * nram_split_pingpong * compute_data_bytes * nram_split_num +
// nram_split_pingpong * target_data_bytes <= nram_pingpong_size
threshold_c =
(nram_pingpong_size / nram_split_pingpong - target_data_bytes) /
(nram_split_num * compute_data_bytes);
}
// deal_n * compute_c * nram_split_pingpong * compute_data_bytes *
// nram_split_num + deal_n * nram_split_pingpong * target_data_bytes <=
// nram_pingpong_size
*deal_n_ptr =
nram_pingpong_size /
((nram_split_num * compute_c * compute_data_bytes + target_data_bytes) *
nram_split_pingpong);
*threshold_c_ptr = threshold_c;
}
void SigmoidFocalLossBackwardMLUKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha) {
// params check
TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ",
"But now gamma is ", gamma, ".");
// check dtype
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"Data type of input should be Float or Half. But now input type is ",
input.scalar_type(), ".");
TORCH_CHECK(
(target.scalar_type() == at::kInt || target.scalar_type() == at::kLong),
"target type should be Int or Long. ", "But now target type is ",
target.scalar_type(), ".");
bool has_weight = false;
if (weight.data_ptr() != nullptr) {
TORCH_CHECK(weight.scalar_type() == input.scalar_type(),
"Data types of input and weight should be the same. But now "
"input type is ",
input.scalar_type(), ", weight type is ", weight.scalar_type(),
".");
has_weight = true;
} else {
CNLOG(INFO) << "weight is a empty tensor.";
}
auto dim_c = input.size(1);
const int compute_data_bytes = sizeof(float);
// target supports only INT on MLU device while it keeps LONG on host side,
// so target.itemsize() / 2
const int target_data_bytes = target.scalar_type() == at::kLong
? (target.itemsize() / 2)
: target.itemsize();
int deal_n = 0;
int threshold_c = 0;
bool is_half = false;
if (input.scalar_type() == at::kHalf) {
is_half = true;
}
// calculate deal_n and threshold_c
getDealNAndThresholdC(compute_data_bytes, target_data_bytes, dim_c, &deal_n,
&threshold_c, has_weight, is_half);
// check C
TORCH_CHECK(threshold_c >= dim_c,
"input.size(1) should be in the range of [0, ", threshold_c,
"]. ", "But now input.size(1) is ", dim_c, ".");
if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {
// return if zero-element
return;
}
// set task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFuncBackward(&k_dim, &k_type);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input);
auto input_ptr = input_impl->cnnlMalloc();
auto target_impl = torch_mlu::getMluTensorImpl(target);
auto target_ptr = target_impl->cnnlMalloc();
auto weight_impl = torch_mlu::getMluTensorImpl(weight);
auto weight_ptr = weight_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output);
auto output_ptr = output_impl->cnnlMalloc();
// get dtype of input
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto dim_n = input.size(0);
CNLOG(INFO) << "Launch Kernel KernelFocalLossSigmoidBackward<<<Union"
<< k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y << ", "
<< k_dim.z << ">>>";
// launch kernel
KernelFocalLossSigmoidBackward(k_dim, k_type, queue, d_type, input_ptr,
target_ptr, weight_ptr, gamma, alpha, dim_n,
deal_n, dim_c, output_ptr);
}
void sigmoid_focal_loss_forward_mlu(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
SigmoidFocalLossForwardMLUKernelLauncher(input, target, weight, output, gamma,
alpha);
}
void sigmoid_focal_loss_backward_mlu(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma,
float alpha) {
SigmoidFocalLossBackwardMLUKernelLauncher(input, target, weight, grad_input,
gamma, alpha);
}
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha);
void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor grad_input,
float gamma, float alpha);
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, MLU,
sigmoid_focal_loss_forward_mlu);
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, MLU,
sigmoid_focal_loss_backward_mlu);
/*************************************************************************
* Copyright (C) 2021 by Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes,
const int input_stride, const int max_output_boxes,
const float iou_threshold, const float offset,
void *workspace_ptr, void *output_size_ptr, void *output_ptr);
int selectUnionType(uint32_t use_job, int box_num_per_core) {
// the box_num_per_core should be at least 256, otherwise the real IO
// bandwidth would be very low
while (box_num_per_core < 256 && use_job >= 4) {
box_num_per_core *= 2;
use_job /= 2;
}
return use_job;
}
Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset) {
// dimension parameters check
TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
boxes.dim(), "D");
TORCH_CHECK(boxes.size(1) == 4,
"boxes should have 4 elements in dimension 1, got ",
boxes.size(1));
TORCH_CHECK(scores.dim() == 1, "scores should be a 1d tensor, got ",
scores.dim(), "D");
// data type check
TORCH_CHECK(boxes.scalar_type() == scores.scalar_type(),
"boxes should have the same type as scores");
TORCH_CHECK(
boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
"data type of boxes should be Float or Half, got ", boxes.scalar_type());
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
int input_num_boxes = boxes.size(0);
int input_stride = boxes.size(0);
int max_output_boxes = boxes.size(0);
cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
cnrtDim3_t k_dim;
cnrtJobType_t k_type;
uint32_t union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t job_limit = union_number * core_dim;
uint32_t core_number = union_number * core_dim;
int box_num_per_core = (input_num_boxes + core_number - 1) / core_number;
// initiate k_type as Union1
k_dim.x = core_dim;
k_dim.y = 1;
k_dim.z = 1;
k_type = CNRT_FUNC_TYPE_UNION1;
int use_job = selectUnionType(job_limit, box_num_per_core);
if (use_job < 4) {
k_dim.x = 1;
k_type = CNRT_FUNC_TYPE_BLOCK;
} else if (use_job == 4) {
k_dim.x = core_dim;
k_type = CNRT_FUNC_TYPE_UNION1;
} else {
k_dim.x = use_job;
k_type = (cnrtFunctionType_t)use_job;
}
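// Fewer than 4 usable jobs launches a single Block task, exactly 4 launches
// one Union1 task, and more launches a larger UnionX task of that size.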
// transpose boxes (n, 4) to (4, n) for better performance
auto boxes_t = boxes.transpose(0, 1);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);
auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kLong));
auto output_size = at::empty({1}, scores.options().dtype(at::kInt));
// workspace
const int info_num = 5; // x1, x2, y1, y2 and score
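// The workspace stores these five values per box (16-bit for half boxes,
// 32-bit floats otherwise) plus one extra float.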
size_t space_size = 0;
if (boxes.scalar_type() == at::kHalf) {
space_size = input_num_boxes * sizeof(int16_t) * info_num + sizeof(float);
} else {
space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float);
}
auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
// get compute queue
auto queue = torch_mlu::getCurQueue();
auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
auto boxes_ptr = boxes_impl->cnnlMalloc();
auto scores_impl = torch_mlu::getMluTensorImpl(scores_);
auto scores_ptr = scores_impl->cnnlMalloc();
auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
auto workspace_ptr = workspace_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output);
auto output_ptr = output_impl->cnnlMalloc();
auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
auto output_size_ptr = output_size_impl->cnnlMalloc();
CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<<Union" << k_type / core_dim
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
input_num_boxes, input_stride, max_output_boxes, iou_threshold,
offset, workspace_ptr, output_size_ptr, output_ptr);
int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
return output.slice(0, 0, output_num);
}
Tensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return NMSMLUKernelLauncher(boxes, scores, iou_threshold, offset);
}
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);
REGISTER_DEVICE_IMPL(nms_impl, MLU, nms_mlu);
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <algorithm>
#include "psamask_utils.hpp"
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#define COMPUTE_COUNT_ALIGN 64
void KernelPsamaskForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *x, void *y, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int x_c, const int y_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg);
void KernelPsamaskBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *dy, void *dx, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg);
namespace {
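// Split the (batch, h_feature) workload first across clusters and then across
// the cores of each cluster: partition along the batch dimension when it is
// large enough, otherwise along the feature height.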
void policyFunc(cnrtDim3_t *k_dim_ptr, cnrtFunctionType_t *f_type_ptr,
PartitionSeg *partition_ptr, const int n, const int h_feature) {
unsigned int core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
unsigned int use_cluster_num = cluster_num;
unsigned int use_core_num = core_dim;
if (n >= cluster_num || n >= h_feature) {
partition_ptr->cluster_partition = PARTITION_N;
partition_ptr->n_per_cluster = (n + cluster_num - 1) / cluster_num;
partition_ptr->h_per_cluster = h_feature;
use_cluster_num =
(n + partition_ptr->n_per_cluster - 1) / partition_ptr->n_per_cluster;
} else {
partition_ptr->cluster_partition = PARTITION_H;
partition_ptr->h_per_cluster = (h_feature + cluster_num - 1) / cluster_num;
partition_ptr->n_per_cluster = n;
use_cluster_num = (h_feature + partition_ptr->h_per_cluster - 1) /
partition_ptr->h_per_cluster;
}
if (partition_ptr->n_per_cluster >= core_dim ||
partition_ptr->n_per_cluster >= partition_ptr->h_per_cluster) {
partition_ptr->core_partition = PARTITION_N;
partition_ptr->n_per_core =
(partition_ptr->n_per_cluster + core_dim - 1) / core_dim;
partition_ptr->h_per_core = partition_ptr->h_per_cluster;
use_core_num =
(partition_ptr->n_per_cluster + partition_ptr->n_per_core - 1) /
partition_ptr->n_per_core;
} else {
partition_ptr->core_partition = PARTITION_H;
partition_ptr->h_per_core =
(partition_ptr->h_per_cluster + core_dim - 1) / core_dim;
partition_ptr->n_per_core = partition_ptr->n_per_cluster;
use_core_num =
(partition_ptr->h_per_cluster + partition_ptr->h_per_core - 1) /
partition_ptr->h_per_core;
}
*k_dim_ptr = {core_dim, use_cluster_num, 1};
}
} // namespace
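// Work out how large an n/h/w segment fits into one core's NRAM: prefer whole
// batch items, then whole feature rows, then aligned chunks along w; returns
// false when not even a minimally aligned slice of channels fits.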
bool findLimit(const int shape_core_n, const int shape_core_h,
const int shape_core_w, const int shape_core_ci,
const int shape_core_co, int *limit_n_seg_ptr,
int *limit_h_seg_ptr, int *limit_w_seg_ptr, const int psa_type) {
const bool need_temp = psa_type == 1;
const int input_bytes = sizeof(float);
int limit_n_seg = shape_core_n;
int limit_h_seg = shape_core_h;
int limit_w_seg = shape_core_w;
const int max_nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
const int align_base_128 = NFU_ALIGN_SIZE / input_bytes;
const int align_base_64 = COMPUTE_COUNT_ALIGN / input_bytes;
const int align_co = CEIL_ALIGN(shape_core_co, align_base_64);
const int align_w = CEIL_ALIGN(shape_core_w, align_base_64);
const int align_hw = CEIL_ALIGN(shape_core_h * shape_core_w, align_base_64);
const int max_num = max_nram_size / input_bytes;
int n_limit =
max_num /
(CEIL_ALIGN(shape_core_h * shape_core_w * shape_core_ci, align_base_128) +
align_hw * align_co * (1 + need_temp));
if (n_limit > 0) {
n_limit = std::min(n_limit, shape_core_n);
limit_n_seg = n_limit;
} else {
int h_limit =
max_num / (CEIL_ALIGN(shape_core_w * shape_core_ci, align_base_128) +
align_w * align_co * (1 + need_temp));
if (h_limit > 0) {
h_limit = std::min(h_limit, shape_core_h);
limit_h_seg = h_limit;
limit_n_seg = 1;
} else {
int w_limit =
max_num / (CEIL_ALIGN(shape_core_ci, align_base_128) +
CEIL_ALIGN(align_co, align_base_128) * (1 + need_temp));
if (w_limit > 0 && w_limit >= (COMPUTE_COUNT_ALIGN / input_bytes)) {
w_limit = std::min(w_limit, shape_core_w);
w_limit = w_limit / (COMPUTE_COUNT_ALIGN / input_bytes) *
(COMPUTE_COUNT_ALIGN / input_bytes);
limit_w_seg = w_limit;
limit_h_seg = 1;
limit_n_seg = 1;
} else {
CNLOG(INFO) << "The size of input channel is too large.";
return false;
}
}
}
*limit_n_seg_ptr = limit_n_seg;
*limit_h_seg_ptr = limit_h_seg;
*limit_w_seg_ptr = limit_w_seg;
return true;
}
void PSAMaskForwardMLUKernelLauncher(const int psa_type, const Tensor x,
Tensor y, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask) {
// params check
TORCH_CHECK(x.scalar_type() == at::kFloat, "x type should be Float, got ",
x.scalar_type());
TORCH_CHECK(y.scalar_type() == x.scalar_type(),
"y should have the same type as x");
TORCH_CHECK(x.dim() == 4, "x should be a 4d tensor, got ", x.dim(), "D");
TORCH_CHECK(y.dim() == 4, "y should be a 4d tensor, got ", y.dim(), "D");
int x_c = x.size(1);
int y_c = y.size(1);
TORCH_CHECK(h_mask * w_mask == x_c,
"channel of x should be the same as h_mask * w_mask");
TORCH_CHECK(h_feature * w_feature == y_c,
"channel of y should be the same as h_feature * w_feature");
TORCH_CHECK(psa_type == 0 || psa_type == 1,
"psa_type only suppurts 'COLLECT' and 'DISTRIBUTE' currently");
if (x.numel() == 0) {
CNLOG(INFO) << "skip zero-element tensor";
return;
}
cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
cnrtDim3_t k_dim;
PartitionSeg partition_info;
policyFunc(&k_dim, &k_type, &partition_info, num_, h_feature);
int n_limit_seg, h_limit_seg, w_limit_seg;
bool ret =
findLimit(partition_info.n_per_core, partition_info.h_per_core, w_feature,
x_c, y_c, &n_limit_seg, &h_limit_seg, &w_limit_seg, psa_type);
  if (!ret) {
return;
}
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(x.dim());
auto x_tensor = torch_mlu::cnnl::ops::cnnl_contiguous(x, memory_format);
at::Tensor y_tmp =
at::empty({num_, y_c, h_feature, w_feature}, x.options(), memory_format);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto x_impl = torch_mlu::getMluTensorImpl(x_tensor);
auto x_ptr = x_impl->cnnlMalloc();
auto y_impl = torch_mlu::getMluTensorImpl(y_tmp);
auto y_ptr = y_impl->cnnlMalloc();
KernelPsamaskForward(
k_dim, k_type, queue, x_ptr, y_ptr, (PsamaskType)psa_type,
partition_info.core_partition, partition_info.cluster_partition, num_,
h_feature, w_feature, h_mask, w_mask, x_c, y_c, half_h_mask, half_w_mask,
partition_info.n_per_core, partition_info.h_per_core,
partition_info.n_per_cluster, partition_info.h_per_cluster, n_limit_seg,
h_limit_seg, w_limit_seg);
y.copy_(y_tmp);
}
void PSAMaskBackwardMLUKernelLauncher(const int psa_type, const Tensor dy,
Tensor dx, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask) {
// params check
TORCH_CHECK(dy.scalar_type() == at::kFloat, "dy type should be Float, got ",
dy.scalar_type());
TORCH_CHECK(dx.scalar_type() == dy.scalar_type(),
"dx should have the same type as dy");
TORCH_CHECK(dy.dim() == 4, "dy should be a 4d tensor, got ", dy.dim(), "D");
TORCH_CHECK(dx.dim() == 4, "dx should be a 4d tensor, got ", dx.dim(), "D");
int dy_c = dy.size(1);
int dx_c = dx.size(1);
TORCH_CHECK(h_feature * w_feature == dy_c,
"channel of dy should be the same as h_feature * w_feature");
TORCH_CHECK(h_mask * w_mask == dx_c,
"channel of dx should be the same as h_mask * w_mask");
TORCH_CHECK(psa_type == 0 || psa_type == 1,
"psa_type only suppurts 'COLLECT' and 'DISTRIBUTE' currently");
if (dx.numel() == 0) {
CNLOG(INFO) << "skip zero-element tensor";
return;
}
cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
cnrtDim3_t k_dim;
PartitionSeg partition_info;
policyFunc(&k_dim, &k_type, &partition_info, num_, h_feature);
int n_limit_seg, h_limit_seg, w_limit_seg;
bool ret =
findLimit(partition_info.n_per_core, partition_info.h_per_core, w_feature,
dx_c, dy_c, &n_limit_seg, &h_limit_seg, &w_limit_seg, psa_type);
  if (!ret) {
return;
}
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(dy.dim());
auto dy_tensor = torch_mlu::cnnl::ops::cnnl_contiguous(dy, memory_format);
at::Tensor dx_tmp = at::empty({num_, dx_c, h_feature, w_feature},
dy.options(), memory_format);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto dx_impl = torch_mlu::getMluTensorImpl(dx_tmp);
auto dx_ptr = dx_impl->cnnlMalloc();
auto dy_impl = torch_mlu::getMluTensorImpl(dy_tensor);
auto dy_ptr = dy_impl->cnnlMalloc();
KernelPsamaskBackward(
k_dim, k_type, queue, dy_ptr, dx_ptr, (PsamaskType)psa_type,
partition_info.core_partition, partition_info.cluster_partition, num_,
h_feature, w_feature, h_mask, w_mask, dx_c, dy_c, half_h_mask,
half_w_mask, partition_info.n_per_core, partition_info.h_per_core,
partition_info.n_per_cluster, partition_info.h_per_cluster, n_limit_seg,
h_limit_seg, w_limit_seg);
dx.copy_(dx_tmp);
}
void psamask_forward_mlu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
PSAMaskForwardMLUKernelLauncher(psa_type, input, output, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask,
half_w_mask);
}
void psamask_backward_mlu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
PSAMaskBackwardMLUKernelLauncher(psa_type, grad_output, grad_input, num_,
h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask);
}
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
REGISTER_DEVICE_IMPL(psamask_forward_impl, MLU, psamask_forward_mlu);
REGISTER_DEVICE_IMPL(psamask_backward_impl, MLU, psamask_backward_mlu);
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *input, const void *rois, const int channels,
const bool aligned, const int pooled_height,
const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio,
const float spatial_scale, const int num_rois,
void *output);
void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t dtype,
const void *grads, const void *boxes,
void *grads_image, const int boxes_num,
const int hi, const int wi, const int c,
const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio,
const bool aligned);
void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
// params check
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"input type should be Float or Half, got ", input.scalar_type());
TORCH_CHECK(rois.scalar_type() == input.scalar_type(),
"rois should have the same type as input");
TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
input.dim(), "D");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D");
  TORCH_CHECK(pool_mode == 1, "pool_mode only supports 'avg' currently");
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
auto input_tensor =
torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
auto num_rois = rois.size(0);
auto channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
if (output.numel() == 0) {
output = at::zeros({num_rois, channels, aligned_height, aligned_width},
input.options());
return;
}
at::Tensor output_tmp =
at::empty({num_rois, channels, aligned_height, aligned_width},
input.options(), memory_format);
// get tensor impl
auto self_impl = torch_mlu::getMluTensorImpl(input_tensor);
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto output_impl = torch_mlu::getMluTensorImpl(output_tmp);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get the mlu ptr
auto self_ptr = self_impl->cnnlMalloc();
auto rois_ptr = rois_impl->cnnlMalloc();
auto output_ptr = output_impl->cnnlMalloc();
cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
cnrtDim3_t k_dim;
k_dim.x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim.y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
k_dim.z = 1;
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input.dtype());
KernelRoiAlign(k_dim, k_type, queue, data_type, self_ptr, rois_ptr, channels,
aligned, aligned_height, aligned_width, height, width,
sampling_ratio, spatial_scale, num_rois, output_ptr);
output.copy_(output_tmp);
}
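// Round x up to the next power of two (bit-smearing trick); used below to
// choose how many cores the backward launch requests.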
static int nearestPower2(int x) {
x--;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x++;
return x;
}
void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned) {
// params check
TORCH_CHECK(
grad.scalar_type() == at::kFloat || grad.scalar_type() == at::kHalf,
"grad type should be Float or Half, got ", grad.scalar_type());
TORCH_CHECK(rois.scalar_type() == grad.scalar_type(),
"rois should have the same type as grad");
TORCH_CHECK(grad.dim() == 4, "grad should be a 4d tensor, got ", grad.dim(),
"D");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D");
  TORCH_CHECK(pool_mode == 1, "pool_mode only supports 'avg' currently");
int batch_size = grad_input.size(0);
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(grad.dim());
auto grad_ = torch_mlu::cnnl::ops::cnnl_contiguous(grad, memory_format);
auto grad_input_ = at::empty({batch_size, channels, height, width},
grad.options(), memory_format)
.zero_();
int boxes_num = rois.size(0);
int hi = grad.size(2);
int wi = grad.size(3);
int c = grad.size(1);
int no = grad_input.size(0);
int ho = grad_input.size(2);
int wo = grad_input.size(3);
// get tensor impl
auto grad_impl = torch_mlu::getMluTensorImpl(grad_);
auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get the mlu ptr
auto grad_ptr = grad_impl->cnnlMalloc();
auto rois_ptr = rois_impl->cnnlMalloc();
auto grad_input_ptr = grad_input_impl->cnnlMalloc();
cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
int need_core = nearestPower2(boxes_num);
int union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
uint32_t dim_x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t dim_y = (need_core - 1) / dim_x + 1;
dim_y = (dim_y > union_number) ? union_number : dim_y;
cnrtDim3_t k_dim = {dim_x, dim_y, 1};
cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad.dtype());
KernelRoiAlignBackward(k_dim, k_type, queue, k_dtype, grad_ptr, rois_ptr,
grad_input_ptr, boxes_num, hi, wi, c, no, ho, wo,
spatial_scale, sampling_ratio, aligned);
grad_input.copy_(grad_input_);
}
void roi_align_forward_mlu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardMLUKernelLauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardMLUKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
REGISTER_DEVICE_IMPL(roi_align_forward_impl, MLU, roi_align_forward_mlu);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, MLU, roi_align_backward_mlu);
/*************************************************************************
* Copyright (C) 2022 by Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#include "roi_align_rotated_utils.hpp"
namespace {
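// Launch one UNION1 task per cluster: x spans the cores of a cluster and y
// grows with the number of output bins, capped at the cluster count.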
void policyFunc(int bin_num, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
unsigned int core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_num;
unsigned int use_cluster = (bin_num + core_num - 1) / core_num;
k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;
k_dim->z = 1;
}
} // namespace
void KernelRoiAlignRotatedForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *features, const void *rois,
void *output, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams);
void KernelRoiAlignRotatedBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *top_grad, const void *rois,
void *bottom_grad, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams);
void ROIAlignRotatedForwardMLUKernelLauncher(Tensor input, Tensor rois,
Tensor output, int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
TORCH_CHECK(((input.scalar_type() == output.scalar_type()) &&
(output.scalar_type() == rois.scalar_type())),
"data types of input, rois and output should be the same, ",
"but now input type is ", input.scalar_type(), ", rois type is ",
rois.scalar_type(), ", output type is ", output.scalar_type(),
".");
TORCH_CHECK(
(input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf),
"input type should be Float or Half, got ", input.scalar_type(), ".");
TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
input.dim(), "D.");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D.");
TORCH_CHECK(output.dim() == 4, "output should be a 4d tensor, got ",
output.dim(), "D.");
TORCH_CHECK((rois.size(0) == output.size(0)),
"the 1st dimensions of rois and output should be the same, ",
"but now the 1st dimension of rois is ", rois.size(0),
", and output is ", output.size(0), ".");
TORCH_CHECK((input.size(1) == output.size(1)),
"the 2nd dimensions of input and output should be the same, ",
"but now the 2nd dimension of input is ", input.size(1),
", and output is ", output.size(1), ".");
int channel = input.size(1);
int width = input.size(3);
int height = input.size(2);
int batch = input.size(0);
int rois_nums = rois.size(0);
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
// return if zero-elements
if (input.numel() == 0) {
CNLOG(INFO) << "Skip the zero-elements case.";
return;
}
RoiAlignRotatedParams roiAlignRotatedParams{pooled_height, pooled_width,
sampling_ratio, spatial_scale,
aligned, clockwise};
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(rois_nums * pooled_height * pooled_width, &k_dim, &k_type);
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
auto input_tensor =
torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
at::Tensor output_tmp =
at::empty({batch, channel, pooled_height, pooled_width}, input.options(),
memory_format);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input_tensor);
auto input_ptr = input_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto rois_ptr = rois_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output_tmp);
auto output_ptr = output_impl->cnnlMalloc();
KernelRoiAlignRotatedForward(k_dim, k_type, queue, d_type, input_ptr,
rois_ptr, output_ptr, batch, height, width,
channel, rois_nums, roiAlignRotatedParams);
output.copy_(output_tmp);
}
void ROIAlignRotatedBackwardMLUKernelLauncher(
Tensor top_grad, Tensor rois, Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale, int sampling_ratio, bool aligned,
bool clockwise) {
TORCH_CHECK(((top_grad.scalar_type() == bottom_grad.scalar_type()) &&
(bottom_grad.scalar_type() == rois.scalar_type())),
"data types of top_grad, rois and bottom_grad should be ",
"the same, but now top_grad type is ", top_grad.scalar_type(),
", rois type is ", rois.scalar_type(), ", bottom_grad type is ",
bottom_grad.scalar_type(), ".");
TORCH_CHECK((bottom_grad.scalar_type() == at::kFloat ||
bottom_grad.scalar_type() == at::kHalf),
"Data type of bottom_grad should be Float ro Half, got ",
bottom_grad.scalar_type(), ".");
TORCH_CHECK(bottom_grad.dim() == 4, "bottom_grad should be a 4d tensor, got ",
top_grad.dim(), "D.");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D.");
TORCH_CHECK(top_grad.dim() == 4, "top_grad should be a 4d tensor, got ",
bottom_grad.dim(), "D.");
TORCH_CHECK((rois.size(0) == top_grad.size(0)),
"the 1st dimensions of rois and top_grad should be the same, ",
"but now the 1st dimension of rois is ", rois.size(0),
", and top_grad is ", top_grad.size(0), ".");
TORCH_CHECK((bottom_grad.size(1) == top_grad.size(1)),
"the 2nd dimensions of bottom_grad and top_grad should be ",
"the same, but now the 2nd dimension of bottom_grad is ",
bottom_grad.size(1), ", and top_grad is ", top_grad.size(1), ".");
int channel = bottom_grad.size(1);
int width = bottom_grad.size(3);
int height = bottom_grad.size(2);
int batch = bottom_grad.size(0);
int rois_nums = rois.size(0);
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(bottom_grad.dtype());
// return if zero-elements
if (bottom_grad.numel() == 0) {
CNLOG(INFO) << "Skip the zero-elements case.";
return;
}
RoiAlignRotatedParams roiAlignRotatedParams{pooled_height, pooled_width,
sampling_ratio, spatial_scale,
aligned, clockwise};
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(rois_nums * pooled_height * pooled_width, &k_dim, &k_type);
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(top_grad.dim());
auto top_grad_tensor =
torch_mlu::cnnl::ops::cnnl_contiguous(top_grad, memory_format);
at::Tensor bottom_grad_tmp = at::empty({batch, channel, height, width},
top_grad.options(), memory_format)
.zero_();
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto bottom_grad_impl = torch_mlu::getMluTensorImpl(bottom_grad_tmp);
auto bottom_grad_ptr = bottom_grad_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto rois_ptr = rois_impl->cnnlMalloc();
auto top_grad_impl = torch_mlu::getMluTensorImpl(top_grad_tensor);
auto top_grad_ptr = top_grad_impl->cnnlMalloc();
KernelRoiAlignRotatedBackward(k_dim, k_type, queue, d_type, top_grad_ptr,
rois_ptr, bottom_grad_ptr, batch, height, width,
channel, rois_nums, roiAlignRotatedParams);
bottom_grad.copy_(bottom_grad_tmp);
}
void roi_align_rotated_forward_mlu(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
ROIAlignRotatedForwardMLUKernelLauncher(input, rois, output, aligned_height,
aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_backward_mlu(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
ROIAlignRotatedBackwardMLUKernelLauncher(
top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise);
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, MLU,
roi_align_rotated_forward_mlu);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, MLU,
roi_align_rotated_backward_mlu);
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *input_data, const void *input_rois,
const int batch, const int channels, const int height,
const int width, const int pooled_height,
const int pooled_width, const int rois_num,
const float spatial_scale, void *output_data,
int *argmax);
void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t k_dtype,
const void *grad_output_ptr, const void *rois_ptr,
const int *argmax_ptr, void *grad_input_ptr,
const int box_num, const int pooled_height,
const int pooled_width, const int channels,
const int batch, const int height, const int width,
const float spatial_scale);
// policy function for forward
static void policyFuncForward(const int bin_num, cnrtDim3_t *k_dim,
cnrtFunctionType_t *k_type) {
auto core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_num;
unsigned int use_cluster = bin_num / core_num + (bin_num % core_num > 0);
k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;
k_dim->z = 1;
}
void ROIPoolForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height,
int pooled_width, float spatial_scale) {
// Check dtype.
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"input type should be Float or Half, got ", input.scalar_type());
TORCH_CHECK(input.scalar_type() == rois.scalar_type(),
"rois should have the same type as input");
// Check dtype relationship.
TORCH_CHECK(
argmax.scalar_type() == at::kLong || argmax.scalar_type() == at::kInt,
"argmax type should be Int or Long, got ", argmax.scalar_type());
// Check shape.
TORCH_CHECK(input.dim() == 4, "input should be 4d tensor, got ", input.dim(),
"D");
TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
"D");
TORCH_CHECK(argmax.dim() == 4, "argmax should be 4d tensor, got ",
argmax.dim(), "D");
TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
"spatial_scale should be within (0, 1], got ", spatial_scale);
// compute kernel params
auto batch = input.size(0);
auto height = input.size(2);
auto width = input.size(3);
auto channels = input.size(1);
auto rois_num = output.size(0);
if (output.numel() == 0) {
output = at::zeros({rois_num, channels, pooled_height, pooled_width},
input.options());
return;
}
if (argmax.numel() == 0) {
argmax = at::zeros({rois_num, channels, pooled_height, pooled_width},
argmax.options());
return;
}
// zero element check
if (input.numel() == 0 || rois.numel() == 0 || output.numel() == 0 ||
argmax.numel() == 0) {
return;
}
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
at::Tensor output_ =
at::empty({rois_num, channels, pooled_height, pooled_width},
input.options(), memory_format);
at::Tensor argmax_ =
at::empty({rois_num, channels, pooled_height, pooled_width},
argmax.options(), memory_format);
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFuncForward(rois_num * pooled_height * pooled_width, &k_dim, &k_type);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input_);
auto input_ptr = input_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto rois_ptr = rois_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output_);
auto output_ptr = output_impl->cnnlMalloc();
auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_);
auto argmax_ptr = argmax_impl->cnnlMalloc();
  // get compute dtype of input
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input_.dtype());
// launch kernel
CNLOG(INFO) << "Launch Kernel MLUKernelRoiPoolForward<<<" << k_dim.x << ", "
<< k_dim.y << ", " << k_dim.z << ">>>";
KernelRoiPoolForward(k_dim, k_type, queue, data_type, input_ptr, rois_ptr,
batch, channels, height, width, pooled_height,
pooled_width, rois_num, spatial_scale, output_ptr,
(int *)argmax_ptr);
output.copy_(output_);
argmax.copy_(argmax_);
}
// policy function for backward
static void policyFuncBackward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim->y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
k_dim->z = 1;
}
void ROIPoolBackwardMLUKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale) {
// Check dtype.
TORCH_CHECK(
argmax.scalar_type() == at::kLong || argmax.scalar_type() == at::kInt,
"argmax type should be Int or Long, got ", argmax.scalar_type());
TORCH_CHECK((grad_output.scalar_type() == at::kFloat ||
grad_output.scalar_type() == at::kHalf),
"grad_output type should be FLoat or Half, got ",
grad_output.scalar_type());
// Check dtype relationship.
TORCH_CHECK((rois.scalar_type() == grad_output.scalar_type()),
"rois should have the same type as grad_output");
// Check shape.
TORCH_CHECK(grad_output.dim() == 4, "grad_output should be 4d tensor, got ",
grad_output.dim(), "D");
TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
"D");
TORCH_CHECK(argmax.dim() == 4, "argmax should be 4d tensor, got ",
argmax.dim(), "D");
TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
"spatial_scale should be within (0, 1], got ", spatial_scale);
// Check relationship between tensor.
// Check the relationship of n.
TORCH_CHECK(grad_output.size(0) == rois.size(0),
"grad_output.size(0) = ", grad_output.size(0),
", while rois.size(0) = ", rois.size(0),
". They should be the same.");
// Check the relationship of channels.
TORCH_CHECK(grad_output.size(1) == argmax.size(1),
"grad_output.size(1) = ", grad_output.size(1),
", while argmax.size(1) = ", argmax.size(1),
". They should be the same.");
// Check the relationship of height and width.
TORCH_CHECK(grad_output.size(2) == argmax.size(2),
"argmax.size(2) = ", argmax.size(2),
", while grad_output.size(2) = ", grad_output.size(2),
". They should be the same.");
TORCH_CHECK(grad_output.size(3) == argmax.size(3),
"argmax.size(3) = ", argmax.size(3),
", while grad_output.size(3) = ", grad_output.size(3),
". They should be the same.");
// Check zero element.
if (grad_output.numel() == 0 || rois.numel() == 0 || argmax.numel() == 0 ||
grad_input.numel() == 0) {
// return if zero-element
return;
}
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());
auto grad_output_ =
torch_mlu::cnnl::ops::cnnl_contiguous(grad_output, memory_format);
auto argmax_ = torch_mlu::cnnl::ops::cnnl_contiguous(argmax, memory_format);
int boxes_num = grad_output.size(0);
int no = grad_input.size(0);
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
auto grad_input_ = at::empty({no, channels, height, width},
grad_input.options(), memory_format)
.zero_();
// get tensor impl
auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_);
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_);
auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get mlu ptr
auto grad_output_ptr = grad_output_impl->cnnlMalloc();
auto rois_ptr = rois_impl->cnnlMalloc();
auto argmax_ptr = argmax_impl->cnnlMalloc();
auto grad_input_ptr = grad_input_impl->cnnlMalloc();
// calculate task dimension
cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad_input.dtype());
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFuncBackward(&k_dim, &k_type);
CNLOG(INFO) << "Launch Kernel MLUKernelRoiPoolBackward<<<" << k_dim.x << ", "
<< k_dim.y << ", " << k_dim.z << ">>>";
KernelRoiPoolBackward(k_dim, k_type, queue, k_dtype, grad_output_ptr,
rois_ptr, (int *)argmax_ptr, grad_input_ptr, boxes_num,
pooled_height, pooled_width, channels, no, height,
width, spatial_scale);
grad_input.copy_(grad_input_);
}
void roi_pool_forward_mlu(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale) {
ROIPoolForwardMLUKernelLauncher(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
}
void roi_pool_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
ROIPoolBackwardMLUKernelLauncher(grad_output, rois, argmax, grad_input,
pooled_height, pooled_width, spatial_scale);
}
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale);
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
REGISTER_DEVICE_IMPL(roi_pool_forward_impl, MLU, roi_pool_forward_mlu);
REGISTER_DEVICE_IMPL(roi_pool_backward_impl, MLU, roi_pool_backward_mlu);
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelTinShiftForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const cnrtDataType_t data_dtype, const int channel_per_core,
const int max_number_hw_per_core, const int max_length_per_core);
void KernelTinShiftBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *grad_output, const void *shifts, void *grad_input,
const int batch_size, const int time_size, const int channel_size,
const int hw_size, const int group_size, const int group_channel,
const cnrtDataType_t data_dtype, const int channel_per_core,
const int max_number_hw_per_core, const int max_length_per_core);
// policy function
static void policyFunc(const Tensor &input, cnrtDim3_t *k_dim,
cnrtFunctionType_t *k_type, int *channel_per_core,
int *max_number_hw_per_core, int *max_length_per_core) {
const int32_t cluster_limit = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
const int32_t core_limit = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
const int core_num = core_limit * cluster_limit;
const int batch_size = input.size(0);
const int time_size = input.size(1);
const int channel_size = input.size(2);
const int hw_size = input.size(3);
const size_t size_per_channel = time_size * hw_size * input.itemsize();
*channel_per_core = nram_size / size_per_channel;
int task_dim = 0;
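  // if NRAM cannot hold one full (time, hw) channel, fall back to splitting a
  // channel by hw rows, or by raw elements if even a single row is too large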
if (*channel_per_core == 0) {
const size_t size_per_hw = hw_size * input.itemsize();
*max_number_hw_per_core = nram_size / size_per_hw;
if (*max_number_hw_per_core <= 0) {
*max_length_per_core = nram_size / input.itemsize();
}
int tmp_max_number_hw_per_core =
*max_number_hw_per_core > 0 ? *max_number_hw_per_core : 1;
const int loop_time =
(time_size / (tmp_max_number_hw_per_core)) +
((time_size % (tmp_max_number_hw_per_core)) > 0 ? 1 : 0);
task_dim = batch_size * channel_size * loop_time < core_num
? batch_size * channel_size * loop_time
: core_num;
} else {
task_dim = batch_size * channel_size < core_num ? batch_size * channel_size
: core_num;
}
k_dim->x = core_limit;
k_dim->y = (task_dim / core_limit) > 0 ? (task_dim / core_limit) : 1;
k_dim->z = 1;
*k_type = CNRT_FUNC_TYPE_UNION1;
}
void TINShiftForwardMLUKernelLauncher(Tensor input, Tensor shift,
Tensor output) {
// params check
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"input type should be Float or Half, got ", input.scalar_type(), ".");
TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
input.dim(), "d.");
TORCH_CHECK(shift.dim() == 2, "shift should be a 2d tensor, got ",
shift.dim(), "d.");
TORCH_CHECK(
input.size(0) == shift.size(0),
"input batch size should be the same as shift's, input batch size is ",
input.size(0), " and shift batch size is ", shift.size(0), ".");
TORCH_CHECK(input.size(0) != 0, "Input batch size should not be zero.");
TORCH_CHECK(input.size(3) != 0,
"The last dim size of input should not be zero.");
if (input.size(1) == 0) {
return;
}
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
int channel_per_core = 0;
int max_number_hw_per_core = 0;
int max_length_per_core = 0;
policyFunc(input, &k_dim, &k_type, &channel_per_core, &max_number_hw_per_core,
&max_length_per_core);
const int batch_size = input.size(0);
const int time_size = input.size(1);
const int channel_size = input.size(2);
const int hw_size = input.size(3);
const int group_size = shift.size(1);
int group_channel = channel_size / group_size;
// get tensor impl
auto input_impl = torch_mlu::getMluTensorImpl(input);
auto shift_impl = torch_mlu::getMluTensorImpl(shift);
auto output_impl = torch_mlu::getMluTensorImpl(output);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get the mlu ptr
auto input_ptr = input_impl->cnnlMalloc();
auto shift_ptr = shift_impl->cnnlMalloc();
auto output_ptr = output_impl->cnnlMalloc();
cnrtDataType_t data_dtype = torch_mlu::toCnrtDtype(input.dtype());
KernelTinShiftForward(k_dim, k_type, queue, input_ptr, shift_ptr, output_ptr,
batch_size, time_size, channel_size, hw_size,
group_size, group_channel, data_dtype, channel_per_core,
max_number_hw_per_core, max_length_per_core);
}
void TINShiftBackwardMLUKernelLauncher(Tensor grad_output, Tensor shift,
Tensor grad_input) {
// params check
TORCH_CHECK(grad_output.scalar_type() == at::kFloat ||
grad_output.scalar_type() == at::kHalf,
"grad_output type should be Float or Half, got ",
grad_output.scalar_type(), ".");
TORCH_CHECK(grad_output.dim() == 4, "grad_output should be a 4d tensor, got ",
grad_output.dim(), "d.");
TORCH_CHECK(shift.dim() == 2, "shift should be a 2d tensor, got ",
shift.dim(), "d.");
TORCH_CHECK(grad_output.size(0) == shift.size(0),
"grad_output batch size should be the same as shift's, "
"grad_output batch size is ",
grad_output.size(0), ", shift batch size is ", shift.size(0),
".");
TORCH_CHECK(grad_output.size(0) != 0,
"grad_output batch size should not be zero.");
TORCH_CHECK(grad_output.size(3) != 0,
"The last dim size of grad_output should not be zero.");
if (grad_output.size(1) == 0) {
return;
}
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
int channel_per_core = 0;
int max_number_hw_per_core = 0;
int max_length_per_core = 0;
policyFunc(grad_output, &k_dim, &k_type, &channel_per_core,
&max_number_hw_per_core, &max_length_per_core);
const int batch_size = grad_output.size(0);
const int time_size = grad_output.size(1);
const int channel_size = grad_output.size(2);
const int hw_size = grad_output.size(3);
const int group_size = shift.size(1);
int group_channel = channel_size / group_size;
// get tensor impl
auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output);
auto shift_impl = torch_mlu::getMluTensorImpl(shift);
auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get the mlu ptr
auto grad_output_ptr = grad_output_impl->cnnlMalloc();
auto shift_ptr = shift_impl->cnnlMalloc();
auto grad_input_ptr = grad_input_impl->cnnlMalloc();
cnrtDataType_t data_dtype = torch_mlu::toCnrtDtype(grad_output.dtype());
KernelTinShiftBackward(k_dim, k_type, queue, grad_output_ptr, shift_ptr,
grad_input_ptr, batch_size, time_size, channel_size,
hw_size, group_size, group_channel, data_dtype,
channel_per_core, max_number_hw_per_core,
max_length_per_core);
}
void tin_shift_forward_mlu(Tensor input, Tensor shift, Tensor output) {
TINShiftForwardMLUKernelLauncher(input, shift, output);
}
void tin_shift_backward_mlu(Tensor grad_output, Tensor shift,
Tensor grad_input) {
TINShiftBackwardMLUKernelLauncher(grad_output, shift, grad_input);
}
void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);
void tin_shift_backward_impl(Tensor grad_output, Tensor shift,
Tensor grad_input);
REGISTER_DEVICE_IMPL(tin_shift_forward_impl, MLU, tin_shift_forward_mlu);
REGISTER_DEVICE_IMPL(tin_shift_backward_impl, MLU, tin_shift_backward_mlu);
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "pytorch_device_registry.hpp"
#include "MPSLibrary.h"
#include "MPSStream.h"
#include "MPSUtils.h"
using at::Tensor;
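// Metal source for the overlap kernel: each thread handles one box pair and
// writes interS / baseS, where baseS is the union area (mode == 0) or the
// area of the first box (mode == 1).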
const static std::string kSourceCode = R"(
#include <metal_math>
#include <metal_stdlib>
using namespace metal;
kernel void bbox_overlap_mps_kernel(constant const float4* bboxes1,
constant const float4* bboxes2,
device float* ious,
constant int& num_bbox1,
constant int& num_bbox2,
constant int& mode,
constant bool& aligned,
constant int& offset,
uint index [[thread_position_in_grid]])
{
int base1 = index;
int base2 = index;
if(!aligned){
base1 = index / num_bbox2;
base2 = index % num_bbox2;
}
const float f_offset = float(offset);
const float4 b1 = bboxes1[base1];
const float b1_area = (b1[2]-b1[0]+f_offset)*(b1[3]-b1[1]+f_offset);
const float4 b2 = bboxes2[base2];
const float b2_area = (b2[2]-b2[0]+f_offset)*(b2[3]-b2[1]+f_offset);
const float2 left_top = fmax(b1.xy, b2.xy);
const float2 right_bottom = fmin(b1.zw, b2.zw);
const float2 wh = fmax(right_bottom - left_top + f_offset, 0.0f);
const float interS = wh.x * wh.y;
const float baseS =
fmax(mode == 0 ? b1_area + b2_area - interS : b1_area, f_offset);
ious[index] = interS / baseS;
}
)";
void BBoxOverlapsMPSKernelLauncher(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
// get stream
auto stream = at::mps::getCurrentMPSStream();
auto library_manager = MPSLibraryManager::getInstance();
MPSLibrary* library;
const static std::string kLibraryName = "bbox_overlap";
if (library_manager->hasLibrary(kLibraryName))
library = library_manager->getLibrary(kLibraryName);
else
library = library_manager->createLibraryFromSouce(kLibraryName, kSourceCode);
auto func_pso = library->getComputePipelineState("bbox_overlap_mps_kernel");
// create command buffer and encoder
MTLCommandBuffer_t command_buffer = stream->commandBuffer();
MTLComputeCommandEncoder_t compute_encoder = [command_buffer computeCommandEncoder];
// set pso and buffer
int output_size = ious.numel();
int num_bbox1 = bboxes1.size(0);
int num_bbox2 = bboxes2.size(0);
int num_elements = output_size;
setMTLArgs(compute_encoder, func_pso, bboxes1, bboxes2, ious, num_bbox1, num_bbox2, mode, aligned,
offset);
// set grid size
MTLSize grid_size = MTLSizeMake(num_elements, 1, 1);
NSUInteger thread_group_size_x = func_pso.maxTotalThreadsPerThreadgroup;
if (thread_group_size_x > num_elements) {
thread_group_size_x = num_elements;
}
MTLSize thread_group_size = MTLSizeMake(thread_group_size_x, 1, 1);
// encoding
[compute_encoder dispatchThreads:grid_size threadsPerThreadgroup:thread_group_size];
[compute_encoder endEncoding];
// commit, not sure if flush is required
stream->commit(false);
}
void bbox_overlaps_mps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,
const bool aligned, const int offset) {
BBoxOverlapsMPSKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,
const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MPS, bbox_overlaps_mps);
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
Tensor output, const int rows,
const int cols) {
DISPATCH_DEVICE_IMPL(points_in_polygons_forward_impl, points, polygons,
output, rows, cols);
}
void points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output) {
int rows = points.size(0);
int cols = polygons.size(0);
points_in_polygons_forward_impl(points, polygons, output, rows, cols);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,
pooled_height, pooled_width, spatial_scale);
}
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_backward_impl, grad_output, rois, grad_input,
pooled_height, pooled_width, spatial_scale);
}
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_coor_backward_impl, output, grad_output,
input, rois, grad_rois, pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_forward(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
prroi_pool_forward_impl(input, rois, output, pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale) {
prroi_pool_backward_impl(grad_output, rois, grad_input, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,
Tensor rois, Tensor grad_rois, int pooled_height,
int pooled_width, float spatial_scale) {
prroi_pool_coor_backward_impl(output, grad_output, input, rois, grad_rois,
pooled_height, pooled_width, spatial_scale);
}