Unverified commit c1de4c9b authored by Wenhao Wu, committed by GitHub

[Feature] Add spconv ops from mmdet3d (#1581)



* add ops (spconv) of mmdet3d

* fix typo

* refactor code

* resolve comments in #1452

* fix compile error

* fix bugs

* fix bug

* transform from 'types.h' to 'extension.h'

* fix bug

* transform from 'types.h' to 'extension.h' in parrots

* add extension.h in pybind.cpp

* add unittest

* Recover code

* (1) Remove prettyprint.h
(2) Switch `T` to `scalar_t`
(3) Remove useless lines
(4) Refine example in docstring of sparse_modules.py

* (1) rename from `cu.h` to `cuh`
(2) remove useless files
(3) move cpu files to `pytorch/cpu`

* reorganize files

* Add docstring for sparse_functional.py

* use dispatcher

* remove template

* use dispatch in cuda ops

* resolve Segmentation fault

* remove useless files

* fix lint

* fix lint

* fix lint

* fix unittest in test_build_layers.py

* add tensorview into include_dirs when compiling

* recover all deleted files

* fix lint and comments

* recover setup.py

* replace tv::GPU as tv::TorchGPU & support device guard

* fix lint
Co-authored-by: hdc <hudingchang.vendor@sensetime.com>
Co-authored-by: grimoire <yaoqian@sensetime.com>
parent 33c83b5a
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <utils/spconv/spconv/maxpool.h>
#include "pytorch_cpp_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU &d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
}
}
};
template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU &d, tv::TensorView<const scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const scalar_t> fout,
tv::TensorView<scalar_t> fin,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto foutData = fout.data();
auto finData = fin.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
finData[idxi + plane] += foutData[idxo + plane];
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
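// A minimal standalone sketch of what the CPU functors above compute, using
// plain pointers instead of tv::TensorView (names here are illustrative):
// the forward pass takes a running element-wise max over gathered index
// pairs, and the backward pass routes the gradient to whichever input element
// produced the maximum (the out == in comparison).
template <typename T>
void sparse_maxpool_fwd_ref(T *out, const T *in, const int *idxIn,
                            const int *idxOut, int size, int numPlanes) {
  for (int row = 0; row < size; ++row)
    for (int c = 0; c < numPlanes; ++c) {
      T &o = out[idxOut[row] * numPlanes + c];
      const T i = in[idxIn[row] * numPlanes + c];
      if (o < i) o = i;
    }
}
template <typename T>
void sparse_maxpool_bwd_ref(T *gradIn, const T *gradOut, const T *out,
                            const T *in, const int *idxIn, const int *idxOut,
                            int size, int numPlanes) {
  for (int row = 0; row < size; ++row)
    for (int c = 0; c < numPlanes; ++c)
      if (out[idxOut[row] * numPlanes + c] == in[idxIn[row] * numPlanes + c])
        gradIn[idxIn[row] * numPlanes + c] +=
            gradOut[idxOut[row] * numPlanes + c];
}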
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <utils/spconv/spconv/reordering.h>
#include "pytorch_cpp_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU& d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(scalar_t) * numPlanes);
}
}
};
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU& d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
int numPlanes = outFeatures.dim(1);
const scalar_t* buf = buffer.data();
scalar_t* out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j];
}
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(scalar_t, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, scalar_t, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, scalar_t, Index>;
#define DECLARE_CPU_SPECS(scalar_t) \
DECLARE_CPU_SPECS_T_INDEX(scalar_t, int); \
DECLARE_CPU_SPECS_T_INDEX(scalar_t, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
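// A standalone sketch of the two reordering primitives above (illustrative
// names, plain pointers instead of tv::TensorView): gather copies selected
// rows of features into a dense buffer so a GEMM can run on contiguous
// memory; scatter-add accumulates the GEMM result back into the output rows.
#include <cstring>
template <typename T>
void gather_ref(T *buffer, const T *features, const int *indices, int size,
                int numPlanes) {
  for (int i = 0; i < size; ++i)
    std::memcpy(buffer + i * numPlanes, features + indices[i] * numPlanes,
                sizeof(T) * numPlanes);
}
template <typename T>
void scatter_add_ref(T *out, const T *buffer, const int *indices, int size,
                     int numPlanes) {
  for (int i = 0; i < size; ++i)
    for (int j = 0; j < numPlanes; ++j)
      out[indices[i] * numPlanes + j] += buffer[i * numPlanes + j];
}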
...
@@ -1501,6 +1501,117 @@ void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
REGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, CUDA,
                     points_in_polygons_forward_cuda);
torch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct);
torch::Tensor indice_maxpool_forward_cuda(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct) {
return IndiceMaxpoolForwardCUDAKernelLauncher(features, indicePairs,
indiceNum, numAct);
};
torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct);
REGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, CUDA,
indice_maxpool_forward_cuda);
torch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum);
torch::Tensor indice_maxpool_backward_cuda(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
return IndiceMaxpoolBackwardCUDAKernelLauncher(features, outFeatures, outGrad,
indicePairs, indiceNum);
};
torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum);
REGISTER_DEVICE_IMPL(indice_maxpool_backward_impl, CUDA,
indice_maxpool_backward_cuda)
torch::Tensor IndiceConvForwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
int64_t _subM);
torch::Tensor indice_conv_forward_cuda(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM) {
return IndiceConvForwardCUDAKernelLauncher(
features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM);
};
torch::Tensor indice_conv_forward_impl(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM);
REGISTER_DEVICE_IMPL(indice_conv_forward_impl, CUDA, indice_conv_forward_cuda);
std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM);
std::vector<torch::Tensor> indice_conv_backward_cuda(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
return IndiceConvBackwardCUDAKernelLauncher(
features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);
};
std::vector<torch::Tensor> indice_conv_backward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM);
REGISTER_DEVICE_IMPL(indice_conv_backward_impl, CUDA,
indice_conv_backward_cuda);
torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
torch::Tensor fused_indice_conv_batchnorm_forward_cuda(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return FusedIndiceConvBatchnormCUDAKernelLauncher(features, filters, bias,
indicePairs, indiceNum,
numActOut, _inverse, _subM);
};
torch::Tensor fused_indice_conv_batchnorm_forward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
REGISTER_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl, CUDA,
fused_indice_conv_batchnorm_forward_cuda)
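// How the blocks above fit together: each op pairs a CUDA kernel launcher, a
// thin *_cuda wrapper, a declaration of the device-agnostic *_impl, and a
// REGISTER_DEVICE_IMPL(impl, CUDA, wrapper) call that binds the wrapper to
// that impl at load time. A much-simplified sketch of the idea (not the
// registry mmcv actually uses; all names below are illustrative):
#include <map>
#include <string>
template <typename FnPtr>
struct SimpleRegistry {
  std::map<std::string, FnPtr> table;
  void add(const std::string &device, FnPtr fn) { table[device] = fn; }
  FnPtr find(const std::string &device) const { return table.at(device); }
};
// A registration along the lines of
//   REGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, CUDA,
//                        indice_maxpool_forward_cuda)
// then amounts to something like
//   static SimpleRegistry<decltype(&indice_maxpool_forward_impl)> registry;
//   struct Registerer {
//     Registerer() { registry.add("CUDA", indice_maxpool_forward_cuda); }
//   } static_registerer;
// and the impl dispatches on the input tensor's device key at call time.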
void MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets, Tensor polygons);
void min_area_polygons_cuda(const Tensor pointsets, Tensor polygons) {
...
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) {  // the center index of subM conv doesn't need gather and
             // scatter-add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "FusedIndiceConvBatchnormKernel", [&] {
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, possibly due to the int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob =
torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},
indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
});
}
return output;
}
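// What the launcher above does, per kernel offset k with nHot active pairs:
// gather the matching input rows, multiply them by filters[k] (shape
// Cin x Cout after the view above), and scatter-add the product into the
// matching output rows. The output is pre-filled with bias, so if the batch
// norm has been folded into filters/bias on the Python side (an assumption
// here, not shown in this file), the result is conv + bn in one pass. A dense
// reference of the per-offset step, with plain arrays and illustrative names:
template <typename T>
void conv_offset_ref(T *output, const T *features, const T *filterK,
                     const int *idxIn, const int *idxOut, int nHot, int Cin,
                     int Cout) {
  for (int r = 0; r < nHot; ++r)        // one gathered input row per pair
    for (int co = 0; co < Cout; ++co) {
      T acc = T(0);
      for (int ci = 0; ci < Cin; ++ci)
        acc += features[idxIn[r] * Cin + ci] * filterK[ci * Cout + co];
      output[idxOut[r] * Cout + co] += acc;  // scatter-add into the output row
    }
}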
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <spconv/indice.cuh>
#include <type_traits>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, IndexGrid, NDim> {
Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose) {
Index batchSize = gridsOut.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
if (transpose)
prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
else
prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
return 1;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, IndexGrid, NDim> {
Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
Index batchSize = gridsOut.dim(0);
auto kernelVolume = indicePairs.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
Index numAct = indicePairUnique.dim(0) - 1;
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR();
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR();
if (resetGrid) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
TV_CHECK_CUDA_ERR();
}
return numAct;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::TorchGPU, Index, IndexGrid, NDim> {
Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation,
outSpatialShape);
TV_CHECK_CUDA_ERR();
if (resetGrid) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
numActIn);
TV_CHECK_CUDA_ERR();
}
return numActIn;
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::TorchGPU, Index, \
int, NDIM>; \
template struct functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, \
int, NDIM>; \
template struct functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, \
int, NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::TorchGPU, Index, \
int, NDIM>;
#define DECLARE_GPU_INDEX(Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_GPU_INDEX(int);
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
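// For reference: the index-pair functors above fill, for each of the
// kernelVolume offsets, an (input index, output index) pair per active site
// plus a per-offset count (indiceNum); the gather/GEMM/scatter stage consumes
// exactly these pairs. For a regular (non-subM) convolution the output extent
// per spatial dimension follows the usual formula; a minimal helper
// (illustrative only):
inline int conv_out_size(int in, int kernel, int stride, int padding,
                         int dilation) {
  return (in + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1;
}
// e.g. conv_out_size(41, 3, 2, 1, 1) == 21 for a k=3, stride-2, pad-1 conv.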
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <utils/spconv/spconv/maxpool.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.cuh>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(scalar_t *outFeatures,
const scalar_t *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
scalar_t in, out;
int ILPStrideY[NumILP];
Index idxo, idxi;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in > out) {
outFeatures[idxo] = in;
}
}
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(scalar_t *outFeatures,
const scalar_t *inFeatures,
const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
scalar_t in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
typename VecType>
__global__ void maxPoolFwdVecBlockKernel(scalar_t *outFeatures,
const scalar_t *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
scalar_t bufi[vecloadFactor];
scalar_t bufo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] > bufo[i]) {
bufo[i] = bufi[i];
}
}
reinterpret_cast<VecType *>(outFeatures)[idxo] =
reinterpret_cast<VecType *>(bufo)[0];
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(scalar_t *outFeatures,
const scalar_t *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
scalar_t in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const scalar_t *outFeatures,
const scalar_t *inFeatures,
const scalar_t *fout, scalar_t *fin,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
scalar_t in, out;
Index idxo, idxi;
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
fout += blockIdx.y * NumTLP;
fin += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in == out) {
fin[idxi] += fout[idxo];
}
}
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(
const scalar_t *outFeatures, const scalar_t *inFeatures,
const scalar_t *fout, scalar_t *fin, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
scalar_t in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
fin[RI[ilp] + iy] += fout[RO[ilp] + iy];
}
}
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP,
typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const scalar_t *outFeatures,
const scalar_t *inFeatures,
const scalar_t *fout, scalar_t *fin,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
scalar_t bufi[vecloadFactor];
scalar_t bufo[vecloadFactor];
scalar_t bufdi[vecloadFactor];
scalar_t bufdo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<const VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(fout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(fin)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] == bufo[i]) {
bufdi[i] += bufdo[i];
}
}
reinterpret_cast<VecType *>(fin)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericKernel(const scalar_t *outFeatures,
const scalar_t *inFeatures,
const scalar_t *fout, scalar_t *fin,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
scalar_t in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
fin[RI[ilp] + iy] += fout[RO[ilp] + iy];
}
}
}
}
}
}
namespace functor {
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d,
tv::TensorView<const scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const scalar_t> fout,
tv::TensorView<scalar_t> fin,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &fout, &fin,
&indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
fout.data(), fin.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
fout.data(), fin.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, \
Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU, \
scalar_t, Index>;
#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
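// The vectorized kernels above widen memory transactions by loading features
// through int2 (for half) or int4 (for float/double); vecloadFactor is the
// number of scalars packed into one such vector. A quick check of that
// arithmetic (at::Half is 2 bytes, like __half):
#include <cuda_fp16.h>
#include <vector_types.h>
static_assert(sizeof(int2) / sizeof(__half) == 4, "4 halves per 8-byte load");
static_assert(sizeof(int4) / sizeof(float) == 4, "4 floats per 16-byte load");
static_assert(sizeof(int4) / sizeof(double) == 2, "2 doubles per 16-byte load");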
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/maxpool.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
torch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct) {
at::cuda::CUDAGuard device_guard(features.device());
auto device = features.device().type();
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
double totalTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceMaxpoolForwardKernel", [&] {
if (device == torch::kCPU) {
functor::SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, int>
forwardFtor;
forwardFtor(tv::CPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
} else {
functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, int>
forwardFtor;
forwardFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
TV_CHECK_CUDA_ERR();
}
});
}
return output;
}
torch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
at::cuda::CUDAGuard device_guard(features.device());
auto device = features.device().type();
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
auto kernelVolume = indicePairs.size(0);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceMaxpoolBackwardKernel", [&] {
if (device == torch::kCPU) {
functor::SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, int>
backwardFtor;
backwardFtor(tv::CPU(), tv::torch2tv<const scalar_t>(outFeatures),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
} else {
functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, int>
backwardFtor;
backwardFtor(tv::TorchGPU(),
tv::torch2tv<const scalar_t>(outFeatures),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
TV_CHECK_CUDA_ERR();
}
});
}
return inputGrad;
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/spconv/reordering.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <spconv/reordering.cuh>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.cuh>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
gatherVecKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0) return;
int numPlanes = outFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(scalar_t); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
scatterAddGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(
outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
indices.data() + nHotBlock, size - nHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index) \
template struct functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, Index>; \
template struct functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, \
Index>;
#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
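// Both reordering functors above (and the max-pool ones earlier) share a tile
// selection scheme: probe NumTLP in {64, 32, 16} and take the first value that
// divides numPlanes so the fully-tiled portion can use the vectorized kernels;
// if none divides, fall back to the boundary-checked generic kernel. Boiled
// down (illustrative only):
#include <initializer_list>
inline int pick_tile_width(int numPlanes) {
  for (int tlp : {64, 32, 16})
    if (numPlanes % tlp == 0) return tlp;
  return 0;  // no aligned tile width -> use the generic kernel
}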
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
at::cuda::CUDAGuard device_guard(indices.device());
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1;
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor gridOut =
torch::full({batchSize * outputVolume}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
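// Note on the subM branch above: stride is forced to 1 and padding to
// kernelSize / 2, which (with dilation 1 and odd kernel sizes, the usual subM
// setting) keeps the output spatial extent equal to the input extent, so
// output coordinates coincide with the active input coordinates and indices
// can be returned directly as the output indices. The arithmetic, as a check:
inline int same_size_out(int in, int k) {  // stride = 1, dilation = 1
  return (in + 2 * (k / 2) - (k - 1) - 1) / 1 + 1;  // == in for odd k
}
// e.g. same_size_out(100, 3) == 100 and same_size_out(100, 5) == 100.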
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
at::cuda::CUDAGuard device_guard(indices.device());
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1;
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
gridOut.fill_(-1);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, true);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose, true);
gridOut.fill_(-1);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
true);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
torch::Tensor IndiceConvForwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) {
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceConvForwardKernel", [&] {
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, possibly due to the int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob =
torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},
indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
});
}
return output;
}
std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceConvBackwardKernel", [&] {
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(
tv::CPU(), tv::torch2tv<scalar_t>(outputBuffer),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(
tv::TorchGPU(), tv::torch2tv<scalar_t>(outputBuffer),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const scalar_t>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const scalar_t>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
TV_CHECK_CUDA_ERR();
}
});
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<2>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<3>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<4>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<2>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<3>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor fused_indice_conv_batchnorm_forward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return DISPATCH_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl,
features, filters, bias, indicePairs, indiceNum,
numActOut, _inverse, _subM);
}
torch::Tensor fused_indice_conv_batchnorm_forward(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return fused_indice_conv_batchnorm_forward_impl(features, filters, bias,
indicePairs, indiceNum,
numActOut, _inverse, _subM);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <torch/extension.h>
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
std::string get_compiler_version(); std::string get_compiler_version();
...@@ -240,6 +242,43 @@ void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor, ...@@ -240,6 +242,43 @@ void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
Tensor idx_tensor, int b, int n, int m, Tensor idx_tensor, int b, int n, int m,
float min_radius, float max_radius, int nsample); float min_radius, float max_radius, int nsample);
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template <unsigned NDim>
std::vector<Tensor> get_indice_pairs_backward(
Tensor indices, Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
Tensor indice_conv_forward(Tensor features, Tensor filters, Tensor indicePairs,
Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
std::vector<Tensor> indice_conv_backward(Tensor features, Tensor filters,
Tensor outGrad, Tensor indicePairs,
Tensor indiceNum, int64_t _inverse,
int64_t _subM);
Tensor fused_indice_conv_batchnorm_forward(Tensor features, Tensor filters,
Tensor bias, Tensor indicePairs,
Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
Tensor indice_maxpool_forward(Tensor features, Tensor indicePairs,
Tensor indiceNum, int64_t numAct);
Tensor indice_maxpool_backward(Tensor features, Tensor outFeatures,
Tensor outGrad, Tensor indicePairs,
Tensor indiceNum);
Tensor bottom_pool_forward(Tensor input);
Tensor bottom_pool_backward(Tensor input, Tensor grad_output);
...@@ -602,6 +641,54 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
        "sync_bn backward_data", py::arg("grad_output"), py::arg("weight"),
        py::arg("grad_weight"), py::arg("grad_bias"), py::arg("norm"),
        py::arg("std"), py::arg("grad_input"));
m.def("get_indice_pairs_2d_forward", &get_indice_pairs_forward<2>,
"get_indice_pairs_2d_forward", py::arg("indices"), py::arg("batchSize"),
py::arg("outSpatialShape"), py::arg("spatialShape"),
py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
py::arg("_transpose"));
m.def("get_indice_pairs_3d_forward", &get_indice_pairs_forward<3>,
"get_indice_pairs_3d_forward", py::arg("indices"), py::arg("batchSize"),
py::arg("outSpatialShape"), py::arg("spatialShape"),
py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
py::arg("_transpose"));
m.def("get_indice_pairs_4d_forward", &get_indice_pairs_forward<4>,
"get_indice_pairs_4d_forward", py::arg("indices"), py::arg("batchSize"),
py::arg("outSpatialShape"), py::arg("spatialShape"),
py::arg("kernelSize"), py::arg("stride"), py::arg("padding"),
py::arg("dilation"), py::arg("outPadding"), py::arg("_subM"),
py::arg("_transpose"));
m.def("get_indice_pairs_2d_backward", &get_indice_pairs_backward<2>,
"get_indice_pairs_2d_backward", py::arg("indices"), py::arg("gridOut"),
py::arg("batchSize"), py::arg("outSpatialShape"),
py::arg("spatialShape"), py::arg("kernelSize"), py::arg("stride"),
py::arg("padding"), py::arg("dilation"), py::arg("outPadding"),
py::arg("_subM"), py::arg("_transpose"));
m.def("get_indice_pairs_3d_backward", &get_indice_pairs_backward<3>,
"get_indice_pairs_3d_backward", py::arg("indices"), py::arg("gridOut"),
py::arg("batchSize"), py::arg("outSpatialShape"),
py::arg("spatialShape"), py::arg("kernelSize"), py::arg("stride"),
py::arg("padding"), py::arg("dilation"), py::arg("outPadding"),
py::arg("_subM"), py::arg("_transpose"));
m.def("indice_conv_forward", &indice_conv_forward, "indice_conv_forward",
py::arg("features"), py::arg("filters"), py::arg("indicePairs"),
py::arg("indiceNum"), py::arg("numActOut"), py::arg("_inverse"),
py::arg("_subM"));
m.def("indice_conv_backward", &indice_conv_backward, "indice_conv_backward",
py::arg("features"), py::arg("filters"), py::arg("outGrad"),
py::arg("indicePairs"), py::arg("indiceNum"), py::arg("_inverse"),
py::arg("_subM"));
m.def("fused_indice_conv_forward", &fused_indice_conv_batchnorm_forward,
"fused_indice_conv_forward", py::arg("features"), py::arg("filters"),
py::arg("bias"), py::arg("indicePairs"), py::arg("indiceNum"),
py::arg("numActOut"), py::arg("_inverse"), py::arg("_subM"));
m.def("indice_maxpool_forward", &indice_maxpool_forward,
"indice_maxpool_forward", py::arg("features"), py::arg("indicePairs"),
py::arg("indiceNum"), py::arg("numAct"));
m.def("indice_maxpool_backward", &indice_maxpool_backward,
"indice_maxpool_backward", py::arg("features"), py::arg("outFeatures"),
py::arg("outGrad"), py::arg("indicePairs"), py::arg("indiceNum"));
m.def("psamask_forward", &psamask_forward, "PSAMASK forward (CPU/CUDA)", m.def("psamask_forward", &psamask_forward, "PSAMASK forward (CPU/CUDA)",
py::arg("input"), py::arg("output"), py::arg("psa_type"), py::arg("input"), py::arg("output"), py::arg("psa_type"),
py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"), py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"),
......
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct) {
return DISPATCH_DEVICE_IMPL(indice_maxpool_forward_impl, features,
indicePairs, indiceNum, numAct);
}
torch::Tensor indice_maxpool_forward(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numAct) {
return indice_maxpool_forward_impl(features, indicePairs, indiceNum, numAct);
}
torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
return DISPATCH_DEVICE_IMPL(indice_maxpool_backward_impl, features,
outFeatures, outGrad, indicePairs, indiceNum);
}
torch::Tensor indice_maxpool_backward(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
return indice_maxpool_backward_impl(features, outFeatures, outGrad,
indicePairs, indiceNum);
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward_cuda(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
return GetIndicePairsForwardCUDAKernelLauncher<NDim>(
indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
padding, dilation, outPadding, _subM, _transpose);
};
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_backward_cuda(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
return GetIndicePairsBackwardCUDAKernelLauncher<NDim>(
indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,
stride, padding, dilation, outPadding, _subM, _transpose);
};
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_forward(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
if (indices.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(indices);
return get_indice_pairs_forward_cuda<NDim>(
indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,
padding, dilation, outPadding, _subM, _transpose);
#else
AT_ERROR("get_indice_pairs is not compiled with GPU support");
#endif
} else {
AT_ERROR("get_indice_pairs is not implemented on CPU");
}
}
template <unsigned NDim>
std::vector<torch::Tensor> get_indice_pairs_backward(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
if (indices.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(indices);
CHECK_CUDA_INPUT(gridOut);
return get_indice_pairs_backward_cuda<NDim>(
indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,
stride, padding, dilation, outPadding, _subM, _transpose);
#else
AT_ERROR("get_indice_pairs is not compiled with GPU support");
#endif
} else {
AT_ERROR("get_indice_pairs is not implemented on CPU");
}
}
torch::Tensor indice_conv_forward_impl(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM) {
return DISPATCH_DEVICE_IMPL(indice_conv_forward_impl, features, filters,
indicePairs, indiceNum, numActOut, _inverse,
_subM);
}
torch::Tensor indice_conv_forward(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return indice_conv_forward_impl(features, filters, indicePairs, indiceNum,
numActOut, _inverse, _subM);
}
std::vector<torch::Tensor> indice_conv_backward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
return DISPATCH_DEVICE_IMPL(indice_conv_backward_impl, features, filters,
outGrad, indicePairs, indiceNum, _inverse, _subM);
}
std::vector<torch::Tensor> indice_conv_backward(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
return indice_conv_backward_impl(features, filters, outGrad, indicePairs,
indiceNum, _inverse, _subM);
}
template std::vector<torch::Tensor> get_indice_pairs_forward<2>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_forward<3>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_forward<4>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_backward<2>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> get_indice_pairs_backward<3>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/script.h>
#include <utils/spconv/tensorview/tensorview.h>
#include "pytorch_cuda_helper.hpp"
namespace tv {
struct GPU {
GPU(cudaStream_t s = 0) : mStream(s) {}
virtual cudaStream_t getStream() const { return mStream; }
cudaStream_t mStream = 0;
};
struct TorchGPU : public tv::GPU {
virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream();
}
};
template <typename scalar_t>
void check_torch_dtype(const torch::Tensor &tensor) {
switch (tensor.type().scalarType()) {
case at::ScalarType::Double: {
auto val = std::is_same<std::remove_const_t<scalar_t>, double>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Float: {
auto val = std::is_same<std::remove_const_t<scalar_t>, float>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Int: {
auto val = std::is_same<std::remove_const_t<scalar_t>, int>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Half: {
auto val = std::is_same<std::remove_const_t<scalar_t>, at::Half>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Long: {
auto val = std::is_same<std::remove_const_t<scalar_t>, long>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
default:
TV_ASSERT_RT_ERR(false, "error");
}
}
template <typename scalar_t>
tv::TensorView<scalar_t> torch2tv(const torch::Tensor &tensor) {
check_torch_dtype<scalar_t>(tensor);
tv::Shape shape;
for (auto i : tensor.sizes()) {
shape.push_back(i);
}
return tv::TensorView<scalar_t>(
tensor.data_ptr<std::remove_const_t<scalar_t>>(), shape);
}
} // namespace tv
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import torch
from torch.nn import init
from torch.nn.parameter import Parameter
from ..cnn import CONV_LAYERS
from . import sparse_functional as Fsp
from . import sparse_ops as ops
from .sparse_modules import SparseModule
from .sparse_structure import SparseConvTensor
def _calculate_fan_in_and_fan_out_hwio(tensor):
dimensions = tensor.ndimension()
if dimensions < 2:
        raise ValueError('fan in and fan out can not be computed for tensor '
                         'with fewer than 2 dimensions')
if dimensions == 2: # Linear
fan_in = tensor.size(-2)
fan_out = tensor.size(-1)
else:
num_input_fmaps = tensor.size(-2)
num_output_fmaps = tensor.size(-1)
receptive_field_size = 1
if tensor.dim() > 2:
receptive_field_size = tensor[..., 0, 0].numel()
fan_in = num_input_fmaps * receptive_field_size
fan_out = num_output_fmaps * receptive_field_size
return fan_in, fan_out
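# Worked example (illustrative, not part of the original file): for a
# SubMConv3d weight of shape (3, 3, 3, 4, 16), receptive_field_size is
# 3 * 3 * 3 = 27, so fan_in = 4 * 27 = 108 and fan_out = 16 * 27 = 432.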
class SparseConvolution(SparseModule):
def __init__(self,
ndim,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
subm=False,
output_padding=0,
transposed=False,
inverse=False,
indice_key=None,
fused_bn=False):
super(SparseConvolution, self).__init__()
assert groups == 1
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(output_padding, (list, tuple)):
output_padding = [output_padding] * ndim
for d, s in zip(dilation, stride):
            assert any([s == 1, d == 1]), \
                'stride > 1 and dilation > 1 are not supported together'
self.ndim = ndim
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.conv1x1 = np.prod(kernel_size) == 1
self.stride = stride
self.padding = padding
self.dilation = dilation
self.transposed = transposed
self.inverse = inverse
self.output_padding = output_padding
self.groups = groups
self.subm = subm
self.indice_key = indice_key
self.fused_bn = fused_bn
self.weight = Parameter(
torch.Tensor(*kernel_size, in_channels, out_channels))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def forward(self, input):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
spatial_shape = input.spatial_shape
batch_size = input.batch_size
if not self.subm:
if self.transposed:
out_spatial_shape = ops.get_deconv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation, self.output_padding)
else:
out_spatial_shape = ops.get_conv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation)
else:
out_spatial_shape = spatial_shape
if self.conv1x1:
features = torch.mm(
input.features,
self.weight.view(self.in_channels, self.out_channels))
if self.bias is not None:
features += self.bias
out_tensor = SparseConvTensor(features, input.indices,
input.spatial_shape,
input.batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
data = input.find_indice_pair(self.indice_key)
if self.inverse:
assert data is not None and self.indice_key is not None
_, outids, indice_pairs, indice_pair_num, out_spatial_shape = data
assert indice_pairs.shape[0] == np.prod(
self.kernel_size
            ), 'inverse conv must have the same kernel size as its paired conv'
else:
if self.indice_key is not None and data is not None:
outids, _, indice_pairs, indice_pair_num, _ = data
else:
outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
indices,
batch_size,
spatial_shape,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
self.output_padding,
self.subm,
self.transposed,
grid=input.grid)
input.indice_dict[self.indice_key] = (outids, indices,
indice_pairs,
indice_pair_num,
spatial_shape)
if self.fused_bn:
assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight,
self.bias,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0], self.inverse,
self.subm)
else:
if self.subm:
out_features = Fsp.indice_subm_conv(features, self.weight,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0])
else:
if self.inverse:
out_features = Fsp.indice_inverse_conv(
features, self.weight, indice_pairs.to(device),
indice_pair_num, outids.shape[0])
else:
out_features = Fsp.indice_conv(features, self.weight,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0])
if self.bias is not None:
out_features += self.bias
out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
@CONV_LAYERS.register_module()
class SparseConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseConv4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv4d, self).__init__(
4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseConvTranspose2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConvTranspose2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseConvTranspose3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConvTranspose3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseInverseConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
indice_key=None,
bias=True):
super(SparseInverseConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
bias=bias,
inverse=True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SparseInverseConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
indice_key=None,
bias=True):
super(SparseInverseConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
bias=bias,
inverse=True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SubMConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SubMConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
@CONV_LAYERS.register_module()
class SubMConv4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv4d, self).__init__(
4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
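# A minimal usage sketch (illustrative only, not part of the original file).
# It assumes a CUDA build of the spconv extension and uses the
# (batch_idx, z, y, x) coordinate convention expected by SparseConvTensor.
def _subm_conv3d_sketch():
    coords = torch.tensor([[0, 0, 0, 0], [0, 1, 2, 3], [0, 4, 5, 6]],
                          dtype=torch.int32, device='cuda')  # (N, 1 + ndim)
    features = torch.randn(3, 4, device='cuda')              # (N, in_channels)
    x = SparseConvTensor(features, coords, spatial_shape=[10, 10, 10],
                         batch_size=1)
    conv = SubMConv3d(4, 16, kernel_size=3, indice_key='subm1').cuda()
    return conv(x)  # SparseConvTensor with 16-channel features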
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from torch.autograd import Function
from . import sparse_ops as ops
class SparseConvFunction(Function):
"""Sparse Convolution.
Please refer to `SECOND <https://www.mdpi.com/1424-8220/18/10/3337>`_ for
more details.
"""
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
"""
Args:
            features (torch.Tensor): Features to be convolved.
            filters (torch.nn.parameter.Parameter): Convolution filters.
            indice_pairs (torch.Tensor): Indice pairs between input locations
                and output locations.
            indice_pair_num (torch.Tensor): Number of valid indice pairs for
                each kernel offset.
            num_activate_out (int): Number of active output features.
Returns:
torch.Tensor: Output features from gather-gemm-scatter.
"""
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False)
return input_bp, filters_bp, None, None, None
class SparseInverseConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
"""
Args:
            features (torch.Tensor): Features to be convolved.
            filters (torch.nn.parameter.Parameter): Convolution filters.
            indice_pairs (torch.Tensor): Indice pairs between input locations
                and output locations.
            indice_pair_num (torch.Tensor): Number of valid indice pairs for
                each kernel offset.
            num_activate_out (int): Number of active output features.
Returns:
torch.Tensor: Output features from gather-gemm-scatter.
"""
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, True, False)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
True, False)
return input_bp, filters_bp, None, None, None
class SubMConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
"""
Args:
            features (torch.Tensor): Features to be convolved.
            filters (torch.nn.parameter.Parameter): Convolution filters.
            indice_pairs (torch.Tensor): Indice pairs between input locations
                and output locations.
            indice_pair_num (torch.Tensor): Number of valid indice pairs for
                each kernel offset.
            num_activate_out (int): Number of active output features.
Returns:
torch.Tensor: Output features from gather-gemm-scatter.
"""
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False, True)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False, True)
return input_bp, filters_bp, None, None, None
class SparseMaxPoolFunction(Function):
@staticmethod
def forward(ctx, features, indice_pairs, indice_pair_num,
num_activate_out):
"""
Args:
            features (torch.Tensor): Features to be pooled.
            indice_pairs (torch.Tensor): Indice pairs between input locations
                and output locations.
            indice_pair_num (torch.Tensor): Number of valid indice pairs for
                each kernel offset.
            num_activate_out (int): Number of active output features.
Returns:
torch.Tensor: Output features from sparse maxpooling.
"""
out = ops.indice_maxpool(features, indice_pairs, indice_pair_num,
num_activate_out)
ctx.save_for_backward(indice_pairs, indice_pair_num, features, out)
return out
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, out = ctx.saved_tensors
input_bp = ops.indice_maxpool_backward(features, out, grad_output,
indice_pairs, indice_pair_num)
return input_bp, None, None, None
indice_conv = SparseConvFunction.apply
indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply
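# A minimal autograd sketch (illustrative only, not part of the original file).
# It assumes `indice_pairs` / `indice_pair_num` were precomputed with
# sparse_ops.get_indice_pairs and that the compiled _ext ops are available.
def _indice_conv_autograd_sketch(features, filters, indice_pairs,
                                 indice_pair_num, num_activate_out):
    # features: (num_act_in, C_in); filters: (*kernel_size, C_in, C_out).
    features = features.detach().requires_grad_()
    out = indice_conv(features, filters, indice_pairs, indice_pair_num,
                      num_activate_out)
    # Gradients flow back through SparseConvFunction.backward, i.e. the
    # gather-GEMM-scatter of ops.indice_conv_backward.
    out.sum().backward()
    return features.grad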
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from collections import OrderedDict
import torch
from torch import nn
from .sparse_structure import SparseConvTensor
def is_spconv_module(module):
spconv_modules = (SparseModule, )
return isinstance(module, spconv_modules)
def is_sparse_conv(module):
from .sparse_conv import SparseConvolution
return isinstance(module, SparseConvolution)
def _mean_update(vals, m_vals, t):
outputs = []
if not isinstance(vals, list):
vals = [vals]
if not isinstance(m_vals, list):
m_vals = [m_vals]
for val, m_val in zip(vals, m_vals):
output = t / float(t + 1) * m_val + 1 / float(t + 1) * val
outputs.append(output)
if len(outputs) == 1:
outputs = outputs[0]
return outputs
class SparseModule(nn.Module):
"""place holder, All module subclass from this will take sptensor in
SparseSequential."""
pass
class SparseSequential(SparseModule):
r"""A sequential container.
Modules will be added to it in the order they are passed in the
constructor.
Alternatively, an ordered dict of modules can also be passed in.
    To make it easier to understand, here is a small example::
Example:
>>> # using Sequential:
>>> from mmcv.ops import SparseSequential
>>> model = SparseSequential(
SparseConv2d(1,20,5),
nn.ReLU(),
SparseConv2d(20,64,5),
nn.ReLU()
)
>>> # using Sequential with OrderedDict
>>> model = SparseSequential(OrderedDict([
('conv1', SparseConv2d(1,20,5)),
('relu1', nn.ReLU()),
('conv2', SparseConv2d(20,64,5)),
('relu2', nn.ReLU())
]))
>>> # using Sequential with kwargs(python 3.6+)
>>> model = SparseSequential(
conv1=SparseConv2d(1,20,5),
relu1=nn.ReLU(),
conv2=SparseConv2d(20,64,5),
relu2=nn.ReLU()
)
"""
def __init__(self, *args, **kwargs):
super(SparseSequential, self).__init__()
if len(args) == 1 and isinstance(args[0], OrderedDict):
for key, module in args[0].items():
self.add_module(key, module)
else:
for idx, module in enumerate(args):
self.add_module(str(idx), module)
for name, module in kwargs.items():
if sys.version_info < (3, 6):
raise ValueError('kwargs only supported in py36+')
if name in self._modules:
raise ValueError('name exists.')
self.add_module(name, module)
self._sparity_dict = {}
def __getitem__(self, idx):
if not (-len(self) <= idx < len(self)):
raise IndexError('index {} is out of range'.format(idx))
if idx < 0:
idx += len(self)
it = iter(self._modules.values())
for i in range(idx):
next(it)
return next(it)
def __len__(self):
return len(self._modules)
@property
def sparity_dict(self):
return self._sparity_dict
def add(self, module, name=None):
if name is None:
name = str(len(self._modules))
if name in self._modules:
raise KeyError('name exists')
self.add_module(name, module)
def forward(self, input):
for k, module in self._modules.items():
if is_spconv_module(module):
assert isinstance(input, SparseConvTensor)
self._sparity_dict[k] = input.sparity
input = module(input)
else:
if isinstance(input, SparseConvTensor):
if input.indices.shape[0] != 0:
input.features = module(input.features)
else:
input = module(input)
return input
def fused(self):
from .sparse_conv import SparseConvolution
mods = [v for k, v in self._modules.items()]
fused_mods = []
idx = 0
while idx < len(mods):
if is_sparse_conv(mods[idx]):
if idx < len(mods) - 1 and isinstance(mods[idx + 1],
nn.BatchNorm1d):
new_module = SparseConvolution(
ndim=mods[idx].ndim,
in_channels=mods[idx].in_channels,
out_channels=mods[idx].out_channels,
kernel_size=mods[idx].kernel_size,
stride=mods[idx].stride,
padding=mods[idx].padding,
dilation=mods[idx].dilation,
groups=mods[idx].groups,
bias=True,
subm=mods[idx].subm,
output_padding=mods[idx].output_padding,
transposed=mods[idx].transposed,
inverse=mods[idx].inverse,
indice_key=mods[idx].indice_key,
fused_bn=True,
)
new_module.load_state_dict(mods[idx].state_dict(), False)
new_module.to(mods[idx].weight.device)
conv = new_module
bn = mods[idx + 1]
conv.bias.data.zero_()
                # Standard BN folding: w' = w * gamma / sqrt(var + eps),
                # b' = (b - mean) * gamma / sqrt(var + eps) + beta.
                conv.weight.data[:] = conv.weight.data * bn.weight.data / (
                    torch.sqrt(bn.running_var + bn.eps))
                conv.bias.data[:] = (
                    conv.bias.data - bn.running_mean) * bn.weight.data / (
                        torch.sqrt(bn.running_var + bn.eps)) + bn.bias.data
fused_mods.append(conv)
idx += 2
else:
fused_mods.append(mods[idx])
idx += 1
else:
fused_mods.append(mods[idx])
idx += 1
return SparseSequential(*fused_mods)
class ToDense(SparseModule):
"""convert SparseConvTensor to NCHW dense tensor."""
def forward(self, x: SparseConvTensor):
return x.dense()
class RemoveGrid(SparseModule):
"""remove pre-allocated grid buffer."""
def forward(self, x: SparseConvTensor):
x.grid = None
return x
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'get_indice_pairs_2d_forward', 'get_indice_pairs_3d_forward',
'get_indice_pairs_4d_forward', 'get_indice_pairs_2d_backward',
'get_indice_pairs_3d_backward', 'indice_conv_forward',
'indice_conv_backward', 'fused_indice_conv_forward',
'indice_maxpool_forward', 'indice_maxpool_backward'
])
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size)
output_size = []
for i in range(ndim):
size = (input_size[i] + 2 * padding[i] - dilation[i] *
(kernel_size[i] - 1) - 1) // stride[i] + 1
if kernel_size[i] == -1:
output_size.append(1)
else:
output_size.append(size)
return output_size
def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
output_padding):
ndim = len(input_size)
output_size = []
for i in range(ndim):
if kernel_size[i] == -1:
raise ValueError("deconv don't support kernel_size < 0")
size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[
i] + output_padding[i]
output_size.append(size)
return output_size
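# Worked example (illustrative, not part of the original file): for a 3D conv
# with kernel 3, stride 2, padding 1, dilation 1 on spatial_shape
# [41, 1600, 1408], the formula above gives (41 + 2 - 2 - 1) // 2 + 1 = 21,
# (1600 + 2 - 2 - 1) // 2 + 1 = 800 and (1408 + 2 - 2 - 1) // 2 + 1 = 704,
# i.e. get_conv_output_size([41, 1600, 1408], [3] * 3, [2] * 3, [1] * 3,
# [1] * 3) == [21, 800, 704].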
def get_indice_pairs(indices,
batch_size,
spatial_shape,
ksize=3,
stride=1,
padding=0,
dilation=1,
out_padding=0,
subm=False,
transpose=False,
grid=None):
ndim = indices.shape[1] - 1
if not isinstance(ksize, (list, tuple)):
ksize = [ksize] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(out_padding, (list, tuple)):
out_padding = [out_padding] * ndim
for d, s in zip(dilation, stride):
        assert any([s == 1, d == 1]), \
            'stride > 1 and dilation > 1 are not supported together'
if not subm:
if transpose:
out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
padding, dilation, out_padding)
else:
out_shape = get_conv_output_size(spatial_shape, ksize, stride,
padding, dilation)
else:
out_shape = spatial_shape
if grid is None:
if ndim == 2:
get_indice_pairs_func = ext_module.get_indice_pairs_2d_forward
elif ndim == 3:
get_indice_pairs_func = ext_module.get_indice_pairs_3d_forward
elif ndim == 4:
get_indice_pairs_func = ext_module.get_indice_pairs_4d_forward
else:
raise NotImplementedError
return get_indice_pairs_func(indices, batch_size, out_shape,
spatial_shape, ksize, stride, padding,
dilation, out_padding, int(subm),
int(transpose))
else:
if ndim == 2:
get_indice_pairs_func = ext_module.get_indice_pairs_2d_backward
elif ndim == 3:
get_indice_pairs_func = ext_module.get_indice_pairs_3d_backward
else:
raise NotImplementedError
return get_indice_pairs_func(indices, grid, batch_size, out_shape,
spatial_shape, ksize, stride, padding,
dilation, out_padding, int(subm),
int(transpose))
def indice_conv(features,
filters,
indice_pairs,
indice_pair_num,
num_activate_out,
inverse=False,
subm=False):
if filters.dtype == torch.float32 or filters.dtype == torch.half:
return ext_module.indice_conv_forward(features, filters, indice_pairs,
indice_pair_num,
num_activate_out, int(inverse),
int(subm))
else:
raise NotImplementedError
def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
num_activate_out, inverse, subm):
    if features.dtype == torch.half or filters.dtype == torch.float32:
func = ext_module.fused_indice_conv_forward
else:
raise NotImplementedError
return func(features, filters, bias, indice_pairs, indice_pair_num,
num_activate_out, int(inverse), int(subm))
def indice_conv_backward(features,
filters,
out_bp,
indice_pairs,
indice_pair_num,
inverse=False,
subm=False):
if filters.dtype == torch.float32 or filters.dtype == torch.half:
return ext_module.indice_conv_backward(features, filters, out_bp,
indice_pairs, indice_pair_num,
int(inverse), int(subm))
else:
raise NotImplementedError
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
if features.dtype == torch.float32 or features.dtype == torch.half:
return ext_module.indice_maxpool_forward(features, indice_pairs,
indice_pair_num,
num_activate_out)
else:
raise NotImplementedError
def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
indice_pair_num):
if features.dtype == torch.float32 or features.dtype == torch.half:
return ext_module.indice_maxpool_backward(features, out_features,
out_bp, indice_pairs,
indice_pair_num)
else:
raise NotImplementedError
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# import sparse_functional as Fsp
# import sparse_ops as ops
from .sparse_functional import indice_maxpool
from .sparse_modules import SparseModule
from .sparse_ops import get_conv_output_size, get_indice_pairs
from .sparse_structure import SparseConvTensor
class SparseMaxPool(SparseModule):
def __init__(self,
ndim,
kernel_size,
stride=1,
padding=0,
dilation=1,
subm=False):
super(SparseMaxPool, self).__init__()
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
self.ndim = ndim
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.subm = subm
self.dilation = dilation
def forward(self, input):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
spatial_shape = input.spatial_shape
batch_size = input.batch_size
if not self.subm:
out_spatial_shape = get_conv_output_size(spatial_shape,
self.kernel_size,
self.stride, self.padding,
self.dilation)
else:
out_spatial_shape = spatial_shape
outids, indice_pairs, indice_pairs_num = get_indice_pairs(
indices, batch_size, spatial_shape, self.kernel_size, self.stride,
self.padding, self.dilation, 0, self.subm)
out_features = indice_maxpool(features, indice_pairs.to(device),
indice_pairs_num.to(device),
outids.shape[0])
out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
class SparseMaxPool2d(SparseMaxPool):
def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
super(SparseMaxPool2d, self).__init__(2, kernel_size, stride, padding,
dilation)
class SparseMaxPool3d(SparseMaxPool):
def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
super(SparseMaxPool3d, self).__init__(3, kernel_size, stride, padding,
dilation)
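# A minimal usage sketch (illustrative only, not part of the original file);
# it assumes `x` is a SparseConvTensor with three spatial dims and that the
# compiled _ext ops are available.
def _sparse_maxpool3d_sketch(x):
    # Kernel 2 / stride 2 halves the spatial shape; features mapping to the
    # same output location are max-reduced by the indice_maxpool op.
    pool = SparseMaxPool3d(kernel_size=2, stride=2)
    return pool(x)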
import numpy as np
import torch
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully when
indice repeats, don't support repeat add which is supported in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
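# A small self-contained example (illustrative only, not part of the original
# file): scatter two feature rows into a 4 x 4 x 2 dense grid.
def _scatter_nd_example():
    indices = torch.tensor([[0, 1], [2, 3]])      # (N, ndim)
    updates = torch.tensor([[1., 2.], [3., 4.]])  # (N, C)
    dense = scatter_nd(indices, updates, [4, 4, 2])
    assert dense[0, 1].tolist() == [1., 2.]
    return dense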
class SparseConvTensor(object):
def __init__(self,
features,
indices,
spatial_shape,
batch_size,
grid=None):
self.features = features
self.indices = indices
if self.indices.dtype != torch.int32:
            self.indices = self.indices.int()
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
self.grid = grid
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key):
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(self.indices.long(), self.features, output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
@property
def sparity(self):
return (self.indices.shape[0] / np.prod(self.spatial_shape) /
self.batch_size)
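# A minimal usage sketch (illustrative only, not part of the original file):
# two active sites in a single-sample batch on a 3 x 4 grid, densified to a
# channels-first (batch, C, 3, 4) tensor.
def _sparse_conv_tensor_example():
    features = torch.tensor([[1.], [2.]])                    # (N, C)
    indices = torch.tensor([[0, 1, 2], [0, 2, 3]],
                           dtype=torch.int32)                # (N, 1 + ndim)
    x = SparseConvTensor(features, indices, spatial_shape=[3, 4], batch_size=1)
    dense = x.dense()
    assert dense.shape == (1, 1, 3, 4)
    return dense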