Commit 3f1316d5 authored by traveller59's avatar traveller59
Browse files

initial release

parent a347176a
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_OP_H_
#define SPARSE_CONV_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit only supports int64 arguments, so the int64 shape vectors are
// narrowed to int32 before they are handed to the indice-pair functors.
//
// Builds the indice pairs of an NDim-dimensional sparse convolution.
//
// indices:          read as int32; indices.size(0) is the number of active
//                   inputs and indices.size(1) must equal NDim + 1
//                   (batch index followed by the coordinates).
// batchSize,
// outSpatialShape:  size the temporary grid, which holds
//                   batchSize * prod(outSpatialShape) int32 entries.
// spatialShape:     not read by this function.
// kernelSize, stride, padding, dilation, outPadding:
//                   each must have NDim entries (outPadding is only
//                   size-checked); prod(kernelSize) must not exceed 256.
// subM:             when true the stride is forced to 1, the padding to
//                   kernelSize / 2, and the input indices are returned
//                   unchanged as the output indices.
// transpose:        forwarded to the functors.
//
// Returns {output indices, indicePairs, indiceNum}. indicePairs has shape
// {kernelVolume, 2, numAct} and starts filled with -1, indiceNum has shape
// {kernelVolume} and starts at 0; both are int32 on the device of `indices`.
// Throws std::runtime_error when one of the size checks fails.
template <unsigned NDim>
std::vector<torch::Tensor>
getIndicePair(torch::Tensor indices, int64_t batchSize,
              std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
              std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
              std::vector<int64_t> padding, std::vector<int64_t> dilation,
              std::vector<int64_t> outPadding, bool subM, bool transpose) {
  const auto numAct = indices.size(0);
  const auto coorDim = indices.size(1) - 1; // batchIdx + xyz
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");

  int64_t kernelVolume = 1;
  for (const int64_t extent : kernelSize) {
    kernelVolume *= extent;
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 256, "error");
  int64_t outputVolume = 1;
  for (const int64_t extent : outSpatialShape) {
    outputVolume *= extent;
  }

  const auto int32Options = torch::dtype(torch::kInt32).device(indices.device());
  torch::Tensor indicePairs = torch::full({kernelVolume, 2, numAct}, -1, int32Options);
  torch::Tensor indiceNum = torch::zeros({kernelVolume}, int32Options);
  torch::Tensor gridOut = torch::full({batchSize * outputVolume}, -1, int32Options);

  int64_t numActOut = -1;
  tv::SimpleVector<int, NDim> outSpatialShape32;
  tv::SimpleVector<int, NDim> kernelSize32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  for (unsigned i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
    dilation32.push_back(dilation[i]);
    if (subM) {
      // Sub-manifold convolution ignores the caller's stride and padding.
      stride32.push_back(1);
      padding32.push_back(kernelSize[i] / 2);
    } else {
      stride32.push_back(stride[i]);
      padding32.push_back(padding[i]);
    }
  }

  const bool onCpu = indices.device().type() == torch::kCPU;
  if (subM) {
    if (onCpu) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
          stride32, padding32, dilation32, outSpatialShape32, transpose);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
          stride32, padding32, dilation32, outSpatialShape32, transpose);
    }
    return {indices, indicePairs, indiceNum};
  }

  torch::Tensor outInds =
      torch::zeros({numAct * kernelVolume, coorDim + 1}, int32Options);
  if (onCpu) {
    auto getIndicePairFtor =
        functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
    numActOut = getIndicePairFtor(
        tv::CPU(), tv::torch2tv<const int>(indices),
        tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
        tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
        stride32, padding32, dilation32, outSpatialShape32, transpose);
  } else {
    // Scratch buffer that only the two-phase GPU functors consume, so it is
    // allocated here instead of on every call.
    auto indicePairUnique =
        torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
                    int32Options);
    auto getIndicePairFtorP1 =
        functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
    auto getIndicePairFtorP2 =
        functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
    numActOut =
        getIndicePairFtorP1(tv::TorchGPU(), tv::torch2tv<const int>(indices),
                            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
                            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
                            tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
                            padding32, dilation32, outSpatialShape32, transpose);
    if (numActOut > 0) {
      // Phase 2 works on the de-duplicated values written by phase 1.
      auto res = torch::_unique(indicePairUnique);
      indicePairUnique = std::get<0>(res);
      numActOut = getIndicePairFtorP2(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
    }
  }
  // Only the first numActOut rows of outInds were produced by the functors.
  return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
// Variant of getIndicePair that works on a caller-provided grid instead of
// allocating a fresh one on every call.
//
// indices:          read as int32; indices.size(0) is the number of active
//                   inputs and indices.size(1) must equal NDim + 1
//                   (batch index followed by the coordinates).
// gridOut:          int32 work grid with at least
//                   batchSize * prod(outSpatialShape) elements. The CPU paths
//                   refill it with -1 after use.
//                   NOTE(review): the trailing `true` passed to some functors
//                   presumably asks them to reset the grid themselves; the CPU
//                   sub-manifold call and the GPU phase-1 call omit it --
//                   confirm against the functor definitions.
// spatialShape:     not read by this function.
// kernelSize, stride, padding, dilation, outPadding:
//                   each must have NDim entries (outPadding is only
//                   size-checked); prod(kernelSize) must not exceed 256.
// subM:             when true the stride is forced to 1, the padding to
//                   kernelSize / 2, and the input indices are returned
//                   unchanged as the output indices.
// transpose:        forwarded to the functors.
//
// Returns {output indices, indicePairs, indiceNum}. indicePairs has shape
// {kernelVolume, 2, numAct} and starts filled with -1, indiceNum has shape
// {kernelVolume} and starts at 0; both are int32 on the device of `indices`.
// Throws std::runtime_error when a size check fails and std::invalid_argument
// when gridOut is too small.
template <unsigned NDim>
std::vector<torch::Tensor>
getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
                     std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
                     std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
                     std::vector<int64_t> padding, std::vector<int64_t> dilation,
                     std::vector<int64_t> outPadding, bool subM, bool transpose) {
  const auto numAct = indices.size(0);
  const auto coorDim = indices.size(1) - 1; // batchIdx + xyz
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");

  int64_t kernelVolume = 1;
  for (const int64_t extent : kernelSize) {
    kernelVolume *= extent;
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 256, "error");
  int64_t outputVolume = 1;
  for (const int64_t extent : outSpatialShape) {
    outputVolume *= extent;
  }
  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");

  const auto int32Options = torch::dtype(torch::kInt32).device(indices.device());
  torch::Tensor indicePairs = torch::full({kernelVolume, 2, numAct}, -1, int32Options);
  torch::Tensor indiceNum = torch::zeros({kernelVolume}, int32Options);

  int64_t numActOut = -1;
  tv::SimpleVector<int, NDim> outSpatialShape32;
  tv::SimpleVector<int, NDim> kernelSize32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  for (unsigned i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
    dilation32.push_back(dilation[i]);
    if (subM) {
      // Sub-manifold convolution ignores the caller's stride and padding.
      stride32.push_back(1);
      padding32.push_back(kernelSize[i] / 2);
    } else {
      stride32.push_back(stride[i]);
      padding32.push_back(padding[i]);
    }
  }

  const bool onCpu = indices.device().type() == torch::kCPU;
  if (subM) {
    if (onCpu) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
          stride32, padding32, dilation32, outSpatialShape32, transpose);
      // Leave the shared grid clean for the next caller.
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
          stride32, padding32, dilation32, outSpatialShape32, transpose, true);
    }
    return {indices, indicePairs, indiceNum};
  }

  torch::Tensor outInds =
      torch::zeros({numAct * kernelVolume, coorDim + 1}, int32Options);
  if (onCpu) {
    auto getIndicePairFtor =
        functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
    numActOut = getIndicePairFtor(
        tv::CPU(), tv::torch2tv<const int>(indices),
        tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
        tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
        stride32, padding32, dilation32, outSpatialShape32, transpose, true);
    // Leave the shared grid clean for the next caller.
    gridOut.fill_(-1);
  } else {
    // Scratch buffer that only the two-phase GPU functors consume, so it is
    // allocated here instead of on every call.
    auto indicePairUnique =
        torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
                    int32Options);
    auto getIndicePairFtorP1 =
        functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
    auto getIndicePairFtorP2 =
        functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
    numActOut =
        getIndicePairFtorP1(tv::TorchGPU(), tv::torch2tv<const int>(indices),
                            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
                            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
                            tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
                            padding32, dilation32, outSpatialShape32, transpose);
    if (numActOut > 0) {
      // Phase 2 works on the de-duplicated values written by phase 1.
      auto res = torch::_unique(indicePairUnique);
      indicePairUnique = std::get<0>(res);
      numActOut = getIndicePairFtorP2(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose, true);
    }
  }
  // Only the first numActOut rows of outInds were produced by the functors.
  return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
// Forward pass of a sparse convolution driven by precomputed indice pairs.
//
// features:    2-D tensor; features.size(1) is the number of input planes.
// filters:     its last two dimensions are {numInPlanes, numOutPlanes}; the
//              leading dimensions are flattened into one kernel-offset axis.
// indicePairs: int32; for kernel offset i, subview(i, inverse) supplies the
//              gather indices and subview(i, !inverse) the scatter indices.
// indiceNum:   int32 count of valid pairs per kernel offset (copied to CPU).
// numActOut:   number of rows of the returned tensor.
// subm:        when true, the offset with the largest pair count is applied
//              with one dense matmul over all features (the original comment
//              calls it the center index) and skipped in the loop.
//
// Returns a {numActOut, numOutPlanes} tensor with the dtype and device of
// `features`. Throws std::runtime_error if indicePairs has no kernel offsets
// or indiceNum holds fewer counts than there are offsets.
template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, bool inverse, bool subm) {
  const auto device = features.device().type();
  const auto ndim = filters.dim() - 2;
  const auto kernelVolume = indicePairs.size(0);
  const auto numInPlanes = features.size(1);
  const auto numOutPlanes = filters.size(ndim + 1);
  // max_element on an empty range would return an iterator that must not be
  // dereferenced, and the counts are read for every kernel offset below.
  TV_ASSERT_RT_ERR(kernelVolume > 0 && indiceNum.numel() >= kernelVolume,
                   "indiceNum must provide a count for every kernel offset");
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  const int *pairCounts = indicePairNumCpu.data<int>();
  const int *maxCountIter = std::max_element(pairCounts, pairCounts + kernelVolume);
  const int indicePairMaxOffset = maxCountIter - pairCounts;
  const int indicePairMaxSize = *maxCountIter;

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  // Scratch buffers sized for the busiest offset and reused by every offset.
  torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subm) { // the center index of subm conv don't need gather and scatter
              // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }

  for (int i = 0; i < kernelVolume; ++i) {
    const auto nHot = pairCounts[i];
    if (nHot <= 0 || (subm && i == indicePairMaxOffset)) {
      continue;
    }
    // Views over the first nHot rows of the scratch buffers.
    auto outputBufferBlob =
        torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);

    // 1. Gather the input rows referenced by this kernel offset.
    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
    } else {
      // The original author measured torch::index_select_out as slower than
      // this functor (attributed to the int -> long index conversion).
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
      TV_CHECK_CUDA_ERR();
    }

    // 2. Dense matmul with this offset's filter slice.
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);

    // 3. Accumulate the products into the output rows.
    if (device == torch::kCPU) {
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                  true);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                  true);
      TV_CHECK_CUDA_ERR();
    }
  }
  return output;
}
// Backward pass of indiceConv: computes the gradients with respect to the
// input features and the filters.
//
// features:    2-D tensor; features.size(1) is the number of input planes.
// filters:     its last two dimensions are {numInPlanes, numOutPlanes}; the
//              leading dimensions are flattened into one kernel-offset axis.
// outGrad:     gradient flowing in from the output of the forward pass.
// indicePairs: int32; for kernel offset i, subview(i, inverse) indexes the
//              features and subview(i, !inverse) indexes outGrad.
// indiceNum:   int32 count of valid pairs per kernel offset (copied to CPU).
// subm:        when true, the offset with the largest pair count is handled
//              with dense matmuls over all rows and skipped in the loop.
//
// Returns {inputGrad, filtersGrad}; inputGrad has the shape of `features` and
// filtersGrad the original shape of `filters`. Throws std::runtime_error if
// indicePairs has no kernel offsets or indiceNum holds fewer counts than there
// are offsets.
template <typename T>
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor indiceNum,
                   bool inverse, bool subm) {
  const auto device = features.device().type();
  const auto ndim = filters.dim() - 2;
  const auto kernelVolume = indicePairs.size(0);
  const auto numInPlanes = features.size(1);
  const auto numOutPlanes = filters.size(ndim + 1);
  // max_element on an empty range would return an iterator that must not be
  // dereferenced, and the counts are read for every kernel offset below.
  TV_ASSERT_RT_ERR(kernelVolume > 0 && indiceNum.numel() >= kernelVolume,
                   "indiceNum must provide a count for every kernel offset");
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  const int *pairCounts = indicePairNumCpu.data<int>();
  const int *maxCountIter = std::max_element(pairCounts, pairCounts + kernelVolume);
  const int indicePairMaxOffset = maxCountIter - pairCounts;
  const int indicePairMaxSize = *maxCountIter;

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  // Own a copy of the shape: `filters` is reassigned to a view below, and
  // sizes() only hands out a non-owning reference.
  const auto filterShape = filters.sizes().vec();
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
  // Scratch buffers sized for the busiest offset and reused by every offset.
  torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
  if (subm) {
    // Dense shortcut for the busiest offset; no gather/scatter is performed.
    auto filterGradSub = filtersGrad[indicePairMaxOffset];
    torch::mm_out(filterGradSub, features.t(), outGrad);
    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
  }

  for (int i = 0; i < kernelVolume; ++i) {
    const auto nHot = pairCounts[i];
    if (nHot <= 0 || (subm && i == indicePairMaxOffset)) {
      continue;
    }
    // 1. Gather the feature rows and the matching output-gradient rows.
    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
      gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
                    tv::torch2tv<const T>(outGrad),
                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
      TV_CHECK_CUDA_ERR();
      gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
                    tv::torch2tv<const T>(outGrad),
                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot);
      TV_CHECK_CUDA_ERR();
    }

    // 2. Filter gradient first (it reads the gathered features), then the
    //    input gradient, which overwrites the same input scratch rows.
    auto filterGradSub = filtersGrad[i];
    auto outputBufferBlob =
        torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
    torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
    torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());

    // 3. Accumulate the per-offset input gradient into inputGrad.
    if (device == torch::kCPU) {
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
                  tv::torch2tv<const T>(inputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
                  tv::torch2tv<const T>(inputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
      TV_CHECK_CUDA_ERR();
    }
  }
  return {inputGrad, filtersGrad.view(filterShape)};
}
// Experimental variant of indiceConv (as the name says: do not use) that runs
// the gather, matmul and scatter-add stages as three separate passes over the
// kernel offsets, with one scratch slice per offset. It prints the buffer
// creation time to std::cout.
//
// features:    2-D tensor; features.size(1) is the number of input planes.
// filters:     its last two dimensions are {numInPlanes, numOutPlanes}; the
//              leading dimensions are flattened into one kernel-offset axis.
// indicePairs: int32; for kernel offset i, subview(i, inverse) supplies the
//              gather indices and subview(i, !inverse) the scatter indices.
// indiceNum:   int32 count of valid pairs per kernel offset (copied to CPU).
// numActOut:   number of rows of the returned tensor.
// subm:        when true, the offset with the largest pair count is applied
//              with one dense matmul and the scratch buffers are sized for the
//              largest remaining offset.
//
// Returns a {numActOut, numOutPlanes} tensor with the dtype and device of
// `features`. Throws std::runtime_error if indicePairs has no kernel offsets
// or indiceNum holds fewer counts than there are offsets.
template <typename T>
torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters,
                                       torch::Tensor indicePairs, torch::Tensor indiceNum,
                                       int64_t numActOut, bool inverse, bool subm) {
  const auto device = features.device().type();
  const auto ndim = filters.dim() - 2;
  const auto kernelVolume = indicePairs.size(0);
  const auto numInPlanes = features.size(1);
  const auto numOutPlanes = filters.size(ndim + 1);
  // max_element on an empty range would return an iterator that must not be
  // dereferenced, and the counts are read for every kernel offset below.
  TV_ASSERT_RT_ERR(kernelVolume > 0 && indiceNum.numel() >= kernelVolume,
                   "indiceNum must provide a count for every kernel offset");
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  const int *pairCounts = indicePairNumCpu.data<int>();
  const int *maxCountIter = std::max_element(pairCounts, pairCounts + kernelVolume);
  const int indicePairMaxOffset = maxCountIter - pairCounts;
  int indicePairMaxSize = *maxCountIter;
  if (subm) {
    // The busiest offset never touches the scratch buffers, so size them for
    // the largest of the remaining offsets. With a single kernel offset
    // nothing remains and the buffers can be empty.
    std::vector<int> otherCounts(pairCounts, pairCounts + kernelVolume);
    otherCounts.erase(otherCounts.begin() + indicePairMaxOffset);
    indicePairMaxSize =
        otherCounts.empty() ? 0 : *std::max_element(otherCounts.begin(), otherCounts.end());
  }

  auto timer = spconv::CudaContextTimer<>();
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  torch::Tensor inputBuffer =
      torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  std::cout << "create time " << timer.report()/1000.0 << std::endl;
  if (subm) { // the center index of subm conv don't need gather and scatter
              // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }

  // Pass 1: gather the input rows of every offset into its own slice.
  for (int i = 0; i < kernelVolume; ++i) {
    const auto nHot = pairCounts[i];
    if (nHot <= 0 || (subm && i == indicePairMaxOffset)) {
      continue;
    }
    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
                                            {nHot, numInPlanes}, options);
    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
      TV_CHECK_CUDA_ERR();
    }
  }

  // Pass 2: one dense matmul per offset.
  for (int i = 0; i < kernelVolume; ++i) {
    const auto nHot = pairCounts[i];
    if (nHot <= 0 || (subm && i == indicePairMaxOffset)) {
      continue;
    }
    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
                                            {nHot, numInPlanes}, options);
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
  }

  // Pass 3: accumulate every offset's products into the output rows.
  for (int i = 0; i < kernelVolume; ++i) {
    const auto nHot = pairCounts[i];
    if (nHot <= 0 || (subm && i == indicePairMaxOffset)) {
      continue;
    }
    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
                                             {nHot, numOutPlanes}, options);
    if (device == torch::kCPU) {
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBufferBlob),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                  true);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBufferBlob),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                  true);
      TV_CHECK_CUDA_ERR();
    }
  }
  return output;
}
} // namespace spconv
#endif
\ No newline at end of file
#pragma once
// from tensorflow
namespace tv
{
namespace detail
{
// Range object for device-side range-based for loops: iteration starts at
// `begin`, advances by `delta`, and stops once the index is no longer below
// `end`.
template <typename T>
class KernelLoop
{
  // Cursor over the range. An iterator whose delta is 0 acts as the end
  // sentinel.
  struct Iterator
  {
    __forceinline__ __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
    __forceinline__ __device__ T operator*() const { return index_; }
    __forceinline__ __device__ Iterator &operator++()
    {
      index_ += delta_;
      return *this;
    }
    __forceinline__ __device__ bool operator!=(const Iterator &other) const
    {
      // Anything at or past an end sentinel (delta_ == 0) compares equal to
      // it; in a range-based for loop only the first branch is ever taken.
      if (!other.delta_)
      {
        return index_ < other.index_;
      }
      if (!delta_)
      {
        return index_ > other.index_;
      }
      return index_ < other.index_ || index_ > other.index_;
    }

  private:
    T index_;
    const T delta_;
  };

public:
  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}
  // First index visited, stepping by delta_.
  __forceinline__ __device__ Iterator begin() const { return Iterator(begin_, delta_); }
  // End sentinel (delta 0) positioned at end_.
  __forceinline__ __device__ Iterator end() const { return Iterator(end_, 0); }

private:
  T begin_;
  T delta_;
  T end_;
};
} // namespace detail
// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
// Each thread starts at blockIdx.x * blockDim.x + threadIdx.x and advances by
// gridDim.x * blockDim.x * NumILP.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count)
{
  const auto firstIndex = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = gridDim.x * blockDim.x * NumILP;
  return detail::KernelLoop<T>(firstIndex, step, count);
}
// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Each thread starts at blockIdx.y * blockDim.y + threadIdx.y and advances by
// gridDim.y * blockDim.y * NumILP.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count)
{
  const auto firstIndex = blockIdx.y * blockDim.y + threadIdx.y;
  const auto step = gridDim.y * blockDim.y * NumILP;
  return detail::KernelLoop<T>(firstIndex, step, count);
}
// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Each thread starts at blockIdx.z * blockDim.z + threadIdx.z and advances by
// gridDim.z * blockDim.z * NumILP.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename T, int NumILP=1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count)
{
  const auto firstIndex = blockIdx.z * blockDim.z + threadIdx.z;
  const auto step = gridDim.z * blockDim.z * NumILP;
  return detail::KernelLoop<T>(firstIndex, step, count);
}
} // namespace tv
\ No newline at end of file
#pragma once
// from pytorch.aten
#include "tensorview.h"
namespace tv
{
namespace launch
{
// Ceiling division for non-negative a and positive b: the smallest integer q
// with q * b >= a. The result is converted to int.
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b)
{
  const auto roundedUp = a + b - 1;
  return roundedUp / b;
}
// Threads per block used for launches; 1024 requires cuda sm_2x or above.
constexpr int CUDA_NUM_THREADS = 1024;
// CUDA: number of blocks of CUDA_NUM_THREADS threads needed to cover N
// elements. N must be positive; otherwise the assertion macro throws.
inline int getBlocks(const int N)
{
  TV_ASSERT_RT_ERR(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
  const int blockCount = DivUp(N, CUDA_NUM_THREADS);
  return blockCount;
}
} // namespace launch
} // namespace tv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <iostream>
#include <memory>
#include <prettyprint.h>
#include <sstream>
#include <type_traits>
#include <vector>
namespace tv {
// Function qualifier macros, selected by the compiler in use:
//  - __NVCC__:        functions are compiled for both host and device.
//  - __CUDACC_RTC__:  TV_HOST_DEVICE_INLINE expands to a device-only
//                     qualifier; TV_HOST_DEVICE still names both.
//  - otherwise:       plain host build; TV_HOST_DEVICE_INLINE is `inline`,
//                     TV_HOST_DEVICE is empty and TV_DEVICE_INLINE is not
//                     defined at all.
// TV_ASSERT forwards to assert() in every configuration.
#ifdef __NVCC__
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif
// If `expr` is false, prints the printf-style message given in the remaining
// arguments and then asserts on `expr`. Expands to a braced block, not a
// single statement. `expr` is evaluated a second time inside assert() when the
// check fails and asserts are enabled.
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
// Variant of TV_REQUIRE that reads threadIdx: only the thread with
// threadIdx.x == 0 prints the printf-style message when `expr` is false. The
// `if` has no braces, so assert(expr) is evaluated by every thread whether or
// not it printed.
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) \
printf(__VA_ARGS__); \
assert(expr); \
}
// Streams a single value into `ss`; terminates the variadic overload below.
template <class SStream, class T> void sstream_print(SStream &ss, T val) {
  ss << val;
}
// Streams every argument into `ss`, separated by single spaces and without a
// trailing separator.
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
  sstream_print(ss, val);
  ss << " ";
  sstream_print(ss, args...);
}
// Runtime check that is not compiled out: if `expr` is false, throws
// std::runtime_error. The message contains the file name, the line number, the
// text of `expr`, and the remaining macro arguments joined by spaces (via
// tv::sstream_print). At least one message argument is required. Expands to a
// braced block, not a single statement.
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
// Same check and message format as TV_ASSERT_RT_ERR, but throws
// std::invalid_argument: the message contains the file name, the line number,
// the text of `expr`, and the remaining macro arguments joined by spaces. At
// least one message argument is required.
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
// Reads the result of cudaGetLastError() and throws std::runtime_error, with
// the file name and line number in the message, when it is not cudaSuccess.
// NOTE(review): `err` is streamed as-is, which presumably prints only the
// numeric error code -- cudaGetErrorString(err) would give a readable message.
#define TV_CHECK_CUDA_ERR() \
{ \
auto err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \
throw std::runtime_error(__macro_s.str()); \
} \
}
// Device tag for GPU code paths; carries the CUDA stream to use.
struct GPU {
// The constructor is not explicit, so a cudaStream_t converts implicitly; the
// stream defaults to 0.
GPU(cudaStream_t s = 0) : mStream(s) {}
cudaStream_t stream() const { return mStream; }
cudaStream_t mStream = 0;
};
// Device tag for CPU code paths; carries no state.
struct CPU {};
// Default capacity (MaxDim) of SimpleVector and ShapeBase.
#define TV_MAX_DIM 6
/*
template <typename T>
constexpr size_t calc_align(size_t ndim)
{
if (ndim * sizeof(T) == 1)
return 1;
else if (ndim * sizeof(T) == 2)
return 2;
else if (ndim * sizeof(T) <= 4 && ndim * sizeof(T) > 2)
return 4;
else if (ndim * sizeof(T) <= 8 && ndim * sizeof(T) > 4)
return 8;
else if (ndim * sizeof(T) <= 16 && ndim * sizeof(T) > 8)
return 16;
else if (ndim * sizeof(T) <= 32 && ndim * sizeof(T) > 16)
return 32;
else
return 64;
}
*/
// Fixed-capacity vector with inline storage for at most MaxDim elements.
// Most members carry TV_HOST_DEVICE_INLINE; the std::vector constructor is
// host-only. Capacity and index checks are assertions only: the constructors
// use TV_ASSERT, while operator[], push_back and pop_back check only when
// TV_DEBUG is defined. Elements at positions >= size() are left
// uninitialized.
template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
public:
  // Creates an empty vector.
  TV_HOST_DEVICE_INLINE SimpleVector() {}
  // Copies the elements of `q`, which must not hold more than MaxDim values.
  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
    TV_ASSERT(q.size() <= MaxDim);
    mSize = 0;
    for (T s : q) {
      mArray[mSize++] = s;
    }
  }
  // Copies the elements of `arr`, which must not hold more than MaxDim values.
  SimpleVector(const std::vector<T> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
      mArray[i] = arr[i];
    }
    mSize = arr.size();
  }
  // Copies only the first arr.size() elements.
  TV_HOST_DEVICE_INLINE SimpleVector(const SimpleVector<T, MaxDim> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
      mArray[i] = arr[i];
    }
    mSize = arr.size();
  }
  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && static_cast<size_t>(idx) < mSize);
#endif
    return mArray[idx];
  }
  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && static_cast<size_t>(idx) < mSize);
#endif
    return mArray[idx];
  }
  // Appends `s`; the vector must not already be full.
  TV_HOST_DEVICE_INLINE void push_back(T s) {
#ifdef TV_DEBUG
    TV_ASSERT(mSize < MaxDim);
#endif
    mArray[mSize] = s;
    mSize++;
  }
  // Drops the last element; the vector must not be empty.
  TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
    TV_ASSERT(mSize > 0);
#endif
    mSize--;
  }
  TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
  TV_HOST_DEVICE_INLINE const T *data() const { return mArray; }
  TV_HOST_DEVICE_INLINE bool empty() const { return mSize == 0; }
  typedef size_t size_type;

  // Forward iterator over mutable elements (thin wrapper around T*).
  class iterator {
  public:
    typedef iterator self_type;
    typedef T value_type;
    typedef T &reference;
    typedef T *pointer;
    typedef std::forward_iterator_tag iterator_category;
    typedef std::ptrdiff_t difference_type;
    TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
    // Post-increment: returns the position held before advancing.
    TV_HOST_DEVICE_INLINE self_type operator++(int) {
      self_type i = *this;
      ptr_++;
      return i;
    }
    // Pre-increment: advances and returns this iterator by reference.
    TV_HOST_DEVICE_INLINE self_type &operator++() {
      ptr_++;
      return *this;
    }
    TV_HOST_DEVICE_INLINE reference operator*() const { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() const { return ptr_; }
    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
      return ptr_ == rhs.ptr_;
    }
    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
      return ptr_ != rhs.ptr_;
    }

  private:
    pointer ptr_;
  };

  // Forward iterator over read-only elements (thin wrapper around const T*).
  class const_iterator {
  public:
    typedef const_iterator self_type;
    typedef T value_type;
    typedef const T &reference;
    typedef const T *pointer;
    typedef std::ptrdiff_t difference_type;
    typedef std::forward_iterator_tag iterator_category;
    TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
    // Post-increment: returns the position held before advancing.
    TV_HOST_DEVICE_INLINE self_type operator++(int) {
      self_type i = *this;
      ptr_++;
      return i;
    }
    // Pre-increment: advances and returns this iterator by reference.
    TV_HOST_DEVICE_INLINE self_type &operator++() {
      ptr_++;
      return *this;
    }
    TV_HOST_DEVICE_INLINE reference operator*() const { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() const { return ptr_; }
    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) const {
      return ptr_ == rhs.ptr_;
    }
    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) const {
      return ptr_ != rhs.ptr_;
    }

  private:
    pointer ptr_;
  };

  TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
  TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
  TV_HOST_DEVICE_INLINE const_iterator begin() const {
    return const_iterator(mArray);
  }
  TV_HOST_DEVICE_INLINE const_iterator end() const {
    return const_iterator(mArray + mSize);
  }
  TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
    return const_iterator(mArray);
  }
  TV_HOST_DEVICE_INLINE const_iterator cend() const {
    return const_iterator(mArray + mSize);
  }

protected:
  T mArray[MaxDim];  // inline storage; only the first mSize entries are valid
  size_t mSize = 0;  // number of valid entries
};
// Two SimpleVectors are equal when they have the same size and all elements
// at matching positions compare equal.
template <typename T, size_t MaxDim>
bool operator==(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  const size_t count = lfs.size();
  if (count != rfs.size()) {
    return false;
  }
  size_t pos = 0;
  while (pos < count) {
    if (lfs[pos] != rfs[pos]) {
      return false;
    }
    ++pos;
  }
  return true;
}
// Negation of operator==.
template <typename T, size_t MaxDim>
bool operator!=(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  const bool same = (lfs == rfs);
  return !same;
}
// A slice descriptor holding up to three ints. Entries that are not supplied
// are stored as -1.
struct Slice {
  // No arguments: all three entries are -1.
  TV_HOST_DEVICE_INLINE Slice() {
    for (int k = 0; k < 3; ++k) {
      mSlices[k] = -1;
    }
  }
  // Up to three integer-convertible values; the rest are filled with -1.
  template <class... Integers> TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
    SimpleVector<int, 3> given{int(ints)...};
    for (size_t k = 0; k < 3; ++k) {
      mSlices[k] = (k < given.size()) ? given[k] : -1;
    }
  }
  // Brace-list form: each element is converted to int and stored in order,
  // starting at entry 0; entries without a value stay -1.
  template <typename T>
  TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
    for (int k = 0; k < 3; ++k) {
      mSlices[k] = -1;
    }
    TV_ASSERT(slice.size() <= 3);
    int *dst = mSlices;
    for (auto it = slice.begin(); it != slice.end(); ++it) {
      *dst = int(*it);
      ++dst;
    }
  }
  // Entry access; the index is range-checked only when TV_DEBUG is defined.
  TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#if defined(TV_DEBUG)
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return *(mSlices + idx);
  }
  TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#if defined(TV_DEBUG)
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return *(mSlices + idx);
  }

protected:
  int mSlices[3];
};
// Tensor shape: a SimpleVector<int, MaxDim> of per-dimension extents plus
// shape-specific helpers.
//
// NOTE: size() declared here returns the PRODUCT of the extents (the element
// count) and hides the size() of the SimpleVector base; use ndim() for the
// number of dimensions.
template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>() {}
  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
      : SimpleVector<int, MaxDim>(shape) {}
  // Host-only: forwards any single-type container to the base constructor.
  template <typename T, template <class...> class Container>
  ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
      : SimpleVector<int, MaxDim>(shape) {}
  ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
  ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;

  // Returns the extents in the half-open range [start, end).
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
    // `end` is exclusive, so end == ndim() is a valid request (a suffix that
    // reaches the last dimension). The previous check `end < mSize` rejected
    // it even though the loop below never reads index `end`.
    TV_ASSERT(start >= 0 && size_t(end) <= this->mSize && end > start);
#endif
    ShapeBase<MaxDim> shape;
    for (int i = start; i < end; ++i) {
      shape.push_back(this->mArray[i]);
    }
    return shape;
  }
  // Returns the extents from `start` to the last dimension (may be empty).
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
#ifdef TV_DEBUG
    TV_ASSERT(start >= 0 && start <= this->mSize);
#endif
    ShapeBase<MaxDim> shape;
    for (int i = start; i < this->mSize; ++i) {
      shape.push_back(this->mArray[i]);
    }
    return shape;
  }
  // Element count: product of all extents, or 0 for a shape with no
  // dimensions.
  TV_HOST_DEVICE_INLINE size_t size() const {
    if (this->mSize == 0)
      return 0;
    size_t s = 1;
    for (int i = 0; i < int(this->mSize); ++i) {
      s *= this->mArray[i];
    }
    return s;
  }
  // Number of dimensions.
  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
  // Copy of this shape with every extent equal to 1 removed.
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (this->mArray[i] != 1)
        shape.push_back(this->mArray[i]);
    }
    return shape;
  }
  // Copy of this shape with dimension `dim` removed, but only if its extent
  // is 1; otherwise the shape is returned unchanged.
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (i != dim || this->mArray[i] != 1)
        shape.push_back(this->mArray[i]);
    }
    return shape;
  }
};
using Shape = ShapeBase<TV_MAX_DIM>;
// Row-major linear offset of the index tuple `indexes...` inside an array of
// extents `shape`. The last index varies fastest; with TV_DEBUG defined the
// number of indexes must equal shape.size().
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           Inds... indexes) {
  int idx[sizeof...(indexes)] = {indexes...};
  unsigned linear = 0;
  unsigned stride = 1;
#ifdef TV_DEBUG
  TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
#pragma unroll
  for (int axis = int(sizeof...(indexes)) - 1; axis >= 0; --axis) {
    linear += stride * idx[axis];
    stride *= shape[axis];
  }
  return linear;
}
// Row-major linear offset of `indexes_vec` inside an array of extents
// `shape`. Walks the dimensions from last to first; indexes_vec is read at
// every position of shape.
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           std::vector<int> &indexes_vec) {
  unsigned linear = 0;
  unsigned stride = 1;
  int axis = static_cast<int>(shape.size()) - 1;
  for (; axis >= 0; --axis) {
    linear += stride * indexes_vec[axis];
    stride *= shape[axis];
  }
  return linear;
}
// Row-major linear offset of the index tuple `indexes...` inside a tensor of
// extents `shape`. Only the first sizeof...(indexes) extents are read.
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           Inds... indexes) {
  int idx[sizeof...(indexes)] = {indexes...};
  unsigned linear = 0;
  unsigned stride = 1;
#pragma unroll
  for (int axis = int(sizeof...(indexes)) - 1; axis >= 0; --axis) {
    linear += stride * idx[axis];
    stride *= shape[axis];
  }
  return linear;
}
// Row-major linear offset of `indexes_vec` inside a tensor of extents
// `shape`. The number of dimensions walked is indexes_vec.ndim().
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           const Shape &indexes_vec) {
  unsigned linear = 0;
  unsigned stride = 1;
  int axis = static_cast<int>(indexes_vec.ndim()) - 1;
  for (; axis >= 0; --axis) {
    linear += stride * indexes_vec[axis];
    stride *= shape[axis];
  }
  return linear;
}
// Row-major linear offset for raw arrays: `indexes` and `shape` must each
// provide NDim readable entries. The last dimension varies fastest.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
                                           const Index *shape) {
  unsigned linear = 0;
  unsigned stride = 1;
#pragma unroll
  for (int axis = int(NDim) - 1; axis >= 0; --axis) {
    linear += stride * indexes[axis];
    stride *= shape[axis];
  }
  return linear;
}
// Inverse of the row-major offset computation: decomposes the linear `index`
// into NDim per-dimension coordinates written to `output`, peeling the
// fastest-varying (last) dimension first. Returns whatever is left of `index`
// after all NDim extents have been divided out.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
                                           const Index *shape) {
#pragma unroll
  for (int axis = int(NDim) - 1; axis >= 0; --axis) {
    output[axis] = index % shape[axis];
    // Remove the remainder before dividing so the quotient is exact.
    index = (index - output[axis]) / shape[axis];
  }
  return index;
}
// Compile-time-unrolled row-major offset over the first N dimensions.
// For N == 4 the recursion expands to
//   ((i1 * shape[1] + i2) * shape[2] + i3) * shape[3] + i4
// (shape[0] is multiplied by the N == 0 base case, which is 0).
template <int N> struct ArrayIndexRowMajor {
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
                                            const Shape &indexes) {
    // Offset contributed by the N - 1 leading dimensions.
    const unsigned outer = ArrayIndexRowMajor<N - 1>::run(shape, indexes);
    return indexes[N - 1] + shape[N - 1] * outer;
  }
};
// Recursion terminator: no dimensions contribute no offset.
template <> struct ArrayIndexRowMajor<0> {
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &, const Shape &) {
    return 0;
  }
};
namespace detail {
// Maps an arithmetic C++ type to a short dtype-style name. Only the
// specializations below are defined; the argument exists purely to select the
// type and its value is ignored.
template <typename T> constexpr const char *simpleTypeName(T val = T());
template <> constexpr const char *simpleTypeName<float>(float) {
  return "float32";
}
template <> constexpr const char *simpleTypeName<double>(double) {
  return "float64";
}
template <> constexpr const char *simpleTypeName<int>(int) { return "int32"; }
template <> constexpr const char *simpleTypeName<unsigned>(unsigned) {
  return "uint32";
}
template <> constexpr const char *simpleTypeName<long>(long) {
  return "int64";
}
template <>
constexpr const char *simpleTypeName<unsigned long>(unsigned long) {
  return "uint64";
}
} // namespace detail
template <typename T, int Rank = -1> struct TensorView {
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Shape shape)
: mPtr(ptr), mShape(shape) {}
// explicit TV_HOST_DEVICE_INLINE TensorView(const
// TensorView<std::remove_const_t<T>> &tview) : mPtr(tview.data()),
// mShape(tview.shape()) {}
template <class... Integers>
explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Integers... shapes)
: mPtr(ptr) {
mShape = {int(shapes)...};
}
// Copies every element of `tensor` into this view, in flat storage order.
// Both views must have equal shapes. Returns *this.
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
assign(const TensorView<T, Rank> &tensor) {
  TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
             "\n");
  const T *src = tensor.data();
  for (size_t k = 0; k < size(); ++k) {
    mPtr[k] = src[k];
  }
  return *this;
}
// Fills this view, in flat storage order, from a brace list whose length must
// equal size(). Each value is converted to T. Returns *this.
template <typename T1>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
assign(std::initializer_list<T1> seq) {
  TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
             "\n");
  size_t k = 0;
  for (auto it = seq.begin(); it != seq.end(); ++it, ++k) {
    mPtr[k] = T(*it);
  }
  return *this;
}
// Generic element access for any number of indexes: converts each index to
// int and addresses the element through rowArrayIdx(mShape, ...).
//
// With TV_DEBUG defined, the index count and each index range are checked
// first. The counts are size_t values but the messages use %d, so they are
// cast to int explicitly (the messages look printf-style; passing a size_t
// for %d would be a format/argument mismatch).
template <class... Inds> TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
#ifdef TV_DEBUG
  int idxes[sizeof...(Inds)]{int(inds)...};
  TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
             "you provide %d indexes, but dim is %d\n", int(sizeof...(inds)),
             int(mShape.ndim()));
  for (int i = 0; i < int(sizeof...(inds)); ++i) {
    TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
               "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
               mShape[i]);
  }
#endif
  return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
// Read-only twin of the overload above.
template <class... Inds>
TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
#ifdef TV_DEBUG
  int idxes[sizeof...(Inds)]{int(inds)...};
  TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
             "you provide %d indexes, but dim is %d\n", int(sizeof...(inds)),
             int(mShape.ndim()));
  for (int i = 0; i < int(sizeof...(inds)); ++i) {
    TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
               "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
               mShape[i]);
  }
#endif
  return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
// Zero-index access: returns the first stored element (mPtr[0]).
// With TV_DEBUG defined, requires a non-null data pointer and a shape with no
// dimensions; the device-side macro is used when compiling for
// __CUDA_ARCH__, the host-side one otherwise.
TV_HOST_DEVICE_INLINE T &operator()() {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
// Read-only twin of the zero-index overload above.
TV_HOST_DEVICE_INLINE const T &operator()() const {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
// One-index access for a 1-D view: returns mPtr[i1].
// With TV_DEBUG defined, requires ndim() == 1 and 0 <= i1 < mShape[0].
// The index is cast to int before being handed to the %d placeholder, as
// every other fixed-arity overload does; T1 may be wider than int.
template <class T1> TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
  TV_DEVICE_REQUIRE(mShape.ndim() == 1,
                    "you provide 1 indexes, but dim is %ld\n", mShape.ndim());
  TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                    "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                    mShape[0]);
#else
  TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
             mShape.ndim());
  TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
             "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
  return mPtr[i1];
}
// Two-index access for a 2-D view: element at row-major offset
// i1 * mShape[1] + i2. With TV_DEBUG defined, checks ndim() == 2 and that
// each index lies in [0, extent) (device macro under __CUDA_ARCH__, host
// macro otherwise).
template <class T1, class T2>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
// Three-index access for a 3-D view: element at row-major offset
// (i1 * mShape[1] + i2) * mShape[2] + i3. Debug checks as above, for 3 dims.
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
// Four-index access for a 4-D view: element at row-major offset
// ((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4. Debug checks as
// above, for 4 dims.
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
// Read-only one-index access for a 1-D view: returns mPtr[i1].
// With TV_DEBUG defined, checks ndim() == 1 and 0 <= i1 < mShape[0] (device
// macro under __CUDA_ARCH__, host macro otherwise).
template <class T1> TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
return mPtr[i1];
}
// Read-only two-index access: element at row-major offset
// i1 * mShape[1] + i2. Debug checks as above, for 2 dims.
template <class T1, class T2>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
// Read-only three-index access: element at row-major offset
// (i1 * mShape[1] + i2) * mShape[2] + i3. Debug checks as above, for 3 dims.
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
// Read-only four-index access: element at row-major offset
// ((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4. Debug checks as
// above, for 4 dims.
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
// Flat element access: returns mPtr[idx], ignoring the shape's dimensions.
// With TV_DEBUG defined, requires 0 <= idx < size() (size() is the total
// element count).
TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
}
// TODO: this conflicts with operator[](SimpleVector<Slice> slice_vec).
/*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return mPtr[idx];
}
TV_HOST_DEVICE_INLINE const T &operator[](const Shape index) const {
int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return mPtr[idx];
}*/
// Slice-based indexing: forwards the list of slices to _subview and returns
// the resulting view.
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
operator[](SimpleVector<Slice> slice_vec) {
return _subview(slice_vec);
}
// Const-object variant of the slice-based indexing above; the returned view
// object itself is const.
TV_HOST_DEVICE_INLINE const TensorView<T, Rank>
operator[](SimpleVector<Slice> slice_vec) const {
return _subview(slice_vec);
}
// True when the view has no data pointer.
TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
// Raw pointer to the first element (mutable / read-only).
TV_HOST_DEVICE_INLINE T *data() { return mPtr; }
TV_HOST_DEVICE_INLINE const T *data() const { return mPtr; }
// The view's shape.
TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }
// Extent of dimension `idx`.
TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }
// Number of dimensions (narrowed from the shape's size_t to int).
TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }
// In-place reshape from a list of extents (each converted to int). The new
// shape must describe the same number of elements. Returns *this.
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Inds... newShapes) {
  Shape shapes{int(newShapes)...};
  TV_ASSERT(shapes.size() == size());
  this->mShape = shapes;
  return *this;
}
// In-place reshape from a ready-made Shape; same element-count requirement.
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Shape shapes) {
  TV_ASSERT(shapes.size() == size());
  this->mShape = shapes;
  return *this;
}
// Returns a new view of the same memory with the given extents. The first
// extent equal to -1 (if any) is inferred so that the element count matches
// this view; later -1 entries are left untouched.
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Inds... newShapes) const {
  Shape shapes{int(newShapes)...};
  const size_t rank = shapes.ndim();
  size_t axis = 0;
  while (axis < rank && shapes[axis] != -1) {
    ++axis;
  }
  if (axis < rank) {
    // Neutralise the placeholder so the product covers only the known
    // extents, then fill in the quotient.
    shapes[axis] = 1;
    shapes[axis] = size() / shapes.size();
  }
  TV_ASSERT(shapes.size() == size());
  return TensorView<T, Rank>(mPtr, shapes);
}
// Returns a new view of the same memory with shape `shapes`, which must
// describe the same number of elements. No -1 inference is done here.
TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Shape shapes) const {
  TV_ASSERT(shapes.size() == size());
  TensorView<T, Rank> reshaped(mPtr, shapes);
  return reshaped;
}
// View of the same memory with every extent-1 dimension dropped.
TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze() const {
  const Shape squeezed = mShape.squeeze();
  return TensorView<T, Rank>(mPtr, squeezed);
}
// View of the same memory with dimension `dim` dropped if its extent is 1.
TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze(int dim) const {
  const Shape squeezed = mShape.squeeze(dim);
  return TensorView<T, Rank>(mPtr, squeezed);
}
// Total number of elements, as reported by the shape.
TV_HOST_DEVICE_INLINE size_t size() const {
  const size_t count = mShape.size();
  return count;
}
// Entry point for calls whose first argument is a Slice: forwards all
// arguments to the generic subview template below, naming its template
// arguments explicitly (T2 = float is a placeholder type).
template <class... Slices>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slice slice,
Slices... slices) const {
return subview<float, Slice, Slices...>(slice, slices...);
}
// Generic sub-view. Each argument is converted with to_slice(): a plain value
// becomes {value, -1, -1}, a Slice is used as is. For argument i, entry [0]
// is the start along dimension i and entry [1] is the end; an end of -1 picks
// the single position `start` and drops that dimension from the result.
// Dimensions beyond the supplied arguments are kept whole. The result shares
// this view's memory. T2 is not used in the body.
template <class T2 = float, class... Slices>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slices... slices) const {
Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
// Both start out as the per-dimension start positions.
Shape new_shape{to_slice(slices)[0]...};
Shape start{to_slice(slices)[0]...};
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
// Pad to the full rank: untouched dimensions start at 0.
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
}
}
// Linear offset of the first selected element.
auto offset = rowArrayIdx(mShape, start);
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
// Keep only the dimensions that were given a range (end != -1), followed by
// all trailing untouched dimensions.
Shape reduced_shape;
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<T, Rank>(mPtr + offset, reduced_shape);
}
// Sub-view selected by leading integer indexes: fixes the first
// 1 + sizeof...(ints) dimensions at {id, ints...} and returns a view over the
// remaining trailing dimensions, sharing this view's memory.
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(int id, Integers... ints) {
Shape start = {id, ints...};
// Pad the start position with zeros up to the full rank.
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(sizeof...(ints) + 1));
}
// Host-only textual dump of the view. Returns "" for an empty view. Otherwise
// the elements are streamed in row-major order inside nested square
// brackets, one bracket level per dimension, followed by
// "Tensor: dtype=<name>" where the name comes from detail::simpleTypeName<T>.
std::string repr() const {
std::ostringstream ss;
if (empty())
return "";
// Zero-dimensional view: just the single value and the dtype suffix.
if (mShape.ndim() == 0) {
ss << *mPtr;
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
return ss.str();
}
// `counter` is the multi-dimensional position of the element being printed.
Shape counter = mShape;
auto tensor_flat = this->view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
ss << "[";
}
for (size_t i = 0; i < this->size(); ++i) {
ss << tensor_flat(rowArrayIdx(mShape, counter));
counter[counter.ndim() - 1] += 1;
// Propagate carries from the last dimension towards the first; each carry
// closes one bracket level.
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == this->dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != this->size() - 1)
ss << ", ";
for (int j = 0; j < inc_count; ++j) {
ss << "]";
}
// Unless this was the last element, start a new line and reopen the
// bracket levels that were just closed.
if (i != this->size() - 1) {
if (inc_count != 0)
ss << "\n";
for (int j = 0; j < inc_count; ++j) {
ss << "[";
}
}
}
ss << "]";
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
return ss.str();
}
protected:
// TODO: make this function public.
// currently this function is called unexpectedly when using subview({0, 0}).
// Builds the sub-view described by `slice_vec`, one Slice per leading
// dimension. For slice i, entry [0] is the start along dimension i and entry
// [1] the end; an end of -1 selects the single position `start` and drops the
// dimension from the result. Dimensions beyond slice_vec.size() are kept
// whole. The result shares this view's memory.
//
// Const-qualified because it only reads mPtr/mShape; this is what allows the
// const operator[](SimpleVector<Slice>) overload to call it.
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
_subview(SimpleVector<Slice> slice_vec) const {
  Shape new_shape;
  for (size_t i = 0; i < slice_vec.size(); ++i) {
    new_shape.push_back(slice_vec[i][0]);
  }
  // `start` is the position of the first selected element.
  Shape start = new_shape;
  TV_ASSERT(new_shape.ndim() <= mShape.ndim());
  TV_ASSERT(new_shape.ndim() != 0);
  size_t idxsize = new_shape.ndim();
  // Pad to the full rank: untouched dimensions start at 0.
  for (size_t i = idxsize; i < mShape.ndim(); ++i) {
    new_shape.push_back(0);
    start.push_back(0);
  }
  for (size_t i = 0; i < slice_vec.size(); ++i) {
    if (slice_vec[i][1] != -1) {
      new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
      TV_ASSERT(new_shape[i] >= 0);
    } else {
      new_shape[i] = 1; // reduce dim
    }
  }
  auto offset = rowArrayIdx(mShape, start);
  for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
    new_shape[i] = mShape[i];
    TV_ASSERT(new_shape[i] >= 0);
  }
  // Keep only the ranged dimensions (end != -1), then all trailing untouched
  // dimensions.
  Shape reduced_shape;
  for (size_t i = 0; i < slice_vec.size(); ++i) {
    if (slice_vec[i][1] != -1) {
      reduced_shape.push_back(new_shape[i]);
    }
  }
  for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
    reduced_shape.push_back(new_shape[i]);
  }
  return TensorView<T, Rank>(mPtr + offset, reduced_shape);
}
// Normalises a subview argument to a Slice. A plain value becomes
// {int(s), -1, -1}.
template <typename T1> TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
return Slice{int(s), -1, -1};
}
// A Slice argument is passed through as a copy.
TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }
T *mPtr = nullptr;
Shape mShape;
};
// Streams a TensorView by writing its repr() text to `os`; works with any
// stream-like type that accepts a std::string via operator<<.
template <typename Os, typename T, int Rank>
Os &operator<<(Os &os, const TensorView<T, Rank> &dt) {
os << dt.repr();
return os;
}
// Same as above, declared separately for views whose element type is const.
template <typename Os, typename T, int Rank>
Os &operator<<(Os &os, const TensorView<const T, Rank> &dt) {
os << dt.repr();
return os;
}
namespace detail {
// Maps an arithmetic C++ type to the printf conversion used when printing
// tensor elements. Only the specializations below are defined; the argument
// exists purely to select the type and its value is ignored.
template <typename T> constexpr const char *printfTypeFormat(T val = T());
template <> constexpr const char *printfTypeFormat<float>(float) {
  return "%.2f";
}
template <> constexpr const char *printfTypeFormat<double>(double) {
  return "%.2f";
}
template <> constexpr const char *printfTypeFormat<int>(int) { return "%d"; }
template <> constexpr const char *printfTypeFormat<unsigned>(unsigned) {
  return "%u";
}
template <> constexpr const char *printfTypeFormat<long>(long) {
  return "%ld";
}
template <>
constexpr const char *printfTypeFormat<unsigned long>(unsigned long) {
  return "%lu";
}
} // namespace detail
// Prints `tensor` with printf, using `format` for every element. Nothing is
// printed for an empty view; a zero-dimensional view prints its single value
// and a newline. Otherwise elements appear in row-major order inside nested
// square brackets (one level per dimension), with a line break wherever a
// bracket level closes, and a final "]\n".
template <typename T>
TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
                                    const char *format) {
  if (tensor.empty())
    return;
  if (tensor.ndim() == 0) {
    printf(format, tensor());
    printf("\n");
    return;
  }
  // `cursor` tracks the multi-dimensional position of the current element.
  Shape cursor = tensor.shape();
  auto flat = tensor.view(-1);
  const int rank = int(cursor.ndim());
  for (int axis = 0; axis < rank; ++axis) {
    cursor[axis] = 0;
    printf("[");
  }
  const size_t total = tensor.size();
  for (size_t n = 0; n < total; ++n) {
    printf(format, flat(rowArrayIdx(tensor.shape(), cursor)));
    cursor[rank - 1] += 1;
    // Carry from the last dimension towards the first; dimension 0 never
    // carries. Each carry closes one bracket level.
    int closed = 0;
    for (int axis = rank - 1; axis > 0; --axis) {
      if (cursor[axis] == tensor.dim(axis)) {
        ++closed;
        cursor[axis - 1] += 1;
        cursor[axis] = 0;
      }
    }
    const bool is_last = (n == total - 1);
    if (closed == 0 && !is_last)
      printf(", ");
    for (int k = 0; k < closed; ++k) {
      printf("]");
    }
    if (!is_last) {
      // Start a new line and reopen the levels that were just closed.
      if (closed != 0)
        printf("\n");
      for (int k = 0; k < closed; ++k) {
        printf("[");
      }
    }
  }
  printf("]\n");
}
// Prints `tensor` using the default element format for its (const-stripped)
// element type.
template <typename T>
TV_HOST_DEVICE void printTensorView(TensorView<T> tensor) {
  using Traw = typename std::remove_const<T>::type;
  const char *element_format = detail::printfTypeFormat<Traw>();
  printTensorView(tensor, element_format);
}
// Prints the memory at `ptr`, interpreted with `shape`, using the default
// element format for T.
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape) {
  using Traw = typename std::remove_const<T>::type;
  const char *element_format = detail::printfTypeFormat<Traw>();
  printTensorView(TensorView<const T>(ptr, shape), element_format);
}
// Prints the memory at `ptr`, interpreted with `shape`, using a
// caller-supplied printf format for each element.
template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
                                    const char *format) {
  printTensorView(TensorView<const T>(ptr, shape), format);
}
} // namespace tv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <tensorview/tensorview.h>
#include <torch/script.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
namespace tv {
// tv::GPU variant whose stream is taken from PyTorch: on construction,
// mStream is set to the result of at::cuda::getCurrentCUDAStream().
// NOTE(review): mStream is presumably a member inherited from tv::GPU, which
// is declared elsewhere — confirm.
struct TorchGPU: public tv::GPU {
TorchGPU(){
mStream = at::cuda::getCurrentCUDAStream();
}
};
// Verifies that the C++ element type T (ignoring const) matches the scalar
// type of `tensor`. Supported scalar types are Double, Float, Int and Half;
// a mismatch or any other scalar type fails TV_ASSERT_RT_ERR.
template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
  using Elem = std::remove_const_t<T>;
  const auto dtype = tensor.type().scalarType();
  if (dtype == at::ScalarType::Double) {
    auto val = std::is_same<Elem, double>::value;
    TV_ASSERT_RT_ERR(val, "error");
  } else if (dtype == at::ScalarType::Float) {
    auto val = std::is_same<Elem, float>::value;
    TV_ASSERT_RT_ERR(val, "error");
  } else if (dtype == at::ScalarType::Int) {
    auto val = std::is_same<Elem, int>::value;
    TV_ASSERT_RT_ERR(val, "error");
  } else if (dtype == at::ScalarType::Half) {
    auto val = std::is_same<Elem, at::Half>::value;
    TV_ASSERT_RT_ERR(val, "error");
  } else {
    TV_ASSERT_RT_ERR(false, "error");
  }
}
// Wraps a torch tensor's storage in a tv::TensorView<T> without copying.
// The element type is validated with check_torch_dtype first; the shape is
// copied dimension by dimension from tensor.sizes().
template <typename T>
tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
  check_torch_dtype<T>(tensor);
  tv::Shape shape;
  const auto sizes = tensor.sizes();
  for (auto it = sizes.begin(); it != sizes.end(); ++it) {
    shape.push_back(*it);
  }
  auto *raw = tensor.data<std::remove_const_t<T>>();
  return tv::TensorView<T>(raw, shape);
}
} // namespace tv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <cuda_runtime_api.h>
#include <iostream>
namespace spconv {
// Wall-clock interval timer for code that launches CUDA work. Both the
// constructor and report() call cudaDeviceSynchronize() before reading the
// clock, so work already submitted to the device is waited for first.
// TimeT selects the unit of the reported value.
template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
  CudaContextTimer() {
    cudaDeviceSynchronize();
    mCurTime = std::chrono::steady_clock::now();
  }
  // Returns the time elapsed since construction or the previous report(), in
  // TimeT units, and starts the next interval.
  typename TimeT::rep report() {
    cudaDeviceSynchronize();
    // Read the clock once and use the same instant as the end of this
    // interval and the start of the next one, so consecutive reports tile the
    // timeline without dropping the time between two separate clock reads.
    const auto now = std::chrono::steady_clock::now();
    const auto duration = std::chrono::duration_cast<TimeT>(now - mCurTime);
    mCurTime = now;
    return duration.count();
  }

private:
  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};
// Wall-clock interval timer based on std::chrono::steady_clock.
// TimeT selects the unit of the reported value.
template <typename TimeT = std::chrono::microseconds> struct CPUTimer {
  CPUTimer() { mCurTime = std::chrono::steady_clock::now(); }
  // Returns the time elapsed since construction or the previous report(), in
  // TimeT units, and starts the next interval.
  typename TimeT::rep report() {
    // Read the clock once and use the same instant as the end of this
    // interval and the start of the next one, so consecutive reports tile the
    // timeline without dropping the time between two separate clock reads.
    const auto now = std::chrono::steady_clock::now();
    const auto duration = std::chrono::duration_cast<TimeT>(now - mCurTime);
    mCurTime = now;
    return duration.count();
  }

private:
  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};
} // namespace spconv
import os
import re
import sys
import platform
import subprocess
from setuptools import setup, Extension, find_packages
from setuptools.command.build_ext import build_ext
from distutils.version import LooseVersion
# The build needs to know where the libtorch C++ distribution lives; refuse to
# continue when the environment does not say.
if os.environ.get('LIBTORCH_ROOT') is None:
    raise ValueError("You must set LIBTORCH_ROOT to your torch c++ library.")
# "<major>.<minor>" of the interpreter running this script.
PYTHON_VERSION = "{}.{}".format(*sys.version_info[:2])
class CMakeExtension(Extension):
    """Extension placeholder whose sources are built by CMake.

    No source files are handed to setuptools; only the absolute path of the
    CMake project directory is recorded in ``sourcedir``.

    Args:
        name: extension name.
        sourcedir: directory containing the top-level CMakeLists.txt
            (made absolute).
        library_dirs: optional list of library directories forwarded to
            ``Extension``; defaults to an empty list.
    """

    def __init__(self, name, sourcedir='', library_dirs=None):
        # A fresh list per instance instead of a shared mutable default.
        if library_dirs is None:
            library_dirs = []
        Extension.__init__(self, name, sources=[], library_dirs=library_dirs)
        self.sourcedir = os.path.abspath(sourcedir)
class CMakeBuild(build_ext):
    """build_ext command that configures and builds each extension with CMake."""

    def run(self):
        """Check that cmake is runnable, reject Windows, build every extension."""
        try:
            subprocess.check_output(['cmake', '--version'])
        except OSError:
            names = ", ".join(e.name for e in self.extensions)
            raise RuntimeError("CMake must be installed to build the following extensions: " +
                               names)
        if platform.system() == "Windows":
            raise NotImplementedError
        for ext in self.extensions:
            self.build_extension(ext)

    def build_extension(self, ext):
        """Run the CMake configure step and then the build step for ``ext``."""
        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
        print(extdir)
        cmake_args = [
            '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir + "/spconv",
            '-DCMAKE_PREFIX_PATH=' + os.environ["LIBTORCH_ROOT"],
            '-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
            '-DSPCONV_BuildTests=OFF',
            '-DCMAKE_CUDA_FLAGS="--expt-relaxed-constexpr"',
        ]
        if self.debug:
            cfg = 'Debug'
        else:
            cfg = 'Release'
        # cfg = 'Debug'
        build_args = ['--config', cfg]
        print(cfg)
        on_windows = platform.system() == "Windows"
        if on_windows:
            cmake_args.append(
                '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir))
            if sys.maxsize > 2**32:
                cmake_args.extend(['-A', 'x64'])
            build_args.extend(['--', '/m'])
        else:
            cmake_args.append('-DCMAKE_BUILD_TYPE=' + cfg)
            build_args.extend(['--', '-j4'])
        # Only the configure step receives the augmented environment.
        env = os.environ.copy()
        previous_flags = env.get('CXXFLAGS', '')
        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(
            previous_flags, self.distribution.get_version())
        if not os.path.exists(self.build_temp):
            os.makedirs(self.build_temp)
        configure_cmd = ['cmake', ext.sourcedir] + cmake_args
        subprocess.check_call(configure_cmd, cwd=self.build_temp, env=env)
        build_cmd = ['cmake', '--build', '.'] + build_args
        subprocess.check_call(build_cmd, cwd=self.build_temp)
# Python packages to ship: everything find_packages discovers except `tools`
# and its subpackages.
packages = find_packages(exclude=('tools', 'tools.*'))
# Package metadata and build wiring. The single extension module is a
# CMakeExtension named 'spconv', and the build_ext command is replaced by
# CMakeBuild so the extension is built through CMake.
setup(
name='spconv',
version='1.0',
author='Yan Yan',
author_email='scrin@foxmail.com',
description='spatial sparse convolution for pytorch',
long_description='',
setup_requires = ['torch>=1.0.0'],
packages=packages,
package_dir = {'spconv': 'spconv'},
ext_modules=[CMakeExtension('spconv', library_dirs=[])],
cmdclass=dict(build_ext=CMakeBuild),
zip_safe=False,
)
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import torch
from spconv import utils
from spconv.conv import SparseConv2d, SparseConv3d, SubMConv2d, SubMConv3d
from spconv.conv import SparseConvTranspose2d, SparseConvTranspose3d
from spconv.conv import SparseInverseConv2d, SparseInverseConv3d
from spconv.modules import SparseModule, SparseSequential
from spconv.pool import SparseMaxPool2d, SparseMaxPool3d
# Path of the compiled shared library, expected to sit in the same directory as
# this module.
_LIB_PATH = str(Path(__file__).parent / "libspconv.so")
# Load the library at import time; the torch.ops.spconv.* operators used by
# spconv.ops are presumably registered by it -- TODO confirm against the C++ side.
torch.ops.load_library(_LIB_PATH)
def scatter_nd(indices, updates, shape):
    """PyTorch counterpart of TensorFlow's ``scatter_nd``.

    Builds a zero tensor of ``shape`` (same dtype/device as ``updates``) and
    writes ``updates`` into the positions addressed by ``indices``.

    Args:
        indices: integer tensor whose last dimension holds the coordinates
            into the leading ``indices.shape[-1]`` dimensions of the output.
        updates: values to scatter; must contain
            ``prod(indices.shape[:-1]) * prod(shape[indices.shape[-1]:])``
            elements.
        shape: list or tuple with the shape of the result.

    Returns:
        The scattered dense tensor.

    Note:
        No validation is performed. Repeated indices are NOT accumulated
        (unlike TensorFlow); one of the colliding updates simply wins.
    """
    shape = list(shape)  # accept tuples / torch.Size as well as lists
    ndim = indices.shape[-1]
    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
    flatted_indices = indices.view(-1, ndim)
    # Updates are flattened the same way as the indices so that both agree
    # even when ``indices`` has more than two dimensions.
    update_shape = [flatted_indices.shape[0]] + shape[ndim:]
    # Index with a tuple: list-based multi-dimensional indexing is deprecated.
    slices = tuple(flatted_indices[:, i] for i in range(ndim)) + (Ellipsis,)
    ret[slices] = updates.view(*update_shape)
    return ret
class SparseConvTensor(object):
    """Container bundling sparse features with their coordinates.

    Attributes are plain fields: ``features``, ``indices``, ``spatial_shape``,
    ``batch_size``, ``indice_dict`` (cache of index-pair data keyed by
    ``indice_key``) and ``grid`` (optional pre-allocated buffer).
    """

    def __init__(self, features, indices, spatial_shape, batch_size, grid=None):
        """
        Args:
            features: feature tensor; ``features.shape[1]`` is used as the
                channel count by ``dense``.
            indices: integer coordinate tensor with one row per active site.
                Converted to int32 when it has another dtype.
            spatial_shape: sequence with the extent of every spatial axis.
            batch_size: number of samples in the batch.
            grid: pre-allocated grid tensor. should be used when the volume of
                spatial shape is very large.
        """
        self.features = features
        # ``Tensor.int()`` returns a new tensor, so the result must be stored;
        # the original call discarded it and left the dtype unchanged.
        if indices.dtype != torch.int32:
            indices = indices.int()
        self.indices = indices
        self.spatial_shape = spatial_shape
        self.batch_size = batch_size
        self.indice_dict = {}
        self.grid = grid

    @property
    def spatial_size(self):
        """Product of all entries of ``spatial_shape``."""
        return np.prod(self.spatial_shape)

    def find_indice_pair(self, key):
        """Return the cached entry for ``key``; None if ``key`` is None or unknown."""
        if key is None:
            return None
        return self.indice_dict.get(key)

    def dense(self, channels_first=True):
        """Scatter the features into a dense tensor.

        Returns a tensor of shape ``[batch, *spatial_shape, channels]``, or
        ``[batch, channels, *spatial_shape]`` (contiguous) when
        ``channels_first`` is True.
        """
        output_shape = [self.batch_size] + list(self.spatial_shape) + [self.features.shape[1]]
        res = scatter_nd(self.indices.long(), self.features, output_shape)
        if not channels_first:
            return res
        ndim = len(self.spatial_shape)
        # Move the trailing channel axis to position 1.
        trans_params = list(range(0, ndim + 1))
        trans_params.insert(1, ndim + 1)
        return res.permute(*trans_params).contiguous()

    @property
    def sparity(self):
        """Fraction of active sites: rows of ``indices`` / (spatial volume * batch_size)."""
        return self.indices.shape[0] / np.prod(self.spatial_shape) / self.batch_size
class ToDense(SparseModule):
    """Module that turns a SparseConvTensor into a dense tensor.

    Delegates to ``SparseConvTensor.dense()`` with its default arguments
    (channels-first layout).
    """

    def forward(self, x: SparseConvTensor):
        dense_tensor = x.dense()
        return dense_tensor
class RemoveGrid(SparseModule):
    """Module that drops the pre-allocated grid buffer of its input.

    The input tensor is modified in place and returned.
    """

    def forward(self, x: SparseConvTensor):
        # Clearing the reference is all that is done here.
        x.grid = None
        return x
\ No newline at end of file
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import time
import numpy as np
import spconv
import spconv.functional as Fsp
import torch
from spconv import ops
from spconv.modules import SparseModule
from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
def _calculate_fan_in_and_fan_out_hwio(tensor):
dimensions = tensor.ndimension()
if dimensions < 2:
raise ValueError(
"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
)
if dimensions == 2: # Linear
fan_in = tensor.size(-2)
fan_out = tensor.size(-1)
else:
num_input_fmaps = tensor.size(-2)
num_output_fmaps = tensor.size(-1)
receptive_field_size = 1
if tensor.dim() > 2:
receptive_field_size = tensor[..., 0, 0].numel()
fan_in = num_input_fmaps * receptive_field_size
fan_out = num_output_fmaps * receptive_field_size
return fan_in, fan_out
class SparseConvolution(SparseModule):
    """Generic N-d sparse convolution operating on ``spconv.SparseConvTensor``.

    The weight is stored as ``(*kernel_size, in_channels, out_channels)``.
    Depending on the flags the layer acts as a regular, submanifold
    (``subm``), transposed (``transposed``) or inverse (``inverse``) sparse
    convolution.
    """

    def __init__(self,
                 ndim,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 subm=False,
                 output_padding=0,
                 transposed=False,
                 inverse=False,
                 indice_key=None):
        """
        Args:
            ndim: number of spatial dimensions.
            in_channels / out_channels: feature widths.
            kernel_size, stride, padding, dilation, output_padding: an int
                (broadcast to every dimension) or a list/tuple per dimension.
            groups: must be 1.
            bias: whether to add a learnable bias.
            subm: use the submanifold variant (output shape == input shape).
            transposed: compute the output shape as a transposed convolution.
            inverse: reuse the index pairs cached under ``indice_key``.
            indice_key: key under which index pairs are cached on the tensor.
        """
        super(SparseConvolution, self).__init__()
        assert groups == 1
        if not isinstance(kernel_size, (list, tuple)):
            kernel_size = [kernel_size] * ndim
        if not isinstance(stride, (list, tuple)):
            stride = [stride] * ndim
        if not isinstance(padding, (list, tuple)):
            padding = [padding] * ndim
        if not isinstance(dilation, (list, tuple)):
            dilation = [dilation] * ndim
        if not isinstance(output_padding, (list, tuple)):
            output_padding = [output_padding] * ndim

        # Stride and dilation cannot both differ from 1 on the same axis.
        for d, s in zip(dilation, stride):
            assert any([s == 1, d == 1]), "don't support this."

        self.ndim = ndim
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        # A kernel covering a single site reduces to a matrix multiply.
        self.conv1x1 = np.prod(kernel_size) == 1
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.transposed = transposed
        self.inverse = inverse
        self.output_padding = output_padding
        self.groups = groups
        self.subm = subm
        self.indice_key = indice_key

        self.weight = Parameter(
            torch.Tensor(*kernel_size, in_channels, out_channels))
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise weight (Kaiming uniform) and bias (uniform in +-1/sqrt(fan_in))."""
        # NOTE(review): kaiming_uniform_ derives its fan from torch's default
        # layout convention while the bias bound uses the HWIO helper below --
        # confirm whether the weight init should also use the HWIO fan.
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        """Apply the convolution to a ``spconv.SparseConvTensor``.

        For 1x1 kernels ``input.features`` is replaced in place and ``input``
        itself is returned. Otherwise a new SparseConvTensor is returned that
        shares ``indice_dict`` and ``grid`` with the input.
        """
        assert isinstance(input, spconv.SparseConvTensor)
        features = input.features
        device = features.device
        indices = input.indices
        spatial_shape = input.spatial_shape
        batch_size = input.batch_size
        if not self.subm:
            if self.transposed:
                out_spatial_shape = ops.get_deconv_output_size(
                    spatial_shape, self.kernel_size, self.stride, self.padding,
                    self.dilation, self.output_padding)
            else:
                out_spatial_shape = ops.get_conv_output_size(
                    spatial_shape, self.kernel_size, self.stride, self.padding,
                    self.dilation)
        else:
            out_spatial_shape = spatial_shape

        if self.conv1x1:
            input.features = torch.mm(
                input.features,
                self.weight.view(self.in_channels, self.out_channels))
            # ``is not None``: truth-testing a multi-element Parameter raises.
            if self.bias is not None:
                input.features += self.bias
            return input

        datas = input.find_indice_pair(self.indice_key)
        if self.inverse:
            # The inverse convolution swaps input/output roles of a cached pair.
            assert datas is not None and self.indice_key is not None
            _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas
        else:
            if self.indice_key is not None and datas is not None:
                outids, _, indice_pairs, indice_pair_num, _ = datas
            else:
                outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
                    indices, batch_size, spatial_shape, self.kernel_size,
                    self.stride, self.padding, self.dilation,
                    self.output_padding, self.subm, self.transposed,
                    grid=input.grid)
                input.indice_dict[self.indice_key] = (
                    outids, indices, indice_pairs, indice_pair_num, spatial_shape)

        if self.subm:
            out_features = Fsp.indice_subm_conv(features, self.weight,
                                                indice_pairs.to(device),
                                                indice_pair_num,
                                                outids.shape[0])
        else:
            if self.inverse:
                out_features = Fsp.indice_inverse_conv(
                    features, self.weight, indice_pairs.to(device),
                    indice_pair_num, outids.shape[0])
            else:
                out_features = Fsp.indice_conv(
                    features, self.weight, indice_pairs.to(device),
                    indice_pair_num, outids.shape[0])

        if self.bias is not None:
            out_features += self.bias
        out_tensor = spconv.SparseConvTensor(out_features, outids,
                                             out_spatial_shape, batch_size)
        out_tensor.indice_dict = input.indice_dict
        out_tensor.grid = input.grid
        return out_tensor
class SparseConv2d(SparseConvolution):
    """2-D sparse convolution: ``SparseConvolution`` with ``ndim`` fixed to 2."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, indice_key=None):
        super().__init__(
            ndim=2,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            indice_key=indice_key)
class SparseConv3d(SparseConvolution):
    """3-D sparse convolution: ``SparseConvolution`` with ``ndim`` fixed to 3."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, indice_key=None):
        super().__init__(
            ndim=3,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            indice_key=indice_key)
class SparseConvTranspose2d(SparseConvolution):
    """2-D transposed sparse convolution (``ndim=2``, ``transposed=True``)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, indice_key=None):
        super().__init__(
            ndim=2,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            transposed=True,
            indice_key=indice_key)
class SparseConvTranspose3d(SparseConvolution):
    """3-D transposed sparse convolution (``ndim=3``, ``transposed=True``)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, indice_key=None):
        super().__init__(
            ndim=3,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            transposed=True,
            indice_key=indice_key)
class SparseInverseConv2d(SparseConvolution):
    """2-D inverse sparse convolution (``ndim=2``, ``inverse=True``).

    Reuses the index pairs that an earlier layer cached under ``indice_key``.

    Args:
        in_channels / out_channels: feature widths.
        indice_key: key of the cached index pairs to invert.
        bias: whether to add a learnable bias.
        kernel_size: kernel size used to shape the weight. Keyword-only and
            required: the parent constructor has no default for it, so the
            previous signature could never be instantiated. Presumably it
            must equal the kernel size of the layer that produced
            ``indice_key`` -- verify against that layer.
    """

    def __init__(self, in_channels, out_channels, indice_key, bias=True, *,
                 kernel_size):
        super(SparseInverseConv2d, self).__init__(
            2,
            in_channels,
            out_channels,
            kernel_size,
            bias=bias,
            inverse=True,
            indice_key=indice_key)
class SparseInverseConv3d(SparseConvolution):
    """3-D inverse sparse convolution (``ndim=3``, ``inverse=True``).

    Reuses the index pairs that an earlier layer cached under ``indice_key``.

    Args:
        in_channels / out_channels: feature widths.
        indice_key: key of the cached index pairs to invert.
        bias: whether to add a learnable bias.
        kernel_size: kernel size used to shape the weight. Keyword-only and
            required: the parent constructor has no default for it, so the
            previous signature could never be instantiated. Presumably it
            must equal the kernel size of the layer that produced
            ``indice_key`` -- verify against that layer.
    """

    def __init__(self, in_channels, out_channels, indice_key, bias=True, *,
                 kernel_size):
        super(SparseInverseConv3d, self).__init__(
            3,
            in_channels,
            out_channels,
            kernel_size,
            bias=bias,
            inverse=True,
            indice_key=indice_key)
class SubMConv2d(SparseConvolution):
    """2-D submanifold sparse convolution (``ndim=2``, ``subm=True``)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, indice_key=None):
        super().__init__(
            ndim=2,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            subm=True,
            indice_key=indice_key)
class SubMConv3d(SparseConvolution):
    """3-D submanifold sparse convolution (``ndim=3``, ``subm=True``)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, indice_key=None):
        super().__init__(
            ndim=3,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            subm=True,
            indice_key=indice_key)
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import spconv.ops as ops
import torch
from torch import nn
from torch.autograd import Function
class SparseConvFunction(Function):
    """Autograd wrapper around ``ops.indice_conv`` with ``inverse=False``."""

    @staticmethod
    def forward(ctx, features, filters, indice_pairs, indice_pair_num,
                num_activate_out):
        # Keep everything the backward op needs.
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        return ops.indice_conv(features, filters, indice_pairs,
                               indice_pair_num, num_activate_out,
                               inverse=False)

    @staticmethod
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        grads = ops.indice_conv_backward(features, filters, grad_output,
                                         indice_pairs, indice_pair_num,
                                         inverse=False)
        input_bp, filters_bp = grads
        # Only features and filters receive gradients.
        return input_bp, filters_bp, None, None, None
class SparseInverseConvFunction(Function):
    """Autograd wrapper around ``ops.indice_conv`` with ``inverse=True``."""

    @staticmethod
    def forward(ctx, features, filters, indice_pairs, indice_pair_num,
                num_activate_out):
        # Keep everything the backward op needs.
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        return ops.indice_conv(features, filters, indice_pairs,
                               indice_pair_num, num_activate_out,
                               inverse=True, subm=False)

    @staticmethod
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        grads = ops.indice_conv_backward(features, filters, grad_output,
                                         indice_pairs, indice_pair_num,
                                         inverse=True, subm=False)
        input_bp, filters_bp = grads
        # Only features and filters receive gradients.
        return input_bp, filters_bp, None, None, None
class SubMConvFunction(Function):
    """Autograd wrapper around ``ops.indice_conv`` with ``subm=True``."""

    @staticmethod
    def forward(ctx, features, filters, indice_pairs, indice_pair_num,
                num_activate_out):
        # Keep everything the backward op needs.
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        return ops.indice_conv(features, filters, indice_pairs,
                               indice_pair_num, num_activate_out,
                               inverse=False, subm=True)

    @staticmethod
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        grads = ops.indice_conv_backward(features, filters, grad_output,
                                         indice_pairs, indice_pair_num,
                                         inverse=False, subm=True)
        input_bp, filters_bp = grads
        # Only features and filters receive gradients.
        return input_bp, filters_bp, None, None, None
class SparseMaxPoolFunction(Function):
    """Autograd wrapper around ``ops.indice_maxpool`` and its backward op."""

    @staticmethod
    def forward(ctx, features, indice_pairs, indice_pair_num,
                num_activate_out):
        pooled = ops.indice_maxpool(features, indice_pairs, indice_pair_num,
                                    num_activate_out)
        # The backward op takes both the input and the pooled output.
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, pooled)
        return pooled

    @staticmethod
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, pooled = ctx.saved_tensors
        input_bp = ops.indice_maxpool_backward(features, pooled, grad_output,
                                               indice_pairs, indice_pair_num)
        # Only features receive a gradient.
        return input_bp, None, None, None
# Functional entry points: each name is the ``apply`` of the corresponding
# autograd Function defined above, so calls go through autograd.
indice_conv = SparseConvFunction.apply
indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
import spconv
import torch
from torch import nn
import time
def is_spconv_module(module):
    """Return True when ``module`` is an instance of ``SparseModule``."""
    return isinstance(module, (SparseModule,))
def _mean_update(vals, m_vals, t):
outputs = []
if not isinstance(vals, list):
vals = [vals]
if not isinstance(m_vals, list):
m_vals = [m_vals]
for val, m_val in zip(vals, m_vals):
output = t / float(t + 1) * m_val + 1 / float(t + 1) * val
outputs.append(output)
if len(outputs) == 1:
outputs = outputs[0]
return outputs
class SparseModule(nn.Module):
    """Marker base class with no behaviour of its own.

    ``SparseSequential`` hands the whole sparse tensor to modules deriving
    from this class instead of only its feature matrix.
    """
class SparseSequential(SparseModule):
    r"""A sequential container.
    Modules will be added to it in the order they are passed in the constructor.
    Alternatively, an ordered dict of modules can also be passed in.

    Sparse modules (see ``is_spconv_module``) receive the SparseConvTensor
    itself; any other module is applied to ``input.features`` when the input
    is a SparseConvTensor, or to the input directly otherwise.

    To make it easier to understand, given is a small example::

        # Example of using Sequential
        model = SparseSequential(
                  SparseConv2d(1,20,5),
                  nn.ReLU(),
                  SparseConv2d(20,64,5),
                  nn.ReLU()
                )

        # Example of using Sequential with OrderedDict
        model = SparseSequential(OrderedDict([
                  ('conv1', SparseConv2d(1,20,5)),
                  ('relu1', nn.ReLU()),
                  ('conv2', SparseConv2d(20,64,5)),
                  ('relu2', nn.ReLU())
                ]))

        # Example of using Sequential with kwargs(python 3.6+)
        model = SparseSequential(
                  conv1=SparseConv2d(1,20,5),
                  relu1=nn.ReLU(),
                  conv2=SparseConv2d(20,64,5),
                  relu2=nn.ReLU()
                )
    """

    def __init__(self, *args, **kwargs):
        # Imported locally: the module-level imports do not provide ``sys``,
        # so the version check below used to raise NameError.
        import sys

        super(SparseSequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)
        for name, module in kwargs.items():
            if sys.version_info < (3, 6):
                raise ValueError("kwargs only supported in py36+")
            if name in self._modules:
                raise ValueError("name exists.")
            self.add_module(name, module)
        # Sparsity of the input seen by each sparse module, keyed by module name.
        self._sparity_dict = {}

    def __getitem__(self, idx):
        """Return the ``idx``-th module; negative indices count from the end."""
        if not (-len(self) <= idx < len(self)):
            raise IndexError('index {} is out of range'.format(idx))
        if idx < 0:
            idx += len(self)
        it = iter(self._modules.values())
        for i in range(idx):
            next(it)
        return next(it)

    def __len__(self):
        return len(self._modules)

    @property
    def sparity_dict(self):
        """Mapping of module name to the input sparsity recorded in ``forward``."""
        return self._sparity_dict

    def add(self, module, name=None):
        """Append ``module``; the name defaults to the current module count.

        Raises:
            KeyError: if ``name`` is already taken.
        """
        if name is None:
            name = str(len(self._modules))
            if name in self._modules:
                raise KeyError("name exists")
        self.add_module(name, module)

    def forward(self, input):
        for k, module in self._modules.items():
            if is_spconv_module(module):  # use SpConvTensor as input
                assert isinstance(input, spconv.SparseConvTensor)
                self._sparity_dict[k] = input.sparity
                input = module(input)
            else:
                if isinstance(input, spconv.SparseConvTensor):
                    # Dense modules are skipped when there is no active site.
                    if input.indices.shape[0] != 0:
                        input.features = module(input.features)
                else:
                    input = module(input)
        return input
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import spconv
import torch
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    """Per-axis output extent of a convolution.

    Every argument is a sequence with one entry per spatial axis. An axis
    whose kernel size is ``-1`` yields an output extent of 1; otherwise the
    extent is
    ``(in + 2*pad - dilation*(kernel - 1) - 1) // stride + 1``.

    Returns:
        list with one extent per axis of ``input_size``.
    """
    def _extent(axis):
        # Evaluated unconditionally, as in the reference formula, before the
        # ``-1`` special case is applied.
        span = dilation[axis] * (kernel_size[axis] - 1)
        regular = (input_size[axis] + 2 * padding[axis] - span - 1) // stride[axis] + 1
        return 1 if kernel_size[axis] == -1 else regular

    return [_extent(axis) for axis in range(len(input_size))]
def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
                           output_padding):
    """Per-axis output extent of a transposed convolution.

    Each extent is ``(in - 1)*stride - 2*pad + kernel + output_padding``.
    ``dilation`` is accepted but does not enter the formula.

    Raises:
        ValueError: if any kernel size equals ``-1``.
    """
    extents = []
    for axis in range(len(input_size)):
        kernel = kernel_size[axis]
        if kernel == -1:
            raise ValueError("deconv don't support kernel_size < 0")
        extents.append((input_size[axis] - 1) * stride[axis]
                       - 2 * padding[axis] + kernel + output_padding[axis])
    return extents
def get_indice_pairs(indices,
                     batch_size,
                     spatial_shape,
                     ksize=3,
                     stride=1,
                     padding=0,
                     dilation=1,
                     out_padding=0,
                     subm=False,
                     transpose=False,
                     grid=None):
    """Dispatch to the ``torch.ops.spconv`` index-pair operator.

    The number of spatial dimensions is ``indices.shape[1] - 1``. Scalar
    ``ksize``/``stride``/``padding``/``dilation``/``out_padding`` are repeated
    for every dimension. The output shape equals ``spatial_shape`` for
    submanifold convolutions and is otherwise computed with
    ``get_deconv_output_size`` (``transpose``) or ``get_conv_output_size``.
    When ``grid`` is given, the ``*_grid_*`` operator variant is used and the
    grid is passed as its second argument.

    Returns:
        Whatever the selected operator returns.

    Raises:
        NotImplementedError: for dimensionalities other than 2 and 3.
    """
    ndim = indices.shape[1] - 1

    def _per_dim(value):
        # Broadcast a scalar setting to one entry per spatial dimension.
        return value if isinstance(value, (list, tuple)) else [value] * ndim

    ksize = _per_dim(ksize)
    stride = _per_dim(stride)
    padding = _per_dim(padding)
    dilation = _per_dim(dilation)
    out_padding = _per_dim(out_padding)

    for d, s in zip(dilation, stride):
        assert any([s == 1, d == 1]), "don't support this."

    if subm:
        out_shape = spatial_shape
    elif transpose:
        out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
                                           padding, dilation, out_padding)
    else:
        out_shape = get_conv_output_size(spatial_shape, ksize, stride,
                                         padding, dilation)

    if ndim not in (2, 3):
        raise NotImplementedError
    trailing_args = (batch_size, out_shape, spatial_shape, ksize, stride,
                     padding, dilation, out_padding, subm, transpose)
    if grid is None:
        op = getattr(torch.ops.spconv, "get_indice_pairs_{}d".format(ndim))
        return op(indices, *trailing_args)
    op = getattr(torch.ops.spconv, "get_indice_pairs_grid_{}d".format(ndim))
    return op(indices, grid, *trailing_args)
def indice_conv(features,
                filters,
                indice_pairs,
                indice_pair_num,
                num_activate_out,
                inverse=False,
                subm=False):
    """Call the fp32 or half ``torch.ops.spconv.indice_conv_*`` operator.

    The variant is selected from ``filters.dtype``.

    Raises:
        NotImplementedError: for any other filter dtype.
    """
    dtype = filters.dtype
    if dtype == torch.float32:
        kernel = torch.ops.spconv.indice_conv_fp32
    elif dtype == torch.half:
        kernel = torch.ops.spconv.indice_conv_half
    else:
        raise NotImplementedError
    return kernel(features, filters, indice_pairs, indice_pair_num,
                  num_activate_out, inverse, subm)
def indice_conv_backward(features,
                         filters,
                         out_bp,
                         indice_pairs,
                         indice_pair_num,
                         inverse=False,
                         subm=False):
    """Call the fp32 or half ``torch.ops.spconv.indice_conv_backward_*`` operator.

    The variant is selected from ``filters.dtype``.

    Raises:
        NotImplementedError: for any other filter dtype.
    """
    dtype = filters.dtype
    if dtype == torch.float32:
        kernel = torch.ops.spconv.indice_conv_backward_fp32
    elif dtype == torch.half:
        kernel = torch.ops.spconv.indice_conv_backward_half
    else:
        raise NotImplementedError
    return kernel(features, filters, out_bp, indice_pairs, indice_pair_num,
                  inverse, subm)
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
    """Call the fp32 or half ``torch.ops.spconv.indice_maxpool_*`` operator.

    The variant is selected from ``features.dtype``.

    Raises:
        NotImplementedError: for any other feature dtype.
    """
    dtype = features.dtype
    if dtype == torch.float32:
        kernel = torch.ops.spconv.indice_maxpool_fp32
    elif dtype == torch.half:
        kernel = torch.ops.spconv.indice_maxpool_half
    else:
        raise NotImplementedError
    return kernel(features, indice_pairs, indice_pair_num, num_activate_out)
def indice_maxpool_backward(features, out_features, out_bp, indice_pairs, indice_pair_num):
    """Call the fp32 or half ``torch.ops.spconv.indice_maxpool_backward_*`` operator.

    The variant is selected from ``features.dtype``.

    Raises:
        NotImplementedError: for any other feature dtype.
    """
    dtype = features.dtype
    if dtype == torch.float32:
        kernel = torch.ops.spconv.indice_maxpool_backward_fp32
    elif dtype == torch.half:
        kernel = torch.ops.spconv.indice_maxpool_backward_half
    else:
        raise NotImplementedError
    return kernel(features, out_features, out_bp, indice_pairs, indice_pair_num)
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import time
import numpy as np
import spconv
import spconv.functional as Fsp
import torch
from spconv import ops
from spconv.modules import SparseModule
from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
class SparseMaxPool(SparseModule):
    """N-d max pooling over a ``spconv.SparseConvTensor``."""

    def __init__(self,
                 ndim,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 subm=False):
        """Scalar settings are repeated ``ndim`` times; lists/tuples are kept as given."""
        super().__init__()

        def _per_dim(value):
            return value if isinstance(value, (list, tuple)) else [value] * ndim

        self.ndim = ndim
        self.kernel_size = _per_dim(kernel_size)
        self.stride = _per_dim(stride)
        self.padding = _per_dim(padding)
        self.subm = subm
        self.dilation = _per_dim(dilation)

    def forward(self, input):
        """Pool ``input`` and return a new SparseConvTensor.

        The result shares ``indice_dict`` and ``grid`` with the input.
        """
        assert isinstance(input, spconv.SparseConvTensor)
        features = input.features
        device = features.device
        spatial_shape = input.spatial_shape
        batch_size = input.batch_size

        if self.subm:
            out_spatial_shape = spatial_shape
        else:
            out_spatial_shape = ops.get_conv_output_size(
                spatial_shape, self.kernel_size, self.stride, self.padding,
                self.dilation)

        # out_padding is fixed to 0 for pooling.
        outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
            input.indices, batch_size, spatial_shape, self.kernel_size,
            self.stride, self.padding, self.dilation, 0, self.subm)

        pooled = Fsp.indice_maxpool(features, indice_pairs.to(device),
                                    indice_pairs_num.to(device),
                                    outids.shape[0])
        result = spconv.SparseConvTensor(pooled, outids, out_spatial_shape,
                                         batch_size)
        result.indice_dict = input.indice_dict
        result.grid = input.grid
        return result
class SparseMaxPool2d(SparseMaxPool):
    """2-D sparse max pooling: ``SparseMaxPool`` with ``ndim`` fixed to 2."""

    def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
        super().__init__(
            ndim=2,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation)
class SparseMaxPool3d(SparseMaxPool):
    """3-D sparse max pooling: ``SparseMaxPool`` with ``ndim`` fixed to 3."""

    def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
        super().__init__(
            ndim=3,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation)
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
class TestCase(unittest.TestCase):
    """``unittest.TestCase`` with numpy-aware equality and closeness assertions.

    On failure the helpers print the offending positions and values before
    delegating to the matching ``np.testing`` assertion.
    """

    def _GetNdArray(self, a):
        """Return ``a`` itself if it is an ndarray, else ``np.array(a)``."""
        return a if isinstance(a, np.ndarray) else np.array(a)

    def assertAllEqual(self, a, b):
        """Asserts that two numpy arrays have the same values.

        For float32/float64 inputs, positions where both sides are NaN count
        as equal.

        Args:
          a: the expected numpy ndarray or anything can be converted to one.
          b: the actual numpy ndarray or anything can be converted to one.
        """
        expected = self._GetNdArray(a)
        actual = self._GetNdArray(b)
        self.assertEqual(
            expected.shape, actual.shape,
            "Shape mismatch: expected %s, got %s." % (expected.shape,
                                                      actual.shape))
        matches = (expected == actual)
        if expected.dtype == np.float32 or expected.dtype == np.float64:
            both_nan = np.logical_and(np.isnan(expected), np.isnan(actual))
            matches = np.logical_or(matches, both_nan)
        if np.all(matches):
            return
        # Prints more details than np.testing.assert_array_equal.
        mismatch = np.logical_not(matches)
        if expected.ndim:
            lhs = expected[np.where(mismatch)]
            rhs = actual[np.where(mismatch)]
            print("not equal where = ", np.where(mismatch))
        else:
            # np.where is broken for scalars
            lhs, rhs = expected, actual
        print("not equal lhs = ", lhs)
        print("not equal rhs = ", rhs)
        np.testing.assert_array_equal(expected, actual)

    def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
        """Asserts that two numpy arrays, or dicts of same, have near values.

        This does not support nested dicts.

        Args:
          a: The expected numpy ndarray (or anything can be converted to one),
            or dict of same. Must be a dict iff `b` is a dict.
          b: The actual numpy ndarray (or anything can be converted to one),
            or dict of same. Must be a dict iff `a` is a dict.
          rtol: relative tolerance.
          atol: absolute tolerance.

        Raises:
          ValueError: if only one of `a` and `b` is a dict.
        """
        a_is_dict = isinstance(a, dict)
        if a_is_dict != isinstance(b, dict):
            raise ValueError("Can't compare dict to non-dict, %s vs %s." % (a,
                                                                            b))
        if not a_is_dict:
            self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
            return
        self.assertCountEqual(
            a.keys(),
            b.keys(),
            msg="mismatched keys, expected %s, got %s" % (a.keys(),
                                                          b.keys()))
        for key in a:
            self._assertArrayLikeAllClose(
                a[key],
                b[key],
                rtol=rtol,
                atol=atol,
                msg="%s: expected %s, got %s." % (key, a, b))

    def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
        """Closeness check for a single pair of array-likes (see assertAllClose)."""
        expected = self._GetNdArray(a)
        actual = self._GetNdArray(b)
        self.assertEqual(
            expected.shape, actual.shape,
            "Shape mismatch: expected %s, got %s." % (expected.shape,
                                                      actual.shape))
        if np.allclose(expected, actual, rtol=rtol, atol=atol):
            return
        # Prints more details than np.testing.assert_allclose.
        #
        # An element violates the condition when |a - b| exceeds
        # atol + rtol * |b|, or when exactly one side is NaN.
        violating = np.logical_or(
            np.abs(expected - actual) > atol + rtol * np.abs(actual),
            np.isnan(expected) != np.isnan(actual))
        if expected.ndim:
            lhs = expected[np.where(violating)]
            rhs = actual[np.where(violating)]
            print("not close where = ", np.where(violating))
        else:
            # np.where is broken for scalars
            lhs, rhs = expected, actual
        print("not close lhs = ", lhs)
        print("not close rhs = ", rhs)
        print("not close dif = ", np.abs(lhs - rhs))
        print("not close tol = ", atol + rtol * np.abs(rhs))
        print("dtype = %s, shape = %s" % (expected.dtype, expected.shape))
        np.testing.assert_allclose(
            expected, actual, rtol=rtol, atol=atol, err_msg=msg)
def params_grid(*params):
    """Return the cartesian product of the given parameter sequences.

    Args:
        *params: one sequence of candidate values per parameter.

    Returns:
        A list of lists; each inner list holds one value per parameter, in
        the order the parameters were given. The last parameter varies
        fastest. The result is empty when any sequence is empty, and
        ``[[]]`` when no sequence is given (the hand-rolled counter loop this
        replaces raised IndexError in that case).
    """
    # Local import keeps the block self-contained.
    import itertools

    return [list(combo) for combo in itertools.product(*params)]
def generate_sparse_data(shape,
                         num_points,
                         num_channels,
                         integer=False,
                         data_range=(-1, 1),
                         with_dense=True,
                         dtype=np.float32):
    """Create random sparse features with matching coordinates.

    Args:
        shape: spatial extent per axis.
        num_points: one entry per batch element, the number of coordinates to
            draw for it.
        num_channels: feature width.
        integer: draw features with ``np.random.randint`` instead of
            ``np.random.uniform``.
        data_range: ``(low, high)`` passed to the sampler.
        with_dense: also build the dense counterpart of the features.
        dtype: dtype of the returned feature arrays.

    Returns:
        dict with ``"features"`` ([sum(num_points), num_channels]),
        ``"indices"`` (int32, spatial coordinates followed by the batch index
        in the LAST column) and, if requested, ``"features_dense"``
        ([batch, num_channels, *shape]).
    """
    ndim = len(shape)
    num_points = np.array(num_points)
    batch_size = len(num_points)

    # Every coordinate of the grid, one row each.
    axes = [np.arange(0, extent) for extent in shape]
    all_coords = np.stack(np.meshgrid(*axes), axis=-1).reshape(-1, ndim)

    per_batch = []
    for batch_idx in range(batch_size):
        # Shuffling and taking a prefix gives distinct coordinates per sample.
        np.random.shuffle(all_coords)
        chosen = all_coords[:num_points[batch_idx]]
        per_batch.append(
            np.pad(chosen, ((0, 0), (0, 1)), mode="constant",
                   constant_values=batch_idx))

    sampler = np.random.randint if integer else np.random.uniform
    feats = sampler(
        data_range[0], data_range[1],
        size=[num_points.sum(), num_channels]).astype(dtype)

    res = {
        "features": feats.astype(dtype),
    }
    if with_dense:
        dense = np.zeros(
            [batch_size, num_channels, *shape], dtype=feats.dtype)
        offset = 0
        for batch_idx, coords in enumerate(per_batch):
            rows = feats[offset:offset + len(coords)]
            for row, coord in zip(rows, coords):
                # Drop the trailing batch column to address the spatial axes.
                dense[(batch_idx, slice(None), *coord[:-1])] = row
            offset += len(coords)
        res["features_dense"] = dense.astype(dtype)
    res["indices"] = np.concatenate(per_batch, axis=0).astype(np.int32)
    return res
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from spconv import spconv_utils
from spconv.spconv_utils import (non_max_suppression, non_max_suppression_cpu,
points_to_voxel_3d_np, rbbox_iou,
rotate_non_max_suppression_cpu)
def points_to_voxel(points,
                    voxel_size,
                    coors_range,
                    coor_to_voxelidx,
                    max_points=35,
                    max_voxels=20000):
    """Convert 3d points (N, >=3) to voxels via ``points_to_voxel_3d_np``.

    Output buffers are allocated for ``max_voxels`` voxels, filled by the
    native routine, and trimmed to the voxel count it returns.

    Args:
        points: [N, ndim] float array. points[:, :3] contain xyz points and
            points[:, 3:] contain other information such as reflectivity.
        voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size
        coors_range: [6] list/tuple or array, float. indicate voxel range.
            format: xyzxyz, minmax
        coor_to_voxelidx: int array. used as a dense map.
        max_points: int. indicate maximum points contained in a voxel.
        max_voxels: int. indicate maximum voxels this function create.
            for voxelnet, 20000 is a good choice. you should shuffle points
            before call this function because max_voxels may drop some points.

    Returns:
        voxels: [M, max_points, ndim] float array. only contain points.
        coordinates: [M, 3] int32 array. zyx format (per the original notes).
        num_points_per_voxel: [M] int32 array.
    """
    if not isinstance(voxel_size, np.ndarray):
        voxel_size = np.array(voxel_size, dtype=points.dtype)
    if not isinstance(coors_range, np.ndarray):
        coors_range = np.array(coors_range, dtype=points.dtype)

    # Reversed grid shape; computed here but not consumed by the call below.
    extent = coors_range[3:] - coors_range[:3]
    voxelmap_shape = tuple(
        np.round(extent / voxel_size).astype(np.int32).tolist())[::-1]

    point_dim = points.shape[-1]
    num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
    voxels = np.zeros(
        shape=(max_voxels, max_points, point_dim), dtype=points.dtype)
    coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)

    voxel_num = points_to_voxel_3d_np(
        points, voxels, coors, num_points_per_voxel, coor_to_voxelidx,
        voxel_size.tolist(), coors_range.tolist(), max_points, max_voxels)

    # Keep only the entries that were actually produced.
    return (voxels[:voxel_num], coors[:voxel_num],
            num_points_per_voxel[:voxel_num])
class VoxelGenerator:
    """Reusable voxelizer bound to a fixed voxel size and point cloud range.

    The constructor derives the grid size from the range and voxel size and
    allocates a dense int32 lookup array filled with -1, which is handed to
    ``points_to_voxel`` on every ``generate`` call.
    """

    def __init__(self,
                 voxel_size,
                 point_cloud_range,
                 max_num_points,
                 max_voxels=20000):
        """Store the configuration and allocate the dense lookup array.

        Args:
            voxel_size: per-axis voxel size, converted to a float32 array.
            point_cloud_range: min corner followed by max corner, converted
                to a float32 array, e.g. [0, -40, -3, 70.4, 40, 1].
            max_num_points: maximum number of points kept per voxel.
            max_voxels: default voxel budget used by ``generate``.
        """
        pc_range = np.array(point_cloud_range, dtype=np.float32)
        cell_size = np.array(voxel_size, dtype=np.float32)
        extent = pc_range[3:] - pc_range[:3]
        cells = np.round(extent / cell_size).astype(np.int64)
        # The dense map uses the reversed axis order of the grid size
        # (zyx if the inputs are given as xyz).
        dense_shape = tuple(reversed(cells.astype(np.int32).tolist()))

        self._voxel_size = cell_size
        self._point_cloud_range = pc_range
        self._max_num_points = max_num_points
        self._max_voxels = max_voxels
        self._grid_size = cells
        self._coor_to_voxelidx = np.full(dense_shape, -1, dtype=np.int32)

    def generate(self, points, max_voxels=None):
        """Voxelize ``points`` and return the result of ``points_to_voxel``.

        A falsy ``max_voxels`` (None or 0) falls back to the budget given at
        construction time.
        """
        budget = max_voxels or self._max_voxels
        return points_to_voxel(points, self._voxel_size,
                               self._point_cloud_range,
                               self._coor_to_voxelidx, self._max_num_points,
                               budget)

    @property
    def voxel_size(self):
        """Voxel size as a float32 array."""
        return self._voxel_size

    @property
    def max_num_points_per_voxel(self):
        """Maximum number of points kept per voxel."""
        return self._max_num_points

    @property
    def point_cloud_range(self):
        """Point cloud range as a float32 array."""
        return self._point_cloud_range

    @property
    def grid_size(self):
        """Number of voxels along each axis as an int64 array."""
        return self._grid_size
\ No newline at end of file
# Shared library bundling the C++ and CUDA sources of the spconv ops.
add_library(spconv SHARED
  all.cc
  indice.cc indice.cu
  reordering.cc reordering.cu
  maxpool.cc maxpool.cu)
target_include_directories(spconv PRIVATE ${ALL_INCLUDE})
# C++14 / CUDA 14 with separable CUDA compilation enabled.
set_target_properties(spconv PROPERTIES
  CUDA_STANDARD 14
  CXX_STANDARD 14
  CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(spconv PRIVATE ${ALL_LIBS})
install(TARGETS spconv DESTINATION lib)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
// Registers the spconv operators with torch::jit under the "spconv::" prefix.
// Because `registry` is a namespace-scope static, the registration chain runs
// during static initialization of this translation unit.
static auto registry =
// Indice pair generation for 2D/3D, plus the variants taking a pre-built grid.
torch::jit::RegisterOperators("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
.op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
.op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>)
.op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>)
// Sparse convolution forward/backward, instantiated for float and at::Half.
.op("spconv::indice_conv_fp32", &spconv::indiceConv<float>)
.op("spconv::indice_conv_backward_fp32", &spconv::indiceConvBackward<float>)
.op("spconv::indice_conv_half", &spconv::indiceConv<at::Half>)
.op("spconv::indice_conv_backward_half",
&spconv::indiceConvBackward<at::Half>)
// Sparse max pooling forward/backward, instantiated for float and at::Half.
.op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
.op("spconv::indice_maxpool_backward_fp32",
&spconv::indiceMaxPoolBackward<float>)
.op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
.op("spconv::indice_maxpool_backward_half",
&spconv::indiceMaxPoolBackward<at::Half>);
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
// CPU specialization that generates the indice pairs for a sparse
// convolution. Forwards to getIndicePairsDeConv when `transpose` is set and
// to getIndicePairsConv otherwise, and returns the helper's result.
// `d` and `resetGrid` are not read by this specialization.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    // The host helpers take the raw storage of the per-dimension vectors.
    auto kernelPtr = kernelSize.data();
    auto stridePtr = stride.data();
    auto paddingPtr = padding.data();
    auto dilationPtr = dilation.data();
    auto outShapePtr = outSpatialShape.data();

    if (!transpose) {
      return getIndicePairsConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum, kernelPtr,
          stridePtr, paddingPtr, dilationPtr, outShapePtr);
    }
    return getIndicePairsDeConv<Index, IndexGrid, NDim>(
        indicesIn, indicesOut, gridsOut, indicePairs, indiceNum, kernelPtr,
        stridePtr, paddingPtr, dilationPtr, outShapePtr);
  }
};
// CPU specialization that generates the indice pairs for a submanifold
// convolution by forwarding to getIndicePairsSubM and returning its result.
// `d`, `transpose` and `resetGrid` are not read by this specialization.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    // The host helper takes the raw storage of the per-dimension vectors.
    auto kernelPtr = kernelSize.data();
    auto stridePtr = stride.data();
    auto paddingPtr = padding.data();
    auto dilationPtr = dilation.data();
    auto outShapePtr = outSpatialShape.data();

    return getIndicePairsSubM<Index, IndexGrid, NDim>(
        indicesIn, gridsOut, indicePairs, indiceNum, kernelPtr, stridePtr,
        paddingPtr, dilationPtr, outShapePtr);
  }
};
} // namespace functor
// Explicit instantiations of the CPU indice pair functors for Index in
// {int, long}, IndexGrid = int and NDim in 1..4. The helper macros are local
// to this file and are undefined again right after use.
#define SPCONV_INSTANTIATE_CPU_INDICE(Index, NDIM)                             \
  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int,    \
                                                       NDIM>;                  \
  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int,    \
                                                       NDIM>
#define SPCONV_INSTANTIATE_CPU_INDICE_ALL_NDIM(Index)                          \
  SPCONV_INSTANTIATE_CPU_INDICE(Index, 1);                                     \
  SPCONV_INSTANTIATE_CPU_INDICE(Index, 2);                                     \
  SPCONV_INSTANTIATE_CPU_INDICE(Index, 3);                                     \
  SPCONV_INSTANTIATE_CPU_INDICE(Index, 4)

SPCONV_INSTANTIATE_CPU_INDICE_ALL_NDIM(int);
SPCONV_INSTANTIATE_CPU_INDICE_ALL_NDIM(long);

#undef SPCONV_INSTANTIATE_CPU_INDICE_ALL_NDIM
#undef SPCONV_INSTANTIATE_CPU_INDICE
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/indice.h>
#include <spconv/indice.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
namespace functor {
// GPU specialization of the P1 indice pair functor.
//
// Launches one preparation kernel on d.stream(), sized by the number of rows
// in `indicesIn`: prepareDeConvIndicePairsKernel when `transpose` is set,
// prepareIndicePairsKernel otherwise. No synchronization is performed here.
//
// Returns 0 without launching anything when `indicesIn` has no rows, and 1
// after a kernel has been enqueued.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose) {
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0)
      return 0;
    // The trailing template argument 256 presumably matches the maximum
    // kernel volume accepted by the caller -- TODO confirm.
    if (transpose)
      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 256>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.stream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                           indiceNum, indicePairUnique, kernelSize, stride,
                           padding, dilation, outSpatialShape);
    else
      prepareIndicePairsKernel<Index, IndexGrid, NDim, 256>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.stream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                           indiceNum, indicePairUnique, kernelSize, stride,
                           padding, dilation, outSpatialShape);
    return 1;
  }
};
// GPU specialization of the P2 indice pair functor.
//
// Enqueues, on d.stream() and in this order:
//   1. assignGridAndIndiceOutKernel over numAct elements,
//   2. assignIndicePairsKernel over the rows of `indicesIn`,
//   3. resetGridKernel over numAct elements, only when `resetGrid` is set.
// No synchronization is performed here.
//
// Returns 0 without launching anything when `indicesIn` has no rows;
// otherwise returns numAct, which is indicePairUnique.dim(0) - 1.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    Index batchSize = gridsOut.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0)
      return 0;
    // One entry of indicePairUnique is excluded from the count; presumably
    // it is a sentinel appended by the caller -- TODO confirm.
    Index numAct = indicePairUnique.dim(0) - 1;
    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
           d.stream()>>>(indicesOut, gridsOut, numAct, indicePairs,
                         indicePairUnique, outSpatialShape, batchSize);
    assignIndicePairsKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.stream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
                         indicePairUnique, outSpatialShape);
    if (resetGrid) {
      resetGridKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
             d.stream()>>>(indicePairUnique.data(), gridsOut, numAct);
    }
    return numAct;
  }
};
// GPU specialization that generates the indice pairs for a submanifold
// convolution. Enqueues prepareSubMGridKernel, then getSubMIndicePairsKernel
// and, when `resetGrid` is set, resetGridSubMKernel -- all on d.stream() and
// all sized by the number of rows in `indicesIn`.
// Returns 0 when there are no input rows, otherwise that row count.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    auto activeCount = indicesIn.dim(0);
    if (activeCount == 0) {
      return 0;
    }

    // Every kernel below uses the same launch configuration.
    const auto gridDims = tv::launch::getBlocks(activeCount);
    auto cudaStream = d.stream();

    prepareSubMGridKernel<Index, IndexGrid, NDim>
        <<<gridDims, tv::launch::CUDA_NUM_THREADS, 0, cudaStream>>>(
            indicesIn, gridsOut, outSpatialShape);

    getSubMIndicePairsKernel<Index, IndexGrid, NDim>
        <<<gridDims, tv::launch::CUDA_NUM_THREADS, 0, cudaStream>>>(
            indicesIn, gridsOut, indicePairs, indiceNum, kernelSize, stride,
            padding, dilation, outSpatialShape);

    if (resetGrid) {
      resetGridSubMKernel<Index, IndexGrid, NDim>
          <<<gridDims, tv::launch::CUDA_NUM_THREADS, 0, cudaStream>>>(
              indicesIn.data(), gridsOut, outSpatialShape, activeCount);
    }
    return activeCount;
  }
};
} // namespace functor
// Explicit instantiations of the GPU indice pair functors for Index = int,
// IndexGrid = int and NDim in 1..4. The helper macros are local to this file
// and are undefined again right after use.
#define SPCONV_INSTANTIATE_GPU_INDICE(Index, NDIM)                             \
  template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int,    \
                                                       NDIM>;                  \
  template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int,  \
                                                         NDIM>;                \
  template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int,  \
                                                         NDIM>;                \
  template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int,    \
                                                       NDIM>
#define SPCONV_INSTANTIATE_GPU_INDICE_ALL_NDIM(Index)                          \
  SPCONV_INSTANTIATE_GPU_INDICE(Index, 1);                                     \
  SPCONV_INSTANTIATE_GPU_INDICE(Index, 2);                                     \
  SPCONV_INSTANTIATE_GPU_INDICE(Index, 3);                                     \
  SPCONV_INSTANTIATE_GPU_INDICE(Index, 4)

SPCONV_INSTANTIATE_GPU_INDICE_ALL_NDIM(int);

#undef SPCONV_INSTANTIATE_GPU_INDICE_ALL_NDIM
#undef SPCONV_INSTANTIATE_GPU_INDICE
} // namespace spconv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/maxpool.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
// CPU forward pass of sparse max pooling.
//
// For each of the first `size` (input row, output row) pairs read from
// indices.subview(0) and indices.subview(1), every channel of the output row
// is replaced by the input value when the input value is larger. Values
// already stored in `outFeatures` therefore take part in the comparison.
// The channel count is outFeatures.dim(1) and is used as the row stride of
// both feature buffers. `d` is not read.
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    int numPlanes = outFeatures.dim(1);
    auto dst = outFeatures.data();
    auto src = inFeatures.data();
    auto srcRows = indices.subview(0).data();
    auto dstRows = indices.subview(1).data();
    for (int pair = 0; pair < size; ++pair) {
      // Flat offsets of the first channel of the paired rows.
      Index srcBase = srcRows[pair] * numPlanes;
      Index dstBase = dstRows[pair] * numPlanes;
      for (int c = 0; c < numPlanes; ++c) {
        if (dst[dstBase + c] < src[srcBase + c]) {
          dst[dstBase + c] = src[srcBase + c];
        }
      }
    }
  }
};
// CPU backward pass of sparse max pooling.
//
// For each of the first `size` (input row, output row) pairs read from
// indices.subview(0) and indices.subview(1), the output gradient of a channel
// is accumulated into `din` wherever the input value compares exactly equal
// (operator==) to the pooled output value.
// The channel count is outFeatures.dim(1) and is used as the row stride of
// all four feature/gradient buffers. `d` is not read.
template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<const T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const T> dout, tv::TensorView<T> din,
                  tv::TensorView<const Index> indices, int size) {
    int numPlanes = outFeatures.dim(1);
    auto pooled = outFeatures.data();
    auto src = inFeatures.data();
    auto gradOut = dout.data();
    auto gradIn = din.data();
    auto srcRows = indices.subview(0).data();
    auto dstRows = indices.subview(1).data();
    for (int pair = 0; pair < size; ++pair) {
      // Flat offsets of the first channel of the paired rows.
      Index srcBase = srcRows[pair] * numPlanes;
      Index dstBase = dstRows[pair] * numPlanes;
      for (int c = 0; c < numPlanes; ++c) {
        if (pooled[dstBase + c] == src[srcBase + c]) {
          gradIn[srcBase + c] += gradOut[dstBase + c];
        }
      }
    }
  }
};
} // namespace functor
// Explicit instantiations of the CPU max pooling functors for
// T in {float, double, at::Half} and Index in {int, long}. The helper macros
// are local to this file and are undefined again right after use.
#define SPCONV_INSTANTIATE_CPU_MAXPOOL(T, Index)                               \
  template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>;     \
  template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>
#define SPCONV_INSTANTIATE_CPU_MAXPOOL_ALL_INDEX(T)                            \
  SPCONV_INSTANTIATE_CPU_MAXPOOL(T, int);                                      \
  SPCONV_INSTANTIATE_CPU_MAXPOOL(T, long)

SPCONV_INSTANTIATE_CPU_MAXPOOL_ALL_INDEX(float);
SPCONV_INSTANTIATE_CPU_MAXPOOL_ALL_INDEX(double);
SPCONV_INSTANTIATE_CPU_MAXPOOL_ALL_INDEX(at::Half);

#undef SPCONV_INSTANTIATE_CPU_MAXPOOL_ALL_INDEX
#undef SPCONV_INSTANTIATE_CPU_MAXPOOL
} // namespace spconv
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment