Commit 19e73bbe authored by Yan Yan
Browse files

format code with clang-format, better c++ code

parent c336139f
#include <cuhash/hash_table.h>
#include <cuda.h> #include <cuda.h>
#include <cuhash/hash_table.h>
int main(){ int main() {
auto table = cuhash::HashTable(); auto table = cuhash::HashTable();
table.Initialize(10, 2.0); table.Initialize(10, 2.0);
const int N = 10; const int N = 10;
// ハッシュテーブルに格納するデータ // ハッシュテーブルに格納するデータ
int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8}; int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
// デバイスメモリにコピー // デバイスメモリにコピー
int *d_keys, *d_vals; int *d_keys, *d_vals;
cudaMalloc((void**)&d_keys, sizeof(int) * N); cudaMalloc((void **)&d_keys, sizeof(int) * N);
cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice); cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_vals, sizeof(int) * N); cudaMalloc((void **)&d_vals, sizeof(int) * N);
cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice); cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
// ハッシュテーブルにクエリするデータ // ハッシュテーブルにクエリするデータ
int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
int output[N]; int output[N];
// デバイスメモリにコピー // デバイスメモリにコピー
int *d_input, *d_output; int *d_input, *d_output;
cudaMalloc((void**)&d_input, sizeof(int) * N); cudaMalloc((void **)&d_input, sizeof(int) * N);
cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice); cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_output, sizeof(int) * N); cudaMalloc((void **)&d_output, sizeof(int) * N);
cudaMemset(d_output, 0, sizeof(int) * N); cudaMemset(d_output, 0, sizeof(int) * N);
bool s = table.Build(N, (const unsigned int *) d_keys, bool s = table.Build(N, (const unsigned int *)d_keys,
(const unsigned int *) d_vals); (const unsigned int *)d_vals);
std::cout << s << std::endl; std::cout << s << std::endl;
table.Retrieve(N, (const unsigned int *) d_input, table.Retrieve(N, (const unsigned int *)d_input, (unsigned int *)d_output);
(unsigned int *) d_output);
std::cout << s << std::endl;
std::cout << s << std::endl; cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost); for (int i = 0; i < N; ++i) {
for (int i = 0; i < N; ++i) { printf("%d\n", output[i]);
printf("%d\n", output[i]); }
}
return 0;
return 0;
} }
\ No newline at end of file
// Copyright 2019 Yan Yan // Copyright 2019 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // You may obtain a copy of the License at
// //
// http://www.apache.org/licenses/LICENSE-2.0 // http://www.apache.org/licenses/LICENSE-2.0
// //
// Unless required by applicable law or agreed to in writing, software // Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <torch/script.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/fused_spconv_ops.h> #include <spconv/fused_spconv_ops.h>
#include <spconv/nms_ops.h> #include <spconv/nms_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
static auto registry = static auto registry =
torch::RegisterOperators() torch::RegisterOperators()
.op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>) .op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
.op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>) .op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
.op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>) .op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
.op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>) .op("spconv::get_indice_pairs_grid_2d",
.op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>) &spconv::getIndicePairPreGrid<2>)
.op("spconv::get_indice_pairs_grid_3d",
&spconv::getIndicePairPreGrid<3>)
.op("spconv::indice_conv", &spconv::indiceConv) .op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_backward", &spconv::indiceConvBackward) .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
.op("spconv::fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>) .op("spconv::fused_indice_conv_fp32",
.op("spconv::fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>) &spconv::fusedIndiceConvBatchNorm<float>)
.op("spconv::fused_indice_conv_half",
&spconv::fusedIndiceConvBatchNorm<at::Half>)
.op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>) .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
.op("spconv::indice_maxpool_backward_fp32", .op("spconv::indice_maxpool_backward_fp32",
&spconv::indiceMaxPoolBackward<float>) &spconv::indiceMaxPoolBackward<float>)
...@@ -38,4 +42,5 @@ static auto registry = ...@@ -38,4 +42,5 @@ static auto registry =
&spconv::indiceMaxPoolBackward<at::Half>) &spconv::indiceMaxPoolBackward<at::Half>)
.op("spconv::nms", &spconv::nonMaxSuppression<float>) .op("spconv::nms", &spconv::nonMaxSuppression<float>)
.op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>) .op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
.op("spconv::pillar_scatter_half", &spconv::pointPillarScatter<at::Half>); .op("spconv::pillar_scatter_half",
&spconv::pointPillarScatter<at::Half>);
// Copyright 2019 Yan Yan // Copyright 2019 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // You may obtain a copy of the License at
// //
// http://www.apache.org/licenses/LICENSE-2.0 // http://www.apache.org/licenses/LICENSE-2.0
// //
// Unless required by applicable law or agreed to in writing, software // Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/geometry.h> #include <spconv/geometry.h>
#include <spconv/indice.h> #include <spconv/indice.h>
#include <spconv/spconv_ops.h> #include <spconv/spconv_ops.h>
#include <torch/script.h> #include <torch/script.h>
#include <ATen/Parallel.h>
namespace spconv { namespace spconv {
...@@ -45,7 +45,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn, ...@@ -45,7 +45,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
} }
Index numValidPoints = 0; Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1)); std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data(); Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr; Index *pointPtr = nullptr;
Index hashval; Index hashval;
tsl::robin_map<Index, Index> hash; tsl::robin_map<Index, Index> hash;
...@@ -67,7 +67,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn, ...@@ -67,7 +67,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
indicesOut(numAct, 0) = batchIdx; indicesOut(numAct, 0) = batchIdx;
hashval = numAct++; hashval = numAct++;
hash[index] = hashval; hash[index] = hashval;
}else{ } else {
hashval = iter->second; hashval = iter->second;
} }
// indicePairs: [K, 2, L] // indicePairs: [K, 2, L]
...@@ -102,7 +102,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn, ...@@ -102,7 +102,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
} }
Index numValidPoints = 0; Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1)); std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data(); Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr; Index *pointPtr = nullptr;
Index hashval; Index hashval;
tsl::robin_map<Index, Index> hash; tsl::robin_map<Index, Index> hash;
...@@ -125,7 +125,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn, ...@@ -125,7 +125,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
indicesOut(numAct, 0) = batchIdx; indicesOut(numAct, 0) = batchIdx;
hashval = numAct++; hashval = numAct++;
hash[index] = hashval; hash[index] = hashval;
}else{ } else {
hashval = iter->second; hashval = iter->second;
} }
// indicePairs: [K, 2, L] // indicePairs: [K, 2, L]
...@@ -136,7 +136,6 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn, ...@@ -136,7 +136,6 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
return numAct; return numAct;
} }
#ifndef TV_WINDOWS #ifndef TV_WINDOWS
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
...@@ -145,7 +144,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -145,7 +144,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indiceNum, tv::TensorView<Index> indiceNum,
const Index *const kernelSize, const Index *const kernelSize,
const Index *const stride, const Index *const padding, const Index *const stride, const Index *const padding,
const Index *dilation, const Index *const outSpatialShape) { const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0; Index numAct = 0;
auto numActIn = indicesIn.dim(0); auto numActIn = indicesIn.dim(0);
Index batchIdx = 0; Index batchIdx = 0;
...@@ -167,12 +167,12 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -167,12 +167,12 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
spatialVolume * indicesIn(j, 0); spatialVolume * indicesIn(j, 0);
hash[index] = j; hash[index] = j;
} }
at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end){ at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end) {
Index index = 0; Index index = 0;
Index numValidPoints = 0; Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1)); std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data(); Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr; Index *pointPtr = nullptr;
Index oldOffset = 0; Index oldOffset = 0;
for (int j = begin; j < end; ++j) { for (int j = begin; j < end; ++j) {
...@@ -186,7 +186,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -186,7 +186,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
spatialVolume * indicesIn(j, 0); spatialVolume * indicesIn(j, 0);
auto iter = hash.find(index); auto iter = hash.find(index);
if (iter != hash.end()) { if (iter != hash.end()) {
#pragma omp atomic capture #pragma omp atomic capture
oldOffset = indiceNum[offset]++; oldOffset = indiceNum[offset]++;
indicePairs(offset, 0, oldOffset) = j; indicePairs(offset, 0, oldOffset) = j;
indicePairs(offset, 1, oldOffset) = iter->second; indicePairs(offset, 1, oldOffset) = iter->second;
...@@ -196,7 +196,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -196,7 +196,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
}); });
return numActIn; return numActIn;
} }
#else #else
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut, tv::TensorView<IndexGrid> gridsOut,
...@@ -204,7 +204,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -204,7 +204,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indiceNum, tv::TensorView<Index> indiceNum,
const Index *const kernelSize, const Index *const kernelSize,
const Index *const stride, const Index *const padding, const Index *const stride, const Index *const padding,
const Index *dilation, const Index *const outSpatialShape) { const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0; Index numAct = 0;
auto numActIn = indicesIn.dim(0); auto numActIn = indicesIn.dim(0);
Index batchIdx = 0; Index batchIdx = 0;
...@@ -221,7 +222,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -221,7 +222,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
Index numValidPoints = 0; Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)]; // Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1)); std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data(); Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr; Index *pointPtr = nullptr;
tsl::robin_map<Index, Index> hash; tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) { for (int j = 0; j < numActIn; ++j) {
...@@ -255,57 +256,53 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -255,57 +256,53 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
namespace functor { namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> { struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn, Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut, tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize, const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride, const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding, const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation, const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape, const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) { bool transpose, bool resetGrid, bool useHash) {
if (transpose) if (transpose)
return getIndicePairsDeConv<Index, IndexGrid, NDim>( return getIndicePairsDeConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut, indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(), kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data()); outSpatialShape.data());
else else
return getIndicePairsConv<Index, IndexGrid, NDim>( return getIndicePairsConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut, indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(), kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data()); outSpatialShape.data());
} }
}; };
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> { struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn, Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize, const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride, const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding, const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation, const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape, const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) { bool transpose, bool resetGrid, bool useHash) {
return getIndicePairsSubM<Index, IndexGrid, NDim>( return getIndicePairsSubM<Index, IndexGrid, NDim>(
indicesIn, indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
gridsOut, indicePairs, indiceNum, stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
} }
}; };
} // namespace functor } // namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \ #define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, NDIM>; \ template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \ NDIM>; \
NDIM>; template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
#define DECLARE_CPU_INDEX(Index) \ #define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \ DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
...@@ -320,4 +317,3 @@ DECLARE_CPU_INDEX(long); ...@@ -320,4 +317,3 @@ DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_SPECS_INDEX_NDIM #undef DECLARE_CPU_SPECS_INDEX_NDIM
} // namespace spconv } // namespace spconv
...@@ -14,17 +14,240 @@ ...@@ -14,17 +14,240 @@
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <chrono> #include <chrono>
#include <cuhash/hash_table.h>
#include <limits> #include <limits>
#include <spconv/mp_helper.h>
#include <spconv/indice.h>
#include <spconv/indice.cu.h> #include <spconv/indice.cu.h>
#include <tensorview/helper_launch.h> #include <spconv/indice.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/torch_utils.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <type_traits> #include <type_traits>
#include <utility/timer.h> #include <utility/timer.h>
#include <cuhash/hash_table.h>
namespace spconv { namespace spconv {
// Phase 1 of (de)convolution indice-pair generation on the GPU.
//
// Launches prepareDeConvIndicePairsKernel (transpose == true) or
// prepareIndicePairsKernel (transpose == false) on the current ATen CUDA
// stream, handing the kernel views of indicesIn, indicePairs, indiceNum and
// indicePairUnique together with the convolution geometry.
//
// Parameters:
//   indicesIn        - input indices; only size(0) and scalar_type() are
//                      inspected here.
//   indicePairs, indiceNum, indicePairUnique
//                    - tensors passed to the kernel (presumably outputs;
//                      verify against the kernel definitions).
//   kernelSize, stride, padding, dilation, outSpatialShape
//                    - per-dimension geometry; kernelSize.size() selects the
//                      dimensionality (dispatched over 2, 3 and 4).
//   transpose        - selects the deconvolution kernel when true.
//
// Returns 0 without launching anything when indicesIn has no rows, 1
// otherwise.
int create_conv_indice_pair_p1_cuda(
    torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
    torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
    std::vector<int64_t> stride, std::vector<int64_t> padding,
    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
    bool transpose) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto ndim = kernelSize.size();
  auto numActIn = indicesIn.size(0);
  if (numActIn == 0)
    return 0;
  // dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
    using Index = decltype(V);
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
      constexpr int NDim = I;
      // Copy the geometry vectors into fixed-size containers that can be
      // passed to the kernel by value.
      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
                                       outSpatialShape.end());
      if (transpose) {
        prepareDeConvIndicePairsKernel<Index, NDim, 4096>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesIn),
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indiceNum),
                         tv::torch2tv<Index>(indicePairUnique), ks, st, pa, di,
                         ou);
        // Kernel launches do not report errors on their own; check
        // explicitly like every other launch in this file.
        TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
      } else {
        prepareIndicePairsKernel<Index, NDim, 4096>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesIn),
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indiceNum),
                         tv::torch2tv<Index>(indicePairUnique), ks, st, pa, di,
                         ou);
        TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
      }
    });
  });
  return 1;
}
// Phase 2 of (de)convolution indice-pair generation on the GPU.
//
// Fills indicesOut and rewrites indicePairs from the entries of
// indicePairUnique, using either a cuhash hash table (useHash == true) or
// the dense grid tensor gridsOut (useHash == false). All kernels are
// launched on the current ATen CUDA stream.
//
// Parameters:
//   indicesIn        - input indices; only size(0) and scalar_type() are
//                      inspected here.
//   indicesOut       - passed to the assign* kernels.
//   gridsOut         - grid tensor for the non-hash path; size(0) is forwarded
//                      to the kernels as batchSize.
//   indicePairs, indicePairUnique
//                    - passed to the kernels; the last element of
//                      indicePairUnique is excluded from numAct (presumably a
//                      sentinel appended by the preceding unique step).
//   indiceNum, transpose
//                    - unused here; kept for interface compatibility.
//   outSpatialShape  - output spatial shape; its size selects the
//                      dimensionality (dispatched over 2, 3 and 4).
//   resetGrid        - when true and useHash is false, resetGridKernel is
//                      launched on gridsOut afterwards.
//   useHash          - selects the hash-table path.
//
// Returns:
//   0       when indicesIn has no rows,
//   -1      when the hash table could not be built (caller should fall back
//           to the CPU implementation),
//   numAct  (indicePairUnique.size(0) - 1) otherwise.
int create_conv_indice_pair_p2_cuda(
    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
    torch::Tensor indicePairs, torch::Tensor indiceNum,
    torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
    bool transpose, bool resetGrid, bool useHash) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto ndim = outSpatialShape.size();
  auto numActIn = indicesIn.size(0);
  int batchSize = gridsOut.size(0);
  int numAct = indicePairUnique.size(0) - 1;
  if (numActIn == 0)
    return 0;
  // A `return -1` inside the dispatch lambdas would only leave the lambda,
  // so the failure is recorded here and turned into the return value below.
  bool hashBuildFailed = false;
  // dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
    using Index = decltype(V);
    using IndexGrid = int32_t;
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
      constexpr int NDim = I;
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
                                       outSpatialShape.end());
      if (useHash) {
        auto table = cuhash::HashTable();
        table.Initialize(numAct, 2.0, 4);
        // Temporary device buffer holding the values 0..numAct-1 that are
        // stored in the table next to the unique keys.
        unsigned *d_values = nullptr;
        cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
        TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
        arangeKernel<unsigned>
            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(d_values, numAct);
        TV_CHECK_CUDA_ERR_V2("arangeKernel failed");
        bool res =
            table.Build(numAct,
                        reinterpret_cast<unsigned *>(
                            tv::torch2tv<Index>(indicePairUnique).data()),
                        d_values);
        cudaFree(d_values);
        TV_CHECK_CUDA_ERR_V2("cudaFree failed");
        if (!res) {
          hashBuildFailed = true; // tell outside to use CPU implementation
          return;
        }
        assignIndiceOutKernel<Index, NDim>
            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesOut), numAct,
                         tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
        TV_CHECK_CUDA_ERR_V2("assignIndiceOutKernel failed");
        auto tableSize = table.get_table_size();
        auto tableData = table.data();
        auto constants = table.get_constants_4();
        auto stash_constants = table.get_stash_constants();
        auto stash_count = table.get_stash_count();
        assignIndicePairsHashKernel<Index, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesOut), numActIn,
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indicePairUnique), tableSize,
                         tableData, constants, stash_constants, stash_count);
        TV_CHECK_CUDA_ERR_V2("assignIndicePairsHashKernel failed");
      } else {
        assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesOut),
                         tv::torch2tv<IndexGrid>(gridsOut), numAct,
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
        TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
        assignIndicePairsKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesOut),
                         tv::torch2tv<IndexGrid>(gridsOut), numAct,
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indicePairUnique), ou);
        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
      }

      // The grid is only written on the non-hash path, so it only needs to be
      // reset there.
      if (resetGrid && (!useHash)) {
        resetGridKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(indicePairUnique.data_ptr<Index>(),
                         tv::torch2tv<IndexGrid>(gridsOut), numAct);
        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
      }
    });
  });
  if (hashBuildFailed)
    return -1;
  return numAct;
}
// Generates indice pairs for a submanifold convolution on the GPU.
//
// Looks up neighbours of the input indices either through a cuhash hash
// table (useHash == true) or through the dense grid tensor gridsOut
// (useHash == false), and writes the result via the getSubMIndicePairs*
// kernels. All kernels are launched on the current ATen CUDA stream.
//
// Parameters:
//   indicesIn        - input indices; size(0) is the number of active inputs.
//   gridsOut         - grid tensor used by the non-hash path.
//   indicePairs, indiceNum
//                    - passed to the getSubMIndicePairs* kernels (presumably
//                      outputs; verify against the kernel definitions).
//   kernelSize, stride, padding, dilation, outSpatialShape
//                    - per-dimension geometry; outSpatialShape.size() selects
//                      the dimensionality (dispatched over 2, 3 and 4).
//   transpose        - unused here; kept for interface compatibility.
//   resetGrid        - when true and useHash is false, resetGridSubMKernel is
//                      launched on gridsOut afterwards.
//   useHash          - selects the hash-table path.
//
// Returns:
//   0         when indicesIn has no rows,
//   -1        when the hash table could not be built (caller should fall
//             back to the CPU implementation),
//   numActIn  (indicesIn.size(0)) otherwise.
int create_submconv_indice_pair_cuda(
    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
    std::vector<int64_t> stride, std::vector<int64_t> padding,
    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
    bool transpose, bool resetGrid, bool useHash) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto ndim = outSpatialShape.size();
  auto numActIn = indicesIn.size(0);
  if (numActIn == 0)
    return 0;
  // A `return -1` inside the dispatch lambdas would only leave the lambda,
  // so the failure is recorded here and turned into the return value below.
  bool hashBuildFailed = false;
  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
    using Index = decltype(V);
    using IndexGrid = int32_t;
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
      constexpr int NDim = I;
      // Copy the geometry vectors into fixed-size containers that can be
      // passed to the kernels by value.
      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
                                       outSpatialShape.end());
      if (useHash) {
        auto table = cuhash::HashTable();
        table.Initialize(numActIn, 2.0, 4);
        // One allocation holds both arrays: keys in the first numActIn
        // elements, values in the second numActIn elements.
        unsigned *d_keyvalues = nullptr;
        cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
        unsigned *d_values = d_keyvalues + numActIn;
        TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
        prepareSubMHashKernel<Index, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesIn), d_keyvalues, d_values,
                         ou);
        TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
        bool res = table.Build(numActIn, d_keyvalues, d_values);
        // Free the base pointer returned by cudaMalloc; d_values points into
        // the middle of the allocation and must not be passed to cudaFree.
        cudaFree(d_keyvalues);
        TV_CHECK_CUDA_ERR_V2("cudaFree failed");
        if (!res) {
          hashBuildFailed = true; // tell outside to use CPU implementation
          return;
        }
        auto tableSize = table.get_table_size();
        auto tableData = table.data();
        auto constants = table.get_constants_4();
        auto stash_constants = table.get_stash_constants();
        auto stash_count = table.get_stash_count();
        getSubMIndicePairsHashKernel<Index, NDim, 4096>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesIn),
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou,
                         tableSize, tableData, constants, stash_constants,
                         stash_count);
        TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
      } else {
        prepareSubMGridKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesIn),
                         tv::torch2tv<IndexGrid>(gridsOut), ou);
        TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
        getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesIn),
                         tv::torch2tv<IndexGrid>(gridsOut),
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou);
        TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsKernel failed");
      }

      // The grid is only written on the non-hash path, so it only needs to be
      // reset there.
      if (resetGrid && (!useHash)) {
        resetGridSubMKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(indicesIn.data_ptr<Index>(),
                         tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn);
        TV_CHECK_CUDA_ERR_V2("resetGridSubMKernel failed");
      }
    });
  });
  if (hashBuildFailed)
    return -1;
  return numActIn;
}
namespace functor { namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> { struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
...@@ -46,17 +269,17 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> { ...@@ -46,17 +269,17 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
return 0; return 0;
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
if (transpose) if (transpose)
prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096> prepareDeConvIndicePairsKernel<Index, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs, d.getStream()>>>(indicesIn, indicePairs, indiceNum,
indiceNum, indicePairUnique, kernelSize, stride, indicePairUnique, kernelSize, stride, padding,
padding, dilation, outSpatialShape); dilation, outSpatialShape);
else else
prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096> prepareIndicePairsKernel<Index, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs, d.getStream()>>>(indicesIn, indicePairs, indiceNum,
indiceNum, indicePairUnique, kernelSize, stride, indicePairUnique, kernelSize, stride, padding,
padding, dilation, outSpatialShape); dilation, outSpatialShape);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl; // std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
return 1; return 1;
...@@ -78,57 +301,58 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> { ...@@ -78,57 +301,58 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
auto numActIn = indicesIn.dim(0); auto numActIn = indicesIn.dim(0);
if (numActIn == 0) if (numActIn == 0)
return 0; return 0;
// after unique, there is a std::numeric_limits<int>::max() in the end of indicePairUnique // after unique, there is a std::numeric_limits<int>::max() in the end of
Index numAct = indicePairUnique.dim(0) - 1; // indicePairUnique
if (useHash){ Index numAct = indicePairUnique.dim(0) - 1;
if (useHash) {
auto table = cuhash::HashTable(); auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl; // std::cout << "create " << numAct << " size table..." << std::endl;
table.Initialize(numAct, 2.0, 4); table.Initialize(numAct, 2.0, 4);
unsigned *d_values = nullptr; unsigned *d_values = nullptr;
cudaMalloc((void**)&d_values, sizeof(unsigned) * numAct); cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed"); TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
arangeKernel<unsigned><<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0, arangeKernel<unsigned>
d.getStream()>>>(d_values, numAct); <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
bool res = table.Build(numAct, reinterpret_cast<unsigned*>(indicePairUnique.data()), d.getStream()>>>(d_values, numAct);
d_values); bool res = table.Build(
numAct, reinterpret_cast<unsigned *>(indicePairUnique.data()),
d_values);
cudaFree(d_values); cudaFree(d_values);
if (!res){ if (!res) {
return -1; //use -1 to tell outside use CPU implementation return -1; // use -1 to tell outside use CPU implementation
} }
assignIndiceOutKernel<Index, NDim> assignIndiceOutKernel<Index, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numAct, d.getStream()>>>(indicesOut, numAct, indicePairUnique,
indicePairUnique, outSpatialShape, batchSize); outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed"); TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
auto tableSize = table.get_table_size(); auto tableSize = table.get_table_size();
auto tableData = table.data(); auto tableData = table.data();
auto constants = table.get_constants_4(); auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants(); auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count(); auto stash_count = table.get_stash_count();
assignIndicePairsHashKernel<Index, IndexGrid, NDim> assignIndicePairsHashKernel<Index, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numActIn, indicePairs, d.getStream()>>>(indicesOut, numActIn, indicePairs,
indicePairUnique, indicePairUnique, tableSize, tableData, constants,
tableSize, tableData, constants, stash_constants, stash_constants, stash_count);
stash_count);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed"); TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
}else{ } else {
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim> assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs, d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize); indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed"); TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
assignIndicePairsKernel<Index, IndexGrid, NDim> assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs, d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape); indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed"); TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
} }
if (resetGrid && (!useHash)) { if (resetGrid && (!useHash)) {
resetGridKernel<Index, IndexGrid, NDim> resetGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct); d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed"); TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
} }
...@@ -152,22 +376,25 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> { ...@@ -152,22 +376,25 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
if (numActIn == 0) if (numActIn == 0)
return 0; return 0;
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
if (useHash){ if (useHash) {
auto table = cuhash::HashTable(); auto table = cuhash::HashTable();
// std::cout << "subm create " << numActIn << " size table..." << std::endl; // std::cout << "subm create " << numActIn << " size table..." <<
// std::endl;
table.Initialize(numActIn, 2.0, 4); table.Initialize(numActIn, 2.0, 4);
unsigned *d_keyvalues = nullptr; unsigned *d_keyvalues = nullptr;
cudaMalloc((void**)&d_keyvalues, sizeof(unsigned) * numActIn * 2); cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
unsigned *d_values = d_keyvalues + numActIn; unsigned *d_values = d_keyvalues + numActIn;
prepareSubMHashKernel<Index, NDim> prepareSubMHashKernel<Index, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, d_keyvalues, d_values, outSpatialShape); d.getStream()>>>(indicesIn, d_keyvalues, d_values,
outSpatialShape);
TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed"); TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
bool res = table.Build(numActIn, reinterpret_cast<unsigned*>(d_keyvalues), bool res =
reinterpret_cast<unsigned*>(d_values)); table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
reinterpret_cast<unsigned *>(d_values));
cudaFree(d_keyvalues); cudaFree(d_keyvalues);
if (!res){ if (!res) {
return -1; //use -1 to tell outside use CPU implementation return -1; // use -1 to tell outside use CPU implementation
} }
auto tableSize = table.get_table_size(); auto tableSize = table.get_table_size();
auto tableData = table.data(); auto tableData = table.data();
...@@ -175,28 +402,30 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> { ...@@ -175,28 +402,30 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
auto stash_constants = table.get_stash_constants(); auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count(); auto stash_count = table.get_stash_count();
getSubMIndicePairsHashKernel<Index, NDim, 4096> getSubMIndicePairsHashKernel<Index, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicePairs, indiceNum, d.getStream()>>>(indicesIn, indicePairs, indiceNum, kernelSize,
kernelSize, stride, padding, dilation, outSpatialShape, stride, padding, dilation, outSpatialShape,
tableSize, tableData, constants, stash_constants, tableSize, tableData, constants, stash_constants,
stash_count); stash_count);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed"); TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
}else{ } else {
prepareSubMGridKernel<Index, IndexGrid, NDim> prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape); d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096> getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum, d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation, outSpatialShape); kernelSize, stride, padding, dilation,
outSpatialShape);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl; // std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if (resetGrid && (!useHash)) { if (resetGrid && (!useHash)) {
resetGridSubMKernel<Index, IndexGrid, NDim> resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape, numActIn); d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
numActIn);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
return numActIn; return numActIn;
......
...@@ -16,9 +16,9 @@ ...@@ -16,9 +16,9 @@
#include <chrono> #include <chrono>
#include <limits> #include <limits>
#include <spconv/maxpool.h> #include <spconv/maxpool.h>
#include <spconv/mp_helper.h> #include <tensorview/cuda_utils.h>
#include <tensorview/helper_kernel.cu.h> #include <tensorview/kernel_utils.h>
#include <tensorview/helper_launch.h> #include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <type_traits> #include <type_traits>
...@@ -255,7 +255,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures, ...@@ -255,7 +255,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
reinterpret_cast<const VecType *>(inFeatures)[idxi]; reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] = reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo]; reinterpret_cast<const VecType *>(dout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] = reinterpret_cast<VecType *>(din)[idxi]; reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll #pragma unroll
for (int i = 0; i < vecloadFactor; i++) { for (int i = 0; i < vecloadFactor; i++) {
...@@ -263,7 +264,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures, ...@@ -263,7 +264,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
bufdi[i] += bufdo[i]; bufdi[i] += bufdo[i];
} }
} }
reinterpret_cast<VecType *>(din)[idxi] = reinterpret_cast<VecType *>(bufdi)[0]; reinterpret_cast<VecType *>(din)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
} }
} }
} }
...@@ -309,7 +311,7 @@ template <typename T, typename Index> ...@@ -309,7 +311,7 @@ template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> { struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
using vecload_type_t = using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>; std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>; using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures, void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures, tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) { tv::TensorView<const Index> indices, int size) {
...@@ -318,21 +320,22 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> { ...@@ -318,21 +320,22 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
int numPlanes = inFeatures.dim(1); int numPlanes = inFeatures.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices, tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
&notFound](auto NumTLP) { &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4; constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP; int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) { if (notFound) {
if (numPlanes % NumTLP == 0) { if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) { if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP), <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(), d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(0).data(),
indices.subview(1).data(), numHotBlock, indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
...@@ -340,9 +343,9 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> { ...@@ -340,9 +343,9 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP> maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP), <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(), 0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock, indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes); size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
notFound = false; notFound = false;
...@@ -356,7 +359,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> { ...@@ -356,7 +359,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
int numHotBlock = (size / NumTLP) * NumTLP; int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) { if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP> maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)), <<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>( dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(1).data(), indices.subview(0).data(), indices.subview(1).data(),
...@@ -366,7 +369,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> { ...@@ -366,7 +369,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
if (size > numHotBlock) { if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP> maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)), <<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>( dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock, indices.subview(0).data() + numHotBlock,
...@@ -382,7 +385,7 @@ template <typename T, typename Index> ...@@ -382,7 +385,7 @@ template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
using vecload_type_t = using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>; std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>; using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures, void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures, tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din, tv::TensorView<const T> dout, tv::TensorView<T> din,
...@@ -392,22 +395,23 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -392,22 +395,23 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
int numPlanes = inFeatures.dim(1); int numPlanes = inFeatures.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din, tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
&indices, &notFound](auto NumTLP) { &indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4; constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP; int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) { if (notFound) {
if (numPlanes % NumTLP == 0) { if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) { if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP), <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(), d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(), dout.data(), din.data(),
indices.subview(0).data(), indices.subview(0).data(),
indices.subview(1).data(), numHotBlock, indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
...@@ -415,10 +419,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -415,10 +419,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP> maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP), <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(), 0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(), dout.data(), din.data(),
indices.subview(0).data() + numHotBlock, indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes); size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
notFound = false; notFound = false;
...@@ -432,7 +436,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -432,7 +436,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
int numHotBlock = (size / NumTLP) * NumTLP; int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) { if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP> maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)), <<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>( dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(), outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data(), indices.subview(1).data(), indices.subview(0).data(), indices.subview(1).data(),
...@@ -442,7 +446,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -442,7 +446,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
if (size > numHotBlock) { if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP> maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)), <<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>( dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(), outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data() + numHotBlock, indices.subview(0).data() + numHotBlock,
......
...@@ -21,7 +21,7 @@ namespace spconv { ...@@ -21,7 +21,7 @@ namespace spconv {
namespace functor { namespace functor {
template <typename T, typename Index> template <typename T, typename Index>
struct NonMaxSupressionFunctor<tv::CPU, T, Index> { struct NonMaxSupressionFunctor<tv::CPU, T, Index> {
Index operator()(const tv::CPU &d, tv::TensorView<Index> keep, Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxes, T threshold, T eps) { tv::TensorView<const T> boxes, T threshold, T eps) {
auto ndets = boxes.dim(0); auto ndets = boxes.dim(0);
...@@ -131,7 +131,7 @@ struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> { ...@@ -131,7 +131,7 @@ struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {
#define DECLARE_CPU_INDEX(Index) \ #define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_T_INDEX(float, Index); \ DECLARE_CPU_T_INDEX(float, Index); \
DECLARE_CPU_T_INDEX(double, Index); DECLARE_CPU_T_INDEX(double, Index);
DECLARE_CPU_INDEX(int); DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long); DECLARE_CPU_INDEX(long);
......
...@@ -2,18 +2,18 @@ ...@@ -2,18 +2,18 @@
// Deformable Convolutional Networks // Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft // Copyright (c) 2015 Microsoft
// Licensed under The MIT License // Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn) // Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------ // ------------------------------------------------------------------
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <chrono> #include <chrono>
#include <limits> #include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h> #include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h> #include <spconv/reordering.h>
#include <tensorview/helper_launch.h> #include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <type_traits> #include <type_traits>
#include <utility/timer.h> #include <utility/timer.h>
...@@ -22,8 +22,7 @@ ...@@ -22,8 +22,7 @@
int const threadsPerBlock = sizeof(unsigned long long) * 8; int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType> template <typename DType>
__device__ inline DType devIoU(DType const *const a, DType const *const b) __device__ inline DType devIoU(DType const *const a, DType const *const b) {
{
DType left = max(a[0], b[0]), right = min(a[2], b[2]); DType left = max(a[0], b[0]), right = min(a[2], b[2]);
DType top = max(a[1], b[1]), bottom = min(a[3], b[3]); DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
...@@ -35,44 +34,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b) ...@@ -35,44 +34,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)
template <typename DType, int BLOCK_THREADS> template <typename DType, int BLOCK_THREADS>
__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh, __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
const DType *dev_boxes, unsigned long long *dev_mask) const DType *dev_boxes,
{ unsigned long long *dev_mask) {
const int row_start = blockIdx.y; const int row_start = blockIdx.y;
const int col_start = blockIdx.x; const int col_start = blockIdx.x;
// if (row_start > col_start) return; // if (row_start > col_start) return;
const int row_size = const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS); const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size =
min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
__shared__ DType block_boxes[BLOCK_THREADS * 5]; __shared__ DType block_boxes[BLOCK_THREADS * 5];
if (threadIdx.x < col_size) if (threadIdx.x < col_size) {
{
#pragma unroll #pragma unroll
for (int i = 0; i < 5; ++i) for (int i = 0; i < 5; ++i) {
{
block_boxes[threadIdx.x * 5 + i] = block_boxes[threadIdx.x * 5 + i] =
dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i]; dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
} }
} }
__syncthreads(); __syncthreads();
if (threadIdx.x < row_size) if (threadIdx.x < row_size) {
{
const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x; const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
const DType *cur_box = dev_boxes + cur_box_idx * 5; const DType *cur_box = dev_boxes + cur_box_idx * 5;
unsigned long long t = 0; unsigned long long t = 0;
int start = 0; int start = 0;
if (row_start == col_start) if (row_start == col_start) {
{
start = threadIdx.x + 1; start = threadIdx.x + 1;
} }
for (int i = start; i < col_size; i++) for (int i = start; i < col_size; i++) {
{ if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh)
{
t |= 1ULL << i; t |= 1ULL << i;
} }
} }
...@@ -80,4 +71,3 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh, ...@@ -80,4 +71,3 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
dev_mask[cur_box_idx * col_blocks + col_start] = t; dev_mask[cur_box_idx * col_blocks + col_start] = t;
} }
} }
...@@ -15,10 +15,10 @@ ...@@ -15,10 +15,10 @@
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <chrono> #include <chrono>
#include <limits> #include <limits>
#include <spconv/mp_helper.h>
#include <spconv/pillar_scatter_functor.h> #include <spconv/pillar_scatter_functor.h>
#include <tensorview/helper_kernel.cu.h> #include <tensorview/cuda_utils.h>
#include <tensorview/helper_launch.h> #include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <type_traits> #include <type_traits>
#include <utility/timer.h> #include <utility/timer.h>
...@@ -43,8 +43,8 @@ struct PointPillarScatter<tv::GPU, T, Index> { ...@@ -43,8 +43,8 @@ struct PointPillarScatter<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> canvas, void operator()(const tv::GPU &d, tv::TensorView<T> canvas,
tv::TensorView<const T> features, tv::TensorView<const T> features,
tv::TensorView<const T> coors) { tv::TensorView<const T> coors) {
auto grid = dim3(tv::launch::DivUp(features.dim(1), 32), auto grid = dim3(tv::cuda::DivUp(features.dim(1), 32),
tv::launch::DivUp(features.dim(0), 32)); tv::cuda::DivUp(features.dim(0), 32));
pointPillarsScatterKernel<T, Index> pointPillarsScatterKernel<T, Index>
<<<grid, dim3(32, 32), 0, d.getStream()>>>(canvas, features, coors); <<<grid, dim3(32, 32), 0, d.getStream()>>>(canvas, features, coors);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
......
// Copyright 2019 Yan Yan // Copyright 2019 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // You may obtain a copy of the License at
// //
// http://www.apache.org/licenses/LICENSE-2.0 // http://www.apache.org/licenses/LICENSE-2.0
// //
// Unless required by applicable law or agreed to in writing, software // Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/reordering.h> #include <spconv/reordering.h>
#include <torch/script.h> #include <torch/script.h>
#include <ATen/Parallel.h>
namespace spconv { namespace spconv {
namespace functor { namespace functor {
template <typename T, typename Index> template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> { struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features, void operator()(const tv::CPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) { tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1); int numPlanes = features.dim(1);
at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end){ at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
for (int i = begin; i < end; ++i) { for (int i = begin; i < end; ++i) {
std::memcpy(buffer.data() + i * numPlanes, std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes, features.data() + indices[i] * numPlanes,
...@@ -35,16 +36,16 @@ struct SparseGatherFunctor<tv::CPU, T, Index> { ...@@ -35,16 +36,16 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
template <typename T, typename Index> template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> { struct SparseScatterAddFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures, void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer, tv::TensorView<const Index> indices, tv::TensorView<const T> buffer,
int size, bool stable) { tv::TensorView<const Index> indices, int size, bool stable) {
int numPlanes = outFeatures.dim(1); int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data(); const T *buf = buffer.data();
T* out = outFeatures.data(); T *out = outFeatures.data();
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes; buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes; out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j){ for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j]; out[j] += buf[j];
} }
} }
...@@ -53,9 +54,8 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> { ...@@ -53,9 +54,8 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {
} // namespace functor } // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \ template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>; template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \ #define DECLARE_CPU_SPECS(T) \
...@@ -70,4 +70,3 @@ DECLARE_CPU_SPECS(at::Half); ...@@ -70,4 +70,3 @@ DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS_T_INDEX #undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv } // namespace spconv
...@@ -15,11 +15,11 @@ ...@@ -15,11 +15,11 @@
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <chrono> #include <chrono>
#include <limits> #include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h> #include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h> #include <spconv/reordering.h>
#include <tensorview/helper_launch.h> #include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <type_traits> #include <type_traits>
#include <utility/timer.h> #include <utility/timer.h>
...@@ -30,7 +30,7 @@ template <typename T, typename Index> ...@@ -30,7 +30,7 @@ template <typename T, typename Index>
struct SparseGatherFunctor<tv::GPU, T, Index> { struct SparseGatherFunctor<tv::GPU, T, Index> {
using vecload_type_t = using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>; std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>; using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> buffer, void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features, tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) { tv::TensorView<const Index> indices, int size) {
...@@ -39,8 +39,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> { ...@@ -39,8 +39,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
int numPlanes = features.dim(1); int numPlanes = features.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices, tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) { &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4; constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor)); // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP; int nHotBlock = (size / NumTLP) * NumTLP;
...@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> { ...@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP), <<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(), indices.data(), d.getStream()>>>(buffer.data(), features.data(),
nHotBlock, numPlanes / vecloadFactor); indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
...@@ -60,8 +61,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> { ...@@ -60,8 +61,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
<<<dim3(1, numPlanes / NumTLP), <<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0, dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes, d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock, features.data(), indices.data() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor); size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
notFound = false; notFound = false;
...@@ -73,8 +75,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> { ...@@ -73,8 +75,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
constexpr int NumTLP = 64; constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4; constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<T, Index, NumTLP, NumILP> gatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP), <<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)), tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>( dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes); buffer.data(), features.data(), indices.data(), size, numPlanes);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
...@@ -85,7 +87,7 @@ template <typename T, typename Index> ...@@ -85,7 +87,7 @@ template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::GPU, T, Index> { struct SparseScatterAddFunctor<tv::GPU, T, Index> {
using vecload_type_t = using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>; std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>; using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures, void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer, tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) { tv::TensorView<const Index> indices, int size, bool stable) {
...@@ -95,8 +97,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> { ...@@ -95,8 +97,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half. sizeof(vecload_type_t) / sizeof(T); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices, tv::mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) { &notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor)); // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
constexpr int NumILP = NumTLP / 4; constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP; int nHotBlock = (size / NumTLP) * NumTLP;
...@@ -108,8 +110,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> { ...@@ -108,8 +110,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
<<<dim3(numPlanes / NumTLP, size / NumTLP), <<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(), d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock, indices.data(), nHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
if (size - nHotBlock > 0) { if (size - nHotBlock > 0) {
...@@ -128,8 +130,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> { ...@@ -128,8 +130,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
constexpr int NumTLP = 64; constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4; constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<T, Index, NumTLP, NumILP> scatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP), <<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)), tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>( dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size, outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes); numPlanes);
...@@ -139,7 +141,6 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> { ...@@ -139,7 +141,6 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
}; };
} // namespace functor } // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \ #define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \ template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>; template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
......
...@@ -47,7 +47,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -47,7 +47,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double totalGatherTime = 0; double totalGatherTime = 0;
double totalGEMMTime = 0; double totalGEMMTime = 0;
double totalSAddTime = 0; double totalSAddTime = 0;
tv::torch_dispatch<float, double, at::Half>( tv::dispatch_torch<float, double, at::Half>(
features.scalar_type(), [&](auto I) { features.scalar_type(), [&](auto I) {
using T = decltype(I); using T = decltype(I);
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
...@@ -68,7 +68,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -68,7 +68,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
tv::torch2tv<const int>(indicePairs).subview(i, inverse), tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot); nHot);
} }
#ifdef SPCONV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
...@@ -99,7 +99,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -99,7 +99,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true); true);
} }
#ifdef SPCONV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor( scatterFtor(
...@@ -158,7 +158,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -158,7 +158,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::mm_out(filterGradSub, features.t(), outGrad); torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t()); torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
} }
tv::torch_dispatch<float, double, tv::dispatch_torch<float, double,
at::Half>(features.scalar_type(), [&](auto I) { at::Half>(features.scalar_type(), [&](auto I) {
using T = decltype(I); using T = decltype(I);
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
...@@ -178,7 +178,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -178,7 +178,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot); nHot);
} }
#ifdef SPCONV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
...@@ -213,7 +213,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -213,7 +213,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
tv::torch2tv<const int>(indicePairs).subview(i, inverse), tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot); nHot);
} }
#ifdef SPCONV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
......
...@@ -20,7 +20,7 @@ using namespace pybind11::literals; ...@@ -20,7 +20,7 @@ using namespace pybind11::literals;
PYBIND11_MODULE(spconv_utils, m) { PYBIND11_MODULE(spconv_utils, m) {
m.doc() = "util pybind11 functions for spconv"; m.doc() = "util pybind11 functions for spconv";
#ifdef SPCONV_CUDA #ifdef TV_CUDA
m.def("non_max_suppression", &spconv::non_max_suppression<double>, m.def("non_max_suppression", &spconv::non_max_suppression<double>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1, py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4); "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
......
...@@ -2,30 +2,28 @@ ...@@ -2,30 +2,28 @@
// Deformable Convolutional Networks // Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft // Copyright (c) 2015 Microsoft
// Licensed under The MIT License // Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn) // Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------ // ------------------------------------------------------------------
#include <vector>
#include <iostream>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <iostream>
#include <spconv/nms_gpu.h> #include <spconv/nms_gpu.h>
#include <vector>
#define CUDA_CHECK(condition) \ #define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \ /* Code block avoids redefinition of cudaError_t error */ \
do \ do { \
{ \ cudaError_t error = condition; \
cudaError_t error = condition; \ if (error != cudaSuccess) { \
if (error != cudaSuccess) \ std::cout << cudaGetErrorString(error) << std::endl; \
{ \ } \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0) } while (0)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int const threadsPerBlock = sizeof(unsigned long long) * 8; int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType> template <typename DType>
__device__ inline DType devIoU(DType const *const a, DType const *const b) __device__ inline DType devIoU(DType const *const a, DType const *const b) {
{
DType left = max(a[0], b[0]), right = min(a[2], b[2]); DType left = max(a[0], b[0]), right = min(a[2], b[2]);
DType top = max(a[1], b[1]), bottom = min(a[3], b[3]); DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
...@@ -37,44 +35,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b) ...@@ -37,44 +35,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)
template <typename DType, int BLOCK_THREADS> template <typename DType, int BLOCK_THREADS>
__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh, __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
const DType *dev_boxes, unsigned long long *dev_mask) const DType *dev_boxes,
{ unsigned long long *dev_mask) {
const int row_start = blockIdx.y; const int row_start = blockIdx.y;
const int col_start = blockIdx.x; const int col_start = blockIdx.x;
// if (row_start > col_start) return; // if (row_start > col_start) return;
const int row_size = const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS); const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size =
min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
__shared__ DType block_boxes[BLOCK_THREADS * 5]; __shared__ DType block_boxes[BLOCK_THREADS * 5];
if (threadIdx.x < col_size) if (threadIdx.x < col_size) {
{
#pragma unroll #pragma unroll
for (int i = 0; i < 5; ++i) for (int i = 0; i < 5; ++i) {
{
block_boxes[threadIdx.x * 5 + i] = block_boxes[threadIdx.x * 5 + i] =
dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i]; dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
} }
} }
__syncthreads(); __syncthreads();
if (threadIdx.x < row_size) if (threadIdx.x < row_size) {
{
const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x; const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
const DType *cur_box = dev_boxes + cur_box_idx * 5; const DType *cur_box = dev_boxes + cur_box_idx * 5;
unsigned long long t = 0; unsigned long long t = 0;
int start = 0; int start = 0;
if (row_start == col_start) if (row_start == col_start) {
{
start = threadIdx.x + 1; start = threadIdx.x + 1;
} }
for (int i = start; i < col_size; i++) for (int i = start; i < col_size; i++) {
{ if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh)
{
t |= 1ULL << i; t |= 1ULL << i;
} }
} }
...@@ -83,12 +73,10 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh, ...@@ -83,12 +73,10 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
} }
} }
void _set_device(int device_id) void _set_device(int device_id) {
{
int current_device; int current_device;
CUDA_CHECK(cudaGetDevice(&current_device)); CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id) if (current_device == device_id) {
{
return; return;
} }
// The call to cudaSetDevice must come before any calls to Get, which // The call to cudaSetDevice must come before any calls to Get, which
...@@ -98,8 +86,7 @@ void _set_device(int device_id) ...@@ -98,8 +86,7 @@ void _set_device(int device_id)
template <typename DType, int BLOCK_THREADS> template <typename DType, int BLOCK_THREADS>
int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num, int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
int boxes_dim, DType nms_overlap_thresh, int device_id) int boxes_dim, DType nms_overlap_thresh, int device_id) {
{
_set_device(device_id); _set_device(device_id);
DType *boxes_dev = NULL; DType *boxes_dev = NULL;
...@@ -107,27 +94,21 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num, ...@@ -107,27 +94,21 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS); const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS);
CUDA_CHECK(cudaMalloc(&boxes_dev, CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(DType)));
boxes_num * boxes_dim * sizeof(DType))); CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
CUDA_CHECK(cudaMemcpy(boxes_dev,
boxes_host,
boxes_num * boxes_dim * sizeof(DType), boxes_num * boxes_dim * sizeof(DType),
cudaMemcpyHostToDevice)); cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMalloc(&mask_dev, CUDA_CHECK(cudaMalloc(&mask_dev,
boxes_num * col_blocks * sizeof(unsigned long long))); boxes_num * col_blocks * sizeof(unsigned long long)));
dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS), dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS), DIVUP(boxes_num, BLOCK_THREADS));
DIVUP(boxes_num, BLOCK_THREADS));
dim3 threads(BLOCK_THREADS); dim3 threads(BLOCK_THREADS);
nms_kernel<DType, BLOCK_THREADS><<<blocks, threads>>>(boxes_num, nms_kernel<DType, BLOCK_THREADS>
nms_overlap_thresh, <<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev);
boxes_dev,
mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks); std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks, sizeof(unsigned long long) * boxes_num * col_blocks,
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
...@@ -135,17 +116,14 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num, ...@@ -135,17 +116,14 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0; int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) for (int i = 0; i < boxes_num; i++) {
{
int nblock = i / BLOCK_THREADS; int nblock = i / BLOCK_THREADS;
int inblock = i % BLOCK_THREADS; int inblock = i % BLOCK_THREADS;
if (!(remv[nblock] & (1ULL << inblock))) if (!(remv[nblock] & (1ULL << inblock))) {
{
keep_out[num_to_keep++] = i; keep_out[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks; unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) for (int j = nblock; j < col_blocks; j++) {
{
remv[j] |= p[j]; remv[j] |= p[j];
} }
} }
...@@ -156,10 +134,15 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num, ...@@ -156,10 +134,15 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
return num_to_keep; return num_to_keep;
} }
// template<>
//template<> template int _nms_gpu<float, threadsPerBlock>(int *keep_out,
template int _nms_gpu<float, threadsPerBlock>(int *keep_out, const float *boxes_host, int boxes_num, const float *boxes_host,
int boxes_dim, float nms_overlap_thresh, int device_id); int boxes_num, int boxes_dim,
//template<> float nms_overlap_thresh,
template int _nms_gpu<double, threadsPerBlock>(int *keep_out, const double *boxes_host, int boxes_num, int device_id);
int boxes_dim, double nms_overlap_thresh, int device_id); // template<>
\ No newline at end of file template int _nms_gpu<double, threadsPerBlock>(int *keep_out,
const double *boxes_host,
int boxes_num, int boxes_dim,
double nms_overlap_thresh,
int device_id);
\ No newline at end of file
import horovod.torch as hvd
import time import time
from pathlib import Path from pathlib import Path
...@@ -12,6 +11,7 @@ from torch.utils import data ...@@ -12,6 +11,7 @@ from torch.utils import data
from torch.utils.data import DataLoader, Dataset from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms from torchvision import datasets, transforms
import horovod.torch as hvd
import spconv import spconv
from spconv.test_utils import generate_sparse_data from spconv.test_utils import generate_sparse_data
...@@ -53,25 +53,47 @@ class FakeClassifier(nn.Module): ...@@ -53,25 +53,47 @@ class FakeClassifier(nn.Module):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.net = spconv.SparseSequential( self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 8, 3, indice_key="subm1", padding=1, use_hash=False), spconv.SubMConv3d(3,
8,
3,
indice_key="subm1",
padding=1,
use_hash=False),
nn.BatchNorm1d(8), nn.BatchNorm1d(8),
nn.ReLU(), nn.ReLU(),
spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False), spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False),
nn.BatchNorm1d(16), nn.BatchNorm1d(16),
nn.ReLU(), nn.ReLU(),
spconv.SubMConv3d(16, 16, 3, indice_key="subm2", padding=1, use_hash=False), spconv.SubMConv3d(16,
16,
3,
indice_key="subm2",
padding=1,
use_hash=False),
nn.BatchNorm1d(16), nn.BatchNorm1d(16),
nn.ReLU(), nn.ReLU(),
spconv.SparseConv3d(16, 32, 3, stride=2, padding=1, use_hash=False), spconv.SparseConv3d(16, 32, 3, stride=2, padding=1,
use_hash=False),
nn.BatchNorm1d(32), nn.BatchNorm1d(32),
nn.ReLU(), nn.ReLU(),
spconv.SubMConv3d(32, 32, 3, indice_key="subm3", padding=1, use_hash=False), spconv.SubMConv3d(32,
32,
3,
indice_key="subm3",
padding=1,
use_hash=False),
nn.BatchNorm1d(32), nn.BatchNorm1d(32),
nn.ReLU(), nn.ReLU(),
spconv.SparseConv3d(32, 64, 3, stride=2, padding=1, use_hash=False), spconv.SparseConv3d(32, 64, 3, stride=2, padding=1,
use_hash=False),
nn.BatchNorm1d(64), nn.BatchNorm1d(64),
nn.ReLU(), nn.ReLU(),
spconv.SubMConv3d(64, 64, 3, indice_key="subm4", padding=1, use_hash=False), spconv.SubMConv3d(64,
64,
3,
indice_key="subm4",
padding=1,
use_hash=False),
nn.BatchNorm1d(64), nn.BatchNorm1d(64),
nn.ReLU(), nn.ReLU(),
spconv.ToDense() # [64, 2, 8, 8] spconv.ToDense() # [64, 2, 8, 8]
...@@ -100,15 +122,16 @@ def run(): ...@@ -100,15 +122,16 @@ def run():
hvd.broadcast_optimizer_state(optimizer, root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0)
compression = hvd.Compression.none compression = hvd.Compression.none
optimizer = hvd.DistributedOptimizer(optimizer, optimizer = hvd.DistributedOptimizer(
named_parameters=model.named_parameters(), optimizer,
compression=compression, named_parameters=model.named_parameters(),
op=hvd.Average) compression=compression,
op=hvd.Average)
for i in tqdm.tqdm(list(range(100))): for i in tqdm.tqdm(list(range(100))):
# for j in range(4): # for j in range(4):
# features, indices, label = ds[(i * 4 + j) % len(ds)] # features, indices, label = ds[(i * 4 + j) % len(ds)]
features, indices, label = ds[i % len(ds)] features, indices, label = ds[i % len(ds)]
features_t = torch.from_numpy(features) features_t = torch.from_numpy(features)
indices_t = torch.from_numpy(indices) indices_t = torch.from_numpy(indices)
......
...@@ -52,25 +52,47 @@ class FakeClassifier(nn.Module): ...@@ -52,25 +52,47 @@ class FakeClassifier(nn.Module):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.net = spconv.SparseSequential( self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 8, 3, indice_key="subm1", padding=1, use_hash=False), spconv.SubMConv3d(3,
8,
3,
indice_key="subm1",
padding=1,
use_hash=False),
nn.BatchNorm1d(8), nn.BatchNorm1d(8),
nn.ReLU(), nn.ReLU(),
spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False), spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False),
nn.BatchNorm1d(16), nn.BatchNorm1d(16),
nn.ReLU(), nn.ReLU(),
spconv.SubMConv3d(16, 16, 3, indice_key="subm2", padding=1, use_hash=False), spconv.SubMConv3d(16,
16,
3,
indice_key="subm2",
padding=1,
use_hash=False),
nn.BatchNorm1d(16), nn.BatchNorm1d(16),
nn.ReLU(), nn.ReLU(),
spconv.SparseConv3d(16, 32, 3, stride=2, padding=1, use_hash=False), spconv.SparseConv3d(16, 32, 3, stride=2, padding=1,
use_hash=False),
nn.BatchNorm1d(32), nn.BatchNorm1d(32),
nn.ReLU(), nn.ReLU(),
spconv.SubMConv3d(32, 32, 3, indice_key="subm3", padding=1, use_hash=False), spconv.SubMConv3d(32,
32,
3,
indice_key="subm3",
padding=1,
use_hash=False),
nn.BatchNorm1d(32), nn.BatchNorm1d(32),
nn.ReLU(), nn.ReLU(),
spconv.SparseConv3d(32, 64, 3, stride=2, padding=1, use_hash=False), spconv.SparseConv3d(32, 64, 3, stride=2, padding=1,
use_hash=False),
nn.BatchNorm1d(64), nn.BatchNorm1d(64),
nn.ReLU(), nn.ReLU(),
spconv.SubMConv3d(64, 64, 3, indice_key="subm4", padding=1, use_hash=False), spconv.SubMConv3d(64,
64,
3,
indice_key="subm4",
padding=1,
use_hash=False),
nn.BatchNorm1d(64), nn.BatchNorm1d(64),
nn.ReLU(), nn.ReLU(),
spconv.ToDense() # [64, 2, 8, 8] spconv.ToDense() # [64, 2, 8, 8]
...@@ -97,7 +119,7 @@ def run(): ...@@ -97,7 +119,7 @@ def run():
for i in tqdm.tqdm(list(range(100))): for i in tqdm.tqdm(list(range(100))):
# for j in range(4): # for j in range(4):
# features, indices, label = ds[(i * 4 + j) % len(ds)] # features, indices, label = ds[(i * 4 + j) % len(ds)]
features, indices, label = ds[i % len(ds)] features, indices, label = ds[i % len(ds)]
features_t = torch.from_numpy(features) features_t = torch.from_numpy(features)
indices_t = torch.from_numpy(indices) indices_t = torch.from_numpy(indices)
......
# Copyright 2019 Yan Yan # Copyright 2019 Yan Yan
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import time
import unittest
from pathlib import Path from pathlib import Path
import spconv
import numpy as np
import torch import torch
from torch import nn from torch import nn
import numpy as np
import time import spconv
from spconv.test_utils import params_grid, generate_sparse_data, TestCase from spconv.test_utils import TestCase, generate_sparse_data, params_grid
import unittest
# import sparseconvnet as scn # import sparseconvnet as scn
class SparseConv3dTestTorch(nn.Module): class SparseConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
stride, padding, dilation): kernel_size, stride, padding, dilation):
super().__init__() super().__init__()
layers = [spconv.SparseConv3d( layers = [
in_channels, spconv.SparseConv3d(in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False, bias=False,
use_hash=True)] use_hash=True)
]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append(spconv.SparseConv3d( layers.append(
out_channels, spconv.SparseConv3d(out_channels,
out_channels, out_channels,
kernel_size, kernel_size,
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)) bias=False))
self.net = spconv.SparseSequential( self.net = spconv.SparseSequential(*layers, )
*layers,
)
# self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda() # self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
self.grid = None self.grid = None
self.shape = shape self.shape = shape
def forward(self, features, coors, batch_size): def forward(self, features, coors, batch_size):
coors = coors.int() coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size, self.grid) x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
return self.net(x)# .dense() self.grid)
return self.net(x) # .dense()
class SubMConv3dTestTorch(nn.Module): class SubMConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
stride, padding, dilation): kernel_size, stride, padding, dilation):
super().__init__() super().__init__()
layers = [spconv.SubMConv3d( layers = [
in_channels, spconv.SubMConv3d(in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)] bias=False)
]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append(spconv.SubMConv3d( layers.append(
out_channels, spconv.SubMConv3d(out_channels,
out_channels, out_channels,
kernel_size, kernel_size,
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)) bias=False))
self.net = spconv.SparseSequential( self.net = spconv.SparseSequential(*layers, )
*layers,
)
self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda() self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
self.shape = shape self.shape = shape
def forward(self, features, coors, batch_size): def forward(self, features, coors, batch_size):
coors = coors.int() coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size, self.grid) x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
return self.net(x)# .dense() self.grid)
return self.net(x) # .dense()
class Conv3dTestTorch(nn.Module): class Conv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
stride, padding, dilation): kernel_size, stride, padding, dilation):
super().__init__() super().__init__()
layers = [nn.Conv3d( layers = [
in_channels, nn.Conv3d(in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)] bias=False)
]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append(nn.Conv3d( layers.append(
out_channels, nn.Conv3d(out_channels,
out_channels, out_channels,
kernel_size, kernel_size,
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)) bias=False))
self.net = nn.Sequential( self.net = nn.Sequential(*layers, )
*layers,
)
self.shape = shape self.shape = shape
def forward(self, x): def forward(self, x):
return self.net(x)# .dense() return self.net(x) # .dense()
class SparseDeConv3dTestTorch(nn.Module): class SparseDeConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
stride, padding, dilation): kernel_size, stride, padding, dilation):
super().__init__() super().__init__()
layers = [spconv.SparseConvTranspose3d( layers = [
in_channels, spconv.SparseConvTranspose3d(in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)] bias=False)
]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append(spconv.SparseConvTranspose3d( layers.append(
out_channels, spconv.SparseConvTranspose3d(out_channels,
out_channels, out_channels,
kernel_size, kernel_size,
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)) bias=False))
self.net = spconv.SparseSequential( self.net = spconv.SparseSequential(*layers, )
*layers,
)
self.shape = shape self.shape = shape
def forward(self, features, coors, batch_size): def forward(self, features, coors, batch_size):
coors = coors.int() coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size) x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.net(x)# .dense() return self.net(x) # .dense()
class DeConv3dTestTorch(nn.Module): class DeConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
stride, padding, dilation): kernel_size, stride, padding, dilation):
super().__init__() super().__init__()
layers = [nn.ConvTranspose3d( layers = [
in_channels, nn.ConvTranspose3d(in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)] bias=False)
]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append(nn.ConvTranspose3d( layers.append(
out_channels, nn.ConvTranspose3d(out_channels,
out_channels, out_channels,
kernel_size, kernel_size,
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)) bias=False))
self.net = nn.Sequential( self.net = nn.Sequential(*layers, )
*layers,
)
self.shape = shape self.shape = shape
def forward(self, x): def forward(self, x):
return self.net(x)# .dense() return self.net(x) # .dense()
class SparseMaxPoolTestTorch(nn.Module): class SparseMaxPoolTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, kernel_size, def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
stride, padding, dilation): dilation):
super().__init__() super().__init__()
layers = [spconv.SparseMaxPool3d( layers = [
kernel_size, spconv.SparseMaxPool3d(kernel_size, stride, padding, dilation)
stride, padding, dilation)] ]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append(spconv.SparseMaxPool3d( layers.append(
kernel_size, spconv.SparseMaxPool3d(kernel_size, stride, padding, dilation))
stride, padding, dilation)) self.net = spconv.SparseSequential(*layers, )
self.net = spconv.SparseSequential(
*layers,
)
self.shape = shape self.shape = shape
def forward(self, features, coors, batch_size): def forward(self, features, coors, batch_size):
coors = coors.int() coors = coors.int()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size ) x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.net(x)# .dense() return self.net(x) # .dense()
class MaxPool3dTestTorch(nn.Module): class MaxPool3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, kernel_size, def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
stride, padding, dilation): dilation):
super().__init__() super().__init__()
layers = [nn.MaxPool3d( layers = [nn.MaxPool3d(kernel_size, stride, padding, dilation)]
kernel_size,
stride, padding, dilation)]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append(nn.MaxPool3d( layers.append(nn.MaxPool3d(kernel_size, stride, padding, dilation))
kernel_size, self.net = nn.Sequential(*layers, )
stride, padding, dilation))
self.net = nn.Sequential(
*layers,
)
self.shape = shape self.shape = shape
def forward(self, x): def forward(self, x):
return self.net(x)# .dense() return self.net(x) # .dense()
class SubmanifoldConvTestTorch(nn.Module): class SubmanifoldConvTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, stride): def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride):
super().__init__() super().__init__()
layers = [spconv.SubMConv3d( layers = [
in_channels, out_channels, kernel_size, bias=False, indice_key="subm0")] spconv.SubMConv3d(in_channels,
out_channels,
kernel_size,
bias=False,
indice_key="subm0")
]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append(spconv.SubMConv3d( layers.append(
out_channels, out_channels, kernel_size, bias=False)) spconv.SubMConv3d(out_channels,
self.net = nn.Sequential( out_channels,
*layers, kernel_size,
) bias=False))
self.net = nn.Sequential(*layers, )
self.shape = shape self.shape = shape
def forward(self, features, coors, batch_size): def forward(self, features, coors, batch_size):
coors = coors.int() coors = coors.int()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size ) x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.net(x) return self.net(x)
class SCNCoupleDeConvTest(nn.Module): class SCNCoupleDeConvTest(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
stride): kernel_size, stride):
super().__init__() super().__init__()
self.scn_input = scn.InputLayer(ndim, shape, mode=0) self.scn_input = scn.InputLayer(ndim, shape, mode=0)
self.net = nn.Sequential( self.net = nn.Sequential(
scn.Convolution( scn.Convolution(ndim,
ndim, in_channels,
in_channels, out_channels,
out_channels, kernel_size,
kernel_size, stride,
stride, bias=False),
bias=False), scn.Deconvolution(ndim,
scn.Deconvolution( out_channels,
ndim, in_channels,
out_channels, kernel_size,
in_channels, stride,
kernel_size, bias=False),
stride,
bias=False),
scn.SparseToDense(ndim, in_channels), scn.SparseToDense(ndim, in_channels),
) )
...@@ -267,44 +268,44 @@ class SCNCoupleDeConvTest(nn.Module): ...@@ -267,44 +268,44 @@ class SCNCoupleDeConvTest(nn.Module):
x = self.scn_input((coors, features)) x = self.scn_input((coors, features))
return self.net(x) return self.net(x)
class SparseCoupleDeConvTest(nn.Module): class SparseCoupleDeConvTest(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
stride): kernel_size, stride):
super().__init__() super().__init__()
self.net = spconv.SparseSequential( self.net = spconv.SparseSequential(
spconv.SparseConv3d( spconv.SparseConv3d(in_channels,
in_channels, out_channels,
out_channels, kernel_size,
kernel_size, stride,
stride, indice_key="cp0",
indice_key="cp0", bias=False),
bias=False), spconv.SparseInverseConv3d(out_channels,
spconv.SparseInverseConv3d( in_channels,
out_channels, kernel_size,
in_channels, indice_key="cp0",
kernel_size, bias=False),
indice_key="cp0",
bias=False),
) )
self.todense = spconv.ToDense() self.todense = spconv.ToDense()
self.shape = shape self.shape = shape
def forward(self, features, coors, batch_size): def forward(self, features, coors, batch_size):
coors = coors.int() coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size ) x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.todense(self.net(x))# .dense() return self.todense(self.net(x)) # .dense()
def gather_nd(params, indices): def gather_nd(params, indices):
# this function has a limit that MAX_ADVINDEX_CALC_DIMS=5 # this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
ndim = indices.shape[-1] ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + list(params.shape[indices.shape[-1]:]) output_shape = list(indices.shape[:-1]) + list(
params.shape[indices.shape[-1]:])
flatted_indices = indices.view(-1, ndim) flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)] slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis] slices += [Ellipsis]
return params[slices].view(*output_shape) return params[slices].view(*output_shape)
def scatter_nd(indices, updates, shape): def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd. """pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully this function don't contain except handle code. so use this carefully
...@@ -322,7 +323,6 @@ def scatter_nd(indices, updates, shape): ...@@ -322,7 +323,6 @@ def scatter_nd(indices, updates, shape):
class TestSpConv(TestCase): class TestSpConv(TestCase):
def testSpConv3d(self): def testSpConv3d(self):
np.random.seed(484) np.random.seed(484)
devices = ["cpu:0"] devices = ["cpu:0"]
...@@ -337,36 +337,44 @@ class TestSpConv(TestCase): ...@@ -337,36 +337,44 @@ class TestSpConv(TestCase):
dilations = [1, 2, 3] dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations): strides, paddings, dilations):
if all([s > 1, d > 1]): if all([s > 1, d > 1]):
continue # don't support this. continue # don't support this.
device = torch.device(dev) device = torch.device(dev)
num_points = [1000] * bs num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32) features = np.ascontiguousarray(sparse_dict["features"]).astype(
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32) filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device) features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True features_dense_t.requires_grad = True
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device) net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device) d).to(device)
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
filters_t = torch.from_numpy(filters).to(device) filters_t = torch.from_numpy(filters).to(device)
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous() net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
net.net[0].weight.data[:] = filters_t net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense() out = net(features_t, indices_t, bs).dense()
dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype) dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device) dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t) out.backward(dout_t)
out_ref.backward(dout_t) out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous() din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long()) din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach() din = features_t.grad.detach()
din_np = din.cpu().numpy() din_np = din.cpu().numpy()
...@@ -381,7 +389,7 @@ class TestSpConv(TestCase): ...@@ -381,7 +389,7 @@ class TestSpConv(TestCase):
out_np = out.detach().cpu().numpy() out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy() out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4) self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpDeConv3d(self): def testSpDeConv3d(self):
np.random.seed(484) np.random.seed(484)
devices = ["cuda:0", "cpu:0"] devices = ["cuda:0", "cpu:0"]
...@@ -396,36 +404,44 @@ class TestSpConv(TestCase): ...@@ -396,36 +404,44 @@ class TestSpConv(TestCase):
dilations = [1, 2, 3] dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations): strides, paddings, dilations):
if all([s > 1, d > 1]): if all([s > 1, d > 1]):
continue # don't support this. continue # don't support this.
device = torch.device(dev) device = torch.device(dev)
num_points = [1000] * bs num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32) features = np.ascontiguousarray(sparse_dict["features"]).astype(
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32) filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device) features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True features_dense_t.requires_grad = True
net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device) net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device) d).to(device)
net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
filters_t = torch.from_numpy(filters).to(device) filters_t = torch.from_numpy(filters).to(device)
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1, 2).contiguous() net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
2).contiguous()
net.net[0].weight.data[:] = filters_t net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense() out = net(features_t, indices_t, bs).dense()
dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype) dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device) dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t) out.backward(dout_t)
out_ref.backward(dout_t) out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous() din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long()) din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach() din = features_t.grad.detach()
din_np = din.cpu().numpy() din_np = din.cpu().numpy()
...@@ -440,7 +456,7 @@ class TestSpConv(TestCase): ...@@ -440,7 +456,7 @@ class TestSpConv(TestCase):
out_np = out.detach().cpu().numpy() out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy() out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4) self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpCpConv3d(self): def testSpCpConv3d(self):
np.random.seed(484) np.random.seed(484)
devices = ["cuda:0", "cpu:0"] devices = ["cuda:0", "cpu:0"]
...@@ -455,19 +471,23 @@ class TestSpConv(TestCase): ...@@ -455,19 +471,23 @@ class TestSpConv(TestCase):
dilations = [1, 2, 3] dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s in params_grid( for dev, shape, bs, IC, OC, k, s in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides): strides):
device = torch.device(dev) device = torch.device(dev)
num_points = [1000] * bs num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32) features = np.ascontiguousarray(sparse_dict["features"]).astype(
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32) filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
indices_scn_t = torch.from_numpy(indices[:, [1, 2, 3, 0]]).int().to(device) indices_scn_t = torch.from_numpy(
indices[:, [1, 2, 3, 0]]).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True features_t.requires_grad = True
features_ref_t = torch.from_numpy(features).to(device) features_ref_t = torch.from_numpy(features).to(device)
...@@ -475,11 +495,14 @@ class TestSpConv(TestCase): ...@@ -475,11 +495,14 @@ class TestSpConv(TestCase):
net_ref = SCNCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device) net_ref = SCNCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
net = SparseCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device) net = SparseCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
net_ref.net[0].weight.data[:] = net.net[0].weight.data[:].view(*net_ref.net[0].weight.shape) net_ref.net[0].weight.data[:] = net.net[0].weight.data[:].view(
net_ref.net[1].weight.data[:] = net.net[1].weight.data[:].view(*net_ref.net[1].weight.shape) *net_ref.net[0].weight.shape)
net_ref.net[1].weight.data[:] = net.net[1].weight.data[:].view(
*net_ref.net[1].weight.shape)
out_ref = net_ref(features_ref_t, indices_scn_t, bs) out_ref = net_ref(features_ref_t, indices_scn_t, bs)
out = net(features_t, indices_t, bs) out = net(features_t, indices_t, bs)
dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype) dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device) dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t) out.backward(dout_t)
out_ref.backward(dout_t) out_ref.backward(dout_t)
...@@ -490,15 +513,14 @@ class TestSpConv(TestCase): ...@@ -490,15 +513,14 @@ class TestSpConv(TestCase):
self.assertAllClose(din_ref_np, din_np, atol=1e-4) self.assertAllClose(din_ref_np, din_np, atol=1e-4)
for layer, layer_ref in zip(net.net, net_ref.net): for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy() dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().view(*dw.shape).numpy() dw_ref = layer_ref.weight.grad.detach().cpu().view(
*dw.shape).numpy()
self.assertAllClose(dw, dw_ref, atol=1e-4) self.assertAllClose(dw, dw_ref, atol=1e-4)
out_np = out.detach().cpu().numpy() out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy() out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4) self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpMaxPool3d(self): def testSpMaxPool3d(self):
np.random.seed(485) np.random.seed(485)
devices = ["cuda:0", "cpu:0"] devices = ["cuda:0", "cpu:0"]
...@@ -513,19 +535,25 @@ class TestSpConv(TestCase): ...@@ -513,19 +535,25 @@ class TestSpConv(TestCase):
dilations = [1, 2, 3] dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations): strides, paddings, dilations):
if all([s > 1, d > 1]): if all([s > 1, d > 1]):
continue # don't support this. continue # don't support this.
device = torch.device(dev) device = torch.device(dev)
num_points = [1000] * bs num_points = [1000] * bs
# when data contains negative, sparse maxpool is not equal to dense maxpool. # when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict = generate_sparse_data(shape, num_points, IC, data_range=[0.1, 1]) sparse_dict = generate_sparse_data(shape,
num_points,
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32) IC,
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) data_range=[0.1, 1])
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32) filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True features_t.requires_grad = True
...@@ -540,24 +568,27 @@ class TestSpConv(TestCase): ...@@ -540,24 +568,27 @@ class TestSpConv(TestCase):
outfeatures = out.features outfeatures = out.features
out_dense = out.dense(channels_first=False) out_dense = out.dense(channels_first=False)
out = out_dense.permute(0, 4, 1, 2, 3).contiguous() out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
dout_sparse = np.random.uniform(-0.2, 0.2, outfeatures.shape).astype(features.dtype) dout_sparse = np.random.uniform(
-0.2, 0.2, outfeatures.shape).astype(features.dtype)
dout_sparse_t = torch.from_numpy(dout_sparse).to(device) dout_sparse_t = torch.from_numpy(dout_sparse).to(device)
dout_t = scatter_nd(outids.long(), dout_sparse_t, list(out_dense.shape)) dout_t = scatter_nd(outids.long(), dout_sparse_t,
list(out_dense.shape))
dout_t = dout_t.permute(0, 4, 1, 2, 3).contiguous() dout_t = dout_t.permute(0, 4, 1, 2, 3).contiguous()
out.backward(dout_t) out.backward(dout_t)
out_ref.backward(dout_t) out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous() din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long()) din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach() din = features_t.grad.detach()
din_np = din.cpu().numpy() din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy() din_sparse_np = din_sparse.cpu().numpy()
self.assertAllClose(din_np, din_sparse_np, atol=1e-4) self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
out_np = out.detach().cpu().numpy() out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy() out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4) self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def main(): def main():
# function for develop. # function for develop.
...@@ -567,7 +598,6 @@ def main(): ...@@ -567,7 +598,6 @@ def main():
shapes = [[50, 30, 30]] shapes = [[50, 30, 30]]
batchsizes = [2] batchsizes = [2]
in_channels = [256] in_channels = [256]
out_channels = [256] out_channels = [256]
ksizes = [(3, 1, 1)] ksizes = [(3, 1, 1)]
...@@ -576,8 +606,8 @@ def main(): ...@@ -576,8 +606,8 @@ def main():
dilations = [1] dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations): strides, paddings, dilations):
if all([s > 1, d > 1]): if all([s > 1, d > 1]):
continue continue
device = torch.device(dev) device = torch.device(dev)
...@@ -585,19 +615,25 @@ def main(): ...@@ -585,19 +615,25 @@ def main():
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32) features = np.ascontiguousarray(sparse_dict["features"]).astype(
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices) indices_t = torch.from_numpy(indices)
filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC, OC]).astype(np.float32) filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device).float() indices_t = torch.from_numpy(indices).int().to(device).float()
features_t = torch.from_numpy(features).to(device).float() features_t = torch.from_numpy(features).to(device).float()
features_dense_t = torch.from_numpy(features_dense).to(device).float() features_dense_t = torch.from_numpy(features_dense).to(device).float()
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float() net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float() d).to(device).float()
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device).float()
filters_t = torch.from_numpy(filters).to(device).float() filters_t = torch.from_numpy(filters).to(device).float()
net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous() net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
net.net[0].weight[:] = filters_t net.net[0].weight[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
times = [] times = []
...@@ -607,16 +643,16 @@ def main(): ...@@ -607,16 +643,16 @@ def main():
torch.cuda.synchronize() torch.cuda.synchronize()
times.append(time.time() - t) times.append(time.time() - t)
# print((net.grid == -1).float().sum(), net.grid.numel()) # print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t) # print("spconv time", time.time() - t)
print("spconv time", np.mean(times[2:])) print("spconv time", np.mean(times[2:]))
out = net(features_t, indices_t, bs) out = net(features_t, indices_t, bs)
# print(out.indices) # print(out.indices)
out = out.dense() out = out.dense()
print(np.linalg.norm(out.detach().cpu().numpy() - out_ref.detach().cpu().numpy())) print(
np.linalg.norm(out.detach().cpu().numpy() -
out_ref.detach().cpu().numpy()))
if __name__ == '__main__': if __name__ == '__main__':
main() main()
# unittest.main() # unittest.main()
Subproject commit 085a29436a8c472caaaf7157aa644b571079bcaa Subproject commit 3b1dbebabc801c9cf6f0953a4c20b904d444f879
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment