Commit 19e73bbe authored by Yan Yan's avatar Yan Yan
Browse files

format code with clang-format, better c++ code

parent c336139f
#include <cuhash/hash_table.h>
#include <cuda.h>
#include <cuhash/hash_table.h>
int main(){
auto table = cuhash::HashTable();
table.Initialize(10, 2.0);
const int N = 10;
// ハッシュテーブルに格納するデータ
int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
// デバイスメモリにコピー
int *d_keys, *d_vals;
cudaMalloc((void**)&d_keys, sizeof(int) * N);
cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_vals, sizeof(int) * N);
cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
// ハッシュテーブルにクエリするデータ
int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
int output[N];
// デバイスメモリにコピー
int *d_input, *d_output;
cudaMalloc((void**)&d_input, sizeof(int) * N);
cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_output, sizeof(int) * N);
cudaMemset(d_output, 0, sizeof(int) * N);
bool s = table.Build(N, (const unsigned int *) d_keys,
(const unsigned int *) d_vals);
std::cout << s << std::endl;
table.Retrieve(N, (const unsigned int *) d_input,
(unsigned int *) d_output);
std::cout << s << std::endl;
cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; ++i) {
printf("%d\n", output[i]);
}
return 0;
int main() {
auto table = cuhash::HashTable();
table.Initialize(10, 2.0);
const int N = 10;
// ハッシュテーブルに格納するデータ
int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
// デバイスメモリにコピー
int *d_keys, *d_vals;
cudaMalloc((void **)&d_keys, sizeof(int) * N);
cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_vals, sizeof(int) * N);
cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
// ハッシュテーブルにクエリするデータ
int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
int output[N];
// デバイスメモリにコピー
int *d_input, *d_output;
cudaMalloc((void **)&d_input, sizeof(int) * N);
cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_output, sizeof(int) * N);
cudaMemset(d_output, 0, sizeof(int) * N);
bool s = table.Build(N, (const unsigned int *)d_keys,
(const unsigned int *)d_vals);
std::cout << s << std::endl;
table.Retrieve(N, (const unsigned int *)d_input, (unsigned int *)d_output);
std::cout << s << std::endl;
cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; ++i) {
printf("%d\n", output[i]);
}
return 0;
}
\ No newline at end of file
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/fused_spconv_ops.h>
#include <spconv/nms_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
static auto registry =
torch::RegisterOperators()
.op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
.op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
.op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
.op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>)
.op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>)
.op("spconv::get_indice_pairs_grid_2d",
&spconv::getIndicePairPreGrid<2>)
.op("spconv::get_indice_pairs_grid_3d",
&spconv::getIndicePairPreGrid<3>)
.op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
.op("spconv::fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>)
.op("spconv::fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>)
.op("spconv::fused_indice_conv_fp32",
&spconv::fusedIndiceConvBatchNorm<float>)
.op("spconv::fused_indice_conv_half",
&spconv::fusedIndiceConvBatchNorm<at::Half>)
.op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
.op("spconv::indice_maxpool_backward_fp32",
&spconv::indiceMaxPoolBackward<float>)
......@@ -38,4 +42,5 @@ static auto registry =
&spconv::indiceMaxPoolBackward<at::Half>)
.op("spconv::nms", &spconv::nonMaxSuppression<float>)
.op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
.op("spconv::pillar_scatter_half", &spconv::pointPillarScatter<at::Half>);
.op("spconv::pillar_scatter_half",
&spconv::pointPillarScatter<at::Half>);
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
#include <ATen/Parallel.h>
namespace spconv {
......@@ -45,7 +45,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index hashval;
tsl::robin_map<Index, Index> hash;
......@@ -67,7 +67,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
indicesOut(numAct, 0) = batchIdx;
hashval = numAct++;
hash[index] = hashval;
}else{
} else {
hashval = iter->second;
}
// indicePairs: [K, 2, L]
......@@ -102,7 +102,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index hashval;
tsl::robin_map<Index, Index> hash;
......@@ -125,7 +125,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
indicesOut(numAct, 0) = batchIdx;
hashval = numAct++;
hash[index] = hashval;
}else{
} else {
hashval = iter->second;
}
// indicePairs: [K, 2, L]
......@@ -136,7 +136,6 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
return numAct;
}
#ifndef TV_WINDOWS
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
......@@ -145,7 +144,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation, const Index *const outSpatialShape) {
const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
......@@ -167,12 +167,12 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
spatialVolume * indicesIn(j, 0);
hash[index] = j;
}
at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end){
at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end) {
Index index = 0;
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index oldOffset = 0;
for (int j = begin; j < end; ++j) {
......@@ -186,7 +186,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
spatialVolume * indicesIn(j, 0);
auto iter = hash.find(index);
if (iter != hash.end()) {
#pragma omp atomic capture
#pragma omp atomic capture
oldOffset = indiceNum[offset]++;
indicePairs(offset, 0, oldOffset) = j;
indicePairs(offset, 1, oldOffset) = iter->second;
......@@ -196,7 +196,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
});
return numActIn;
}
#else
#else
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
......@@ -204,7 +204,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation, const Index *const outSpatialShape) {
const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
......@@ -221,7 +222,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
......@@ -255,57 +256,53 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
if (transpose)
return getIndicePairsDeConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut,
gridsOut, indicePairs, indiceNum,
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
else
return getIndicePairsConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut,
gridsOut, indicePairs, indiceNum,
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
return getIndicePairsSubM<Index, IndexGrid, NDim>(
indicesIn,
gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
......@@ -320,4 +317,3 @@ DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_SPECS_INDEX_NDIM
} // namespace spconv
......@@ -14,17 +14,240 @@
#include <ATen/ATen.h>
#include <chrono>
#include <cuhash/hash_table.h>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/indice.h>
#include <spconv/indice.cu.h>
#include <tensorview/helper_launch.h>
#include <spconv/indice.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/torch_utils.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
#include <cuhash/hash_table.h>
namespace spconv {
int create_conv_indice_pair_p1_cuda(
torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = kernelSize.size();
auto numActIn = indicesIn.size(0);
if (numActIn == 0)
return 0;
// dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
using Index = decltype(V);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = I;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
if (transpose) {
prepareDeConvIndicePairsKernel<Index, NDim, 4096>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum),
tv::torch2tv<Index>(indicePairUnique), ks, st, pa, di,
ou);
} else {
prepareIndicePairsKernel<Index, NDim, 4096>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum),
tv::torch2tv<Index>(indicePairUnique), ks, st, pa, di,
ou);
}
});
});
return 1;
}
int create_conv_indice_pair_p2_cuda(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
int numAct = indicePairUnique.size(0) - 1;
auto kernelVolume = indicePairs.size(0);
if (numActIn == 0)
return 0;
// dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
using Index = decltype(V);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = I;
using IndexGrid = int32_t;
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
if (useHash) {
auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl;
table.Initialize(numAct, 2.0, 4);
unsigned *d_values = nullptr;
cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
arangeKernel<unsigned>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(d_values, numAct);
TV_CHECK_CUDA_ERR_V2("arangeKernel failed");
bool res =
table.Build(numAct,
reinterpret_cast<unsigned *>(
tv::torch2tv<Index>(indicePairUnique).data()),
d_values);
cudaFree(d_values);
TV_CHECK_CUDA_ERR_V2("cudaFree failed");
if (!res) {
return -1; // use -1 to tell outside use CPU implementation
}
assignIndiceOutKernel<Index, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut), numAct,
tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
assignIndicePairsHashKernel<Index, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut), numActIn,
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), tableSize,
tableData, constants, stash_constants, stash_count);
} else {
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), numAct,
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), numAct,
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), ou);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
}
if (resetGrid && (!useHash)) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(indicePairUnique.data_ptr<Index>(),
tv::torch2tv<IndexGrid>(gridsOut), numAct);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
});
});
return numAct;
}
int create_submconv_indice_pair_cuda(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
auto kernelVolume = indicePairs.size(0);
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) {
using Index = decltype(V);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = I;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
if (useHash) {
auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl;
table.Initialize(numActIn, 2.0, 4);
unsigned *d_keyvalues = nullptr;
cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
unsigned *d_values = d_keyvalues + numActIn;
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
prepareSubMHashKernel<Index, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn), d_keyvalues, d_values,
ou);
TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
bool res =
table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
reinterpret_cast<unsigned *>(d_values));
cudaFree(d_values);
TV_CHECK_CUDA_ERR_V2("cudaFree failed");
if (!res) {
return -1; // use -1 to tell outside use CPU implementation
}
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
getSubMIndicePairsHashKernel<Index, NDim, 4096>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou,
tableSize, tableData, constants, stash_constants,
stash_count);
} else {
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut), ou);
TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
}
if (resetGrid && (!useHash)) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(indicesIn.data_ptr<Index>(),
tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
});
});
return numActIn;
}
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
......@@ -46,17 +269,17 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
return 0;
// auto timer = spconv::CudaContextTimer<>();
if (transpose)
prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
prepareDeConvIndicePairsKernel<Index, NDim, 4096>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicePairs, indiceNum,
indicePairUnique, kernelSize, stride, padding,
dilation, outSpatialShape);
else
prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
prepareIndicePairsKernel<Index, NDim, 4096>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicePairs, indiceNum,
indicePairUnique, kernelSize, stride, padding,
dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
return 1;
......@@ -78,57 +301,58 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
// after unique, there is a std::numeric_limits<int>::max() in the end of indicePairUnique
Index numAct = indicePairUnique.dim(0) - 1;
if (useHash){
// after unique, there is a std::numeric_limits<int>::max() in the end of
// indicePairUnique
Index numAct = indicePairUnique.dim(0) - 1;
if (useHash) {
auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl;
table.Initialize(numAct, 2.0, 4);
unsigned *d_values = nullptr;
cudaMalloc((void**)&d_values, sizeof(unsigned) * numAct);
cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
arangeKernel<unsigned><<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(d_values, numAct);
bool res = table.Build(numAct, reinterpret_cast<unsigned*>(indicePairUnique.data()),
d_values);
arangeKernel<unsigned>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(d_values, numAct);
bool res = table.Build(
numAct, reinterpret_cast<unsigned *>(indicePairUnique.data()),
d_values);
cudaFree(d_values);
if (!res){
return -1; //use -1 to tell outside use CPU implementation
if (!res) {
return -1; // use -1 to tell outside use CPU implementation
}
assignIndiceOutKernel<Index, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numAct,
indicePairUnique, outSpatialShape, batchSize);
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numAct, indicePairUnique,
outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
assignIndicePairsHashKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numActIn, indicePairs,
indicePairUnique,
tableSize, tableData, constants, stash_constants,
stash_count);
assignIndicePairsHashKernel<Index, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numActIn, indicePairs,
indicePairUnique, tableSize, tableData, constants,
stash_constants, stash_count);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
}else{
} else {
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
}
if (resetGrid && (!useHash)) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
......@@ -152,22 +376,25 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
if (numActIn == 0)
return 0;
// auto timer = spconv::CudaContextTimer<>();
if (useHash){
if (useHash) {
auto table = cuhash::HashTable();
// std::cout << "subm create " << numActIn << " size table..." << std::endl;
// std::cout << "subm create " << numActIn << " size table..." <<
// std::endl;
table.Initialize(numActIn, 2.0, 4);
unsigned *d_keyvalues = nullptr;
cudaMalloc((void**)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
unsigned *d_values = d_keyvalues + numActIn;
prepareSubMHashKernel<Index, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, d_keyvalues, d_values, outSpatialShape);
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, d_keyvalues, d_values,
outSpatialShape);
TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
bool res = table.Build(numActIn, reinterpret_cast<unsigned*>(d_keyvalues),
reinterpret_cast<unsigned*>(d_values));
bool res =
table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
reinterpret_cast<unsigned *>(d_values));
cudaFree(d_keyvalues);
if (!res){
return -1; //use -1 to tell outside use CPU implementation
if (!res) {
return -1; // use -1 to tell outside use CPU implementation
}
auto tableSize = table.get_table_size();
auto tableData = table.data();
......@@ -175,28 +402,30 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
getSubMIndicePairsHashKernel<Index, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicePairs, indiceNum,
kernelSize, stride, padding, dilation, outSpatialShape,
tableSize, tableData, constants, stash_constants,
stash_count);
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicePairs, indiceNum, kernelSize,
stride, padding, dilation, outSpatialShape,
tableSize, tableData, constants, stash_constants,
stash_count);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
}else{
} else {
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation, outSpatialShape);
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation,
outSpatialShape);
TV_CHECK_CUDA_ERR();
}
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if (resetGrid && (!useHash)) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape, numActIn);
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
numActIn);
TV_CHECK_CUDA_ERR();
}
return numActIn;
......
......@@ -16,9 +16,9 @@
#include <chrono>
#include <limits>
#include <spconv/maxpool.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
......@@ -255,7 +255,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] = reinterpret_cast<VecType *>(din)[idxi];
reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
......@@ -263,7 +264,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
bufdi[i] += bufdo[i];
}
}
reinterpret_cast<VecType *>(din)[idxi] = reinterpret_cast<VecType *>(bufdi)[0];
reinterpret_cast<VecType *>(din)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
......@@ -309,7 +311,7 @@ template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
......@@ -318,21 +320,22 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
&notFound](auto NumTLP) {
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
......@@ -340,9 +343,9 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
......@@ -356,7 +359,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
<<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(1).data(),
......@@ -366,7 +369,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
<<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
......@@ -382,7 +385,7 @@ template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
......@@ -392,22 +395,23 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
&indices, &notFound](auto NumTLP) {
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
&indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
dout.data(), din.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
......@@ -415,10 +419,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
......@@ -432,7 +436,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
<<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data(), indices.subview(1).data(),
......@@ -442,7 +446,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
<<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
......
......@@ -21,7 +21,7 @@ namespace spconv {
namespace functor {
template <typename T, typename Index>
struct NonMaxSupressionFunctor<tv::CPU, T, Index> {
struct NonMaxSupressionFunctor<tv::CPU, T, Index> {
Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxes, T threshold, T eps) {
auto ndets = boxes.dim(0);
......@@ -131,7 +131,7 @@ struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_T_INDEX(float, Index); \
DECLARE_CPU_T_INDEX(double, Index);
DECLARE_CPU_T_INDEX(double, Index);
DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);
......
......@@ -2,18 +2,18 @@
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <spconv/reordering.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
......@@ -22,8 +22,7 @@
int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType>
__device__ inline DType devIoU(DType const *const a, DType const *const b)
{
__device__ inline DType devIoU(DType const *const a, DType const *const b) {
DType left = max(a[0], b[0]), right = min(a[2], b[2]);
DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
......@@ -35,44 +34,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)
template <typename DType, int BLOCK_THREADS>
__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
const DType *dev_boxes, unsigned long long *dev_mask)
{
const DType *dev_boxes,
unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size =
min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
__shared__ DType block_boxes[BLOCK_THREADS * 5];
if (threadIdx.x < col_size)
{
if (threadIdx.x < col_size) {
#pragma unroll
for (int i = 0; i < 5; ++i)
{
for (int i = 0; i < 5; ++i) {
block_boxes[threadIdx.x * 5 + i] =
dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
}
}
__syncthreads();
if (threadIdx.x < row_size)
{
if (threadIdx.x < row_size) {
const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
const DType *cur_box = dev_boxes + cur_box_idx * 5;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start)
{
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (int i = start; i < col_size; i++)
{
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh)
{
for (int i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
......@@ -80,4 +71,3 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
......@@ -15,10 +15,10 @@
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
......@@ -43,8 +43,8 @@ struct PointPillarScatter<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> canvas,
tv::TensorView<const T> features,
tv::TensorView<const T> coors) {
auto grid = dim3(tv::launch::DivUp(features.dim(1), 32),
tv::launch::DivUp(features.dim(0), 32));
auto grid = dim3(tv::cuda::DivUp(features.dim(1), 32),
tv::cuda::DivUp(features.dim(0), 32));
pointPillarsScatterKernel<T, Index>
<<<grid, dim3(32, 32), 0, d.getStream()>>>(canvas, features, coors);
TV_CHECK_CUDA_ERR();
......
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/reordering.h>
#include <torch/script.h>
#include <ATen/Parallel.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
void operator()(const tv::CPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end){
at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
for (int i = begin; i < end; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
......@@ -35,16 +36,16 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer, tv::TensorView<const Index> indices,
int size, bool stable) {
void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data();
T* out = outFeatures.data();
const T *buf = buffer.data();
T *out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j){
for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j];
}
}
......@@ -53,9 +54,8 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
......@@ -70,4 +70,3 @@ DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
......@@ -15,11 +15,11 @@
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <spconv/reordering.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
......@@ -30,7 +30,7 @@ template <typename T, typename Index>
struct SparseGatherFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
......@@ -39,8 +39,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP;
......@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(), indices.data(),
nHotBlock, numPlanes / vecloadFactor);
d.getStream()>>>(buffer.data(), features.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
......@@ -60,8 +61,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor);
features.data(), indices.data() + nHotBlock,
size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
notFound = false;
......@@ -73,8 +75,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes);
TV_CHECK_CUDA_ERR();
......@@ -85,7 +87,7 @@ template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
......@@ -95,8 +97,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
tv::mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
......@@ -108,8 +110,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
......@@ -128,8 +130,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes);
......@@ -139,7 +141,6 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
......
......@@ -47,7 +47,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
tv::torch_dispatch<float, double, at::Half>(
tv::dispatch_torch<float, double, at::Half>(
features.scalar_type(), [&](auto I) {
using T = decltype(I);
for (int i = 0; i < kernelVolume; ++i) {
......@@ -68,7 +68,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
......@@ -99,7 +99,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
}
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(
......@@ -158,7 +158,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
tv::torch_dispatch<float, double,
tv::dispatch_torch<float, double,
at::Half>(features.scalar_type(), [&](auto I) {
using T = decltype(I);
for (int i = 0; i < kernelVolume; ++i) {
......@@ -178,7 +178,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
}
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
......@@ -213,7 +213,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
......
......@@ -20,7 +20,7 @@ using namespace pybind11::literals;
PYBIND11_MODULE(spconv_utils, m) {
m.doc() = "util pybind11 functions for spconv";
#ifdef SPCONV_CUDA
#ifdef TV_CUDA
m.def("non_max_suppression", &spconv::non_max_suppression<double>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
......
......@@ -2,30 +2,28 @@
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
#include <vector>
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <spconv/nms_gpu.h>
#include <vector>
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do \
{ \
cudaError_t error = condition; \
if (error != cudaSuccess) \
{ \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType>
__device__ inline DType devIoU(DType const *const a, DType const *const b)
{
__device__ inline DType devIoU(DType const *const a, DType const *const b) {
DType left = max(a[0], b[0]), right = min(a[2], b[2]);
DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
......@@ -37,44 +35,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)
template <typename DType, int BLOCK_THREADS>
__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
const DType *dev_boxes, unsigned long long *dev_mask)
{
const DType *dev_boxes,
unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size =
min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
__shared__ DType block_boxes[BLOCK_THREADS * 5];
if (threadIdx.x < col_size)
{
if (threadIdx.x < col_size) {
#pragma unroll
for (int i = 0; i < 5; ++i)
{
for (int i = 0; i < 5; ++i) {
block_boxes[threadIdx.x * 5 + i] =
dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
}
}
__syncthreads();
if (threadIdx.x < row_size)
{
if (threadIdx.x < row_size) {
const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
const DType *cur_box = dev_boxes + cur_box_idx * 5;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start)
{
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (int i = start; i < col_size; i++)
{
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh)
{
for (int i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
......@@ -83,12 +73,10 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
}
}
void _set_device(int device_id)
{
void _set_device(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id)
{
if (current_device == device_id) {
return;
}
// The call to cudaSetDevice must come before any calls to Get, which
......@@ -98,8 +86,7 @@ void _set_device(int device_id)
template <typename DType, int BLOCK_THREADS>
int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
int boxes_dim, DType nms_overlap_thresh, int device_id)
{
int boxes_dim, DType nms_overlap_thresh, int device_id) {
_set_device(device_id);
DType *boxes_dev = NULL;
......@@ -107,27 +94,21 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS);
CUDA_CHECK(cudaMalloc(&boxes_dev,
boxes_num * boxes_dim * sizeof(DType)));
CUDA_CHECK(cudaMemcpy(boxes_dev,
boxes_host,
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(DType)));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
boxes_num * boxes_dim * sizeof(DType),
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMalloc(&mask_dev,
boxes_num * col_blocks * sizeof(unsigned long long)));
dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS),
DIVUP(boxes_num, BLOCK_THREADS));
dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS), DIVUP(boxes_num, BLOCK_THREADS));
dim3 threads(BLOCK_THREADS);
nms_kernel<DType, BLOCK_THREADS><<<blocks, threads>>>(boxes_num,
nms_overlap_thresh,
boxes_dev,
mask_dev);
nms_kernel<DType, BLOCK_THREADS>
<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0],
mask_dev,
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks,
cudaMemcpyDeviceToHost));
......@@ -135,17 +116,14 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++)
{
for (int i = 0; i < boxes_num; i++) {
int nblock = i / BLOCK_THREADS;
int inblock = i % BLOCK_THREADS;
if (!(remv[nblock] & (1ULL << inblock)))
{
if (!(remv[nblock] & (1ULL << inblock))) {
keep_out[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++)
{
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
......@@ -156,10 +134,15 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
return num_to_keep;
}
//template<>
template int _nms_gpu<float, threadsPerBlock>(int *keep_out, const float *boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id);
//template<>
template int _nms_gpu<double, threadsPerBlock>(int *keep_out, const double *boxes_host, int boxes_num,
int boxes_dim, double nms_overlap_thresh, int device_id);
\ No newline at end of file
// template<>
template int _nms_gpu<float, threadsPerBlock>(int *keep_out,
const float *boxes_host,
int boxes_num, int boxes_dim,
float nms_overlap_thresh,
int device_id);
// template<>
template int _nms_gpu<double, threadsPerBlock>(int *keep_out,
const double *boxes_host,
int boxes_num, int boxes_dim,
double nms_overlap_thresh,
int device_id);
\ No newline at end of file
import horovod.torch as hvd
import time
from pathlib import Path
......@@ -12,6 +11,7 @@ from torch.utils import data
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
import horovod.torch as hvd
import spconv
from spconv.test_utils import generate_sparse_data
......@@ -53,25 +53,47 @@ class FakeClassifier(nn.Module):
def __init__(self):
super().__init__()
self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 8, 3, indice_key="subm1", padding=1, use_hash=False),
spconv.SubMConv3d(3,
8,
3,
indice_key="subm1",
padding=1,
use_hash=False),
nn.BatchNorm1d(8),
nn.ReLU(),
spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False),
nn.BatchNorm1d(16),
nn.ReLU(),
spconv.SubMConv3d(16, 16, 3, indice_key="subm2", padding=1, use_hash=False),
spconv.SubMConv3d(16,
16,
3,
indice_key="subm2",
padding=1,
use_hash=False),
nn.BatchNorm1d(16),
nn.ReLU(),
spconv.SparseConv3d(16, 32, 3, stride=2, padding=1, use_hash=False),
spconv.SparseConv3d(16, 32, 3, stride=2, padding=1,
use_hash=False),
nn.BatchNorm1d(32),
nn.ReLU(),
spconv.SubMConv3d(32, 32, 3, indice_key="subm3", padding=1, use_hash=False),
spconv.SubMConv3d(32,
32,
3,
indice_key="subm3",
padding=1,
use_hash=False),
nn.BatchNorm1d(32),
nn.ReLU(),
spconv.SparseConv3d(32, 64, 3, stride=2, padding=1, use_hash=False),
spconv.SparseConv3d(32, 64, 3, stride=2, padding=1,
use_hash=False),
nn.BatchNorm1d(64),
nn.ReLU(),
spconv.SubMConv3d(64, 64, 3, indice_key="subm4", padding=1, use_hash=False),
spconv.SubMConv3d(64,
64,
3,
indice_key="subm4",
padding=1,
use_hash=False),
nn.BatchNorm1d(64),
nn.ReLU(),
spconv.ToDense() # [64, 2, 8, 8]
......@@ -100,15 +122,16 @@ def run():
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
compression = hvd.Compression.none
optimizer = hvd.DistributedOptimizer(optimizer,
named_parameters=model.named_parameters(),
compression=compression,
op=hvd.Average)
optimizer = hvd.DistributedOptimizer(
optimizer,
named_parameters=model.named_parameters(),
compression=compression,
op=hvd.Average)
for i in tqdm.tqdm(list(range(100))):
# for j in range(4):
# features, indices, label = ds[(i * 4 + j) % len(ds)]
features, indices, label = ds[i % len(ds)]
features_t = torch.from_numpy(features)
indices_t = torch.from_numpy(indices)
......
......@@ -52,25 +52,47 @@ class FakeClassifier(nn.Module):
def __init__(self):
super().__init__()
self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 8, 3, indice_key="subm1", padding=1, use_hash=False),
spconv.SubMConv3d(3,
8,
3,
indice_key="subm1",
padding=1,
use_hash=False),
nn.BatchNorm1d(8),
nn.ReLU(),
spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False),
nn.BatchNorm1d(16),
nn.ReLU(),
spconv.SubMConv3d(16, 16, 3, indice_key="subm2", padding=1, use_hash=False),
spconv.SubMConv3d(16,
16,
3,
indice_key="subm2",
padding=1,
use_hash=False),
nn.BatchNorm1d(16),
nn.ReLU(),
spconv.SparseConv3d(16, 32, 3, stride=2, padding=1, use_hash=False),
spconv.SparseConv3d(16, 32, 3, stride=2, padding=1,
use_hash=False),
nn.BatchNorm1d(32),
nn.ReLU(),
spconv.SubMConv3d(32, 32, 3, indice_key="subm3", padding=1, use_hash=False),
spconv.SubMConv3d(32,
32,
3,
indice_key="subm3",
padding=1,
use_hash=False),
nn.BatchNorm1d(32),
nn.ReLU(),
spconv.SparseConv3d(32, 64, 3, stride=2, padding=1, use_hash=False),
spconv.SparseConv3d(32, 64, 3, stride=2, padding=1,
use_hash=False),
nn.BatchNorm1d(64),
nn.ReLU(),
spconv.SubMConv3d(64, 64, 3, indice_key="subm4", padding=1, use_hash=False),
spconv.SubMConv3d(64,
64,
3,
indice_key="subm4",
padding=1,
use_hash=False),
nn.BatchNorm1d(64),
nn.ReLU(),
spconv.ToDense() # [64, 2, 8, 8]
......@@ -97,7 +119,7 @@ def run():
for i in tqdm.tqdm(list(range(100))):
# for j in range(4):
# features, indices, label = ds[(i * 4 + j) % len(ds)]
features, indices, label = ds[i % len(ds)]
features_t = torch.from_numpy(features)
indices_t = torch.from_numpy(indices)
......
# Copyright 2019 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import unittest
from pathlib import Path
import spconv
import numpy as np
import torch
from torch import nn
import numpy as np
import time
from spconv.test_utils import params_grid, generate_sparse_data, TestCase
import unittest
# import sparseconvnet as scn
from torch import nn
import spconv
from spconv.test_utils import TestCase, generate_sparse_data, params_grid
# import sparseconvnet as scn
class SparseConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride, padding, dilation):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride, padding, dilation):
super().__init__()
layers = [spconv.SparseConv3d(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False,
use_hash=True)]
layers = [
spconv.SparseConv3d(in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False,
use_hash=True)
]
for i in range(1, num_layers):
layers.append(spconv.SparseConv3d(
out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = spconv.SparseSequential(
*layers,
)
layers.append(
spconv.SparseConv3d(out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = spconv.SparseSequential(*layers, )
# self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
self.grid = None
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size, self.grid)
return self.net(x)# .dense()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
self.grid)
return self.net(x) # .dense()
class SubMConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride, padding, dilation):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride, padding, dilation):
super().__init__()
layers = [spconv.SubMConv3d(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)]
layers = [
spconv.SubMConv3d(in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)
]
for i in range(1, num_layers):
layers.append(spconv.SubMConv3d(
out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = spconv.SparseSequential(
*layers,
)
layers.append(
spconv.SubMConv3d(out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = spconv.SparseSequential(*layers, )
self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size, self.grid)
return self.net(x)# .dense()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
self.grid)
return self.net(x) # .dense()
class Conv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride, padding, dilation):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride, padding, dilation):
super().__init__()
layers = [nn.Conv3d(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)]
layers = [
nn.Conv3d(in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)
]
for i in range(1, num_layers):
layers.append(nn.Conv3d(
out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = nn.Sequential(
*layers,
)
layers.append(
nn.Conv3d(out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = nn.Sequential(*layers, )
self.shape = shape
def forward(self, x):
return self.net(x)# .dense()
return self.net(x) # .dense()
class SparseDeConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride, padding, dilation):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride, padding, dilation):
super().__init__()
layers = [spconv.SparseConvTranspose3d(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)]
layers = [
spconv.SparseConvTranspose3d(in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)
]
for i in range(1, num_layers):
layers.append(spconv.SparseConvTranspose3d(
out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = spconv.SparseSequential(
*layers,
)
layers.append(
spconv.SparseConvTranspose3d(out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = spconv.SparseSequential(*layers, )
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size)
return self.net(x)# .dense()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.net(x) # .dense()
class DeConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride, padding, dilation):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride, padding, dilation):
super().__init__()
layers = [nn.ConvTranspose3d(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)]
layers = [
nn.ConvTranspose3d(in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)
]
for i in range(1, num_layers):
layers.append(nn.ConvTranspose3d(
out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = nn.Sequential(
*layers,
)
layers.append(
nn.ConvTranspose3d(out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = nn.Sequential(*layers, )
self.shape = shape
def forward(self, x):
return self.net(x)# .dense()
return self.net(x) # .dense()
class SparseMaxPoolTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, kernel_size,
stride, padding, dilation):
def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
dilation):
super().__init__()
layers = [spconv.SparseMaxPool3d(
kernel_size,
stride, padding, dilation)]
layers = [
spconv.SparseMaxPool3d(kernel_size, stride, padding, dilation)
]
for i in range(1, num_layers):
layers.append(spconv.SparseMaxPool3d(
kernel_size,
stride, padding, dilation))
self.net = spconv.SparseSequential(
*layers,
)
layers.append(
spconv.SparseMaxPool3d(kernel_size, stride, padding, dilation))
self.net = spconv.SparseSequential(*layers, )
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size )
return self.net(x)# .dense()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.net(x) # .dense()
class MaxPool3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, kernel_size,
stride, padding, dilation):
def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
dilation):
super().__init__()
layers = [nn.MaxPool3d(
kernel_size,
stride, padding, dilation)]
layers = [nn.MaxPool3d(kernel_size, stride, padding, dilation)]
for i in range(1, num_layers):
layers.append(nn.MaxPool3d(
kernel_size,
stride, padding, dilation))
self.net = nn.Sequential(
*layers,
)
layers.append(nn.MaxPool3d(kernel_size, stride, padding, dilation))
self.net = nn.Sequential(*layers, )
self.shape = shape
def forward(self, x):
return self.net(x)# .dense()
return self.net(x) # .dense()
class SubmanifoldConvTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, stride):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride):
super().__init__()
layers = [spconv.SubMConv3d(
in_channels, out_channels, kernel_size, bias=False, indice_key="subm0")]
layers = [
spconv.SubMConv3d(in_channels,
out_channels,
kernel_size,
bias=False,
indice_key="subm0")
]
for i in range(1, num_layers):
layers.append(spconv.SubMConv3d(
out_channels, out_channels, kernel_size, bias=False))
self.net = nn.Sequential(
*layers,
)
layers.append(
spconv.SubMConv3d(out_channels,
out_channels,
kernel_size,
bias=False))
self.net = nn.Sequential(*layers, )
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size )
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.net(x)
class SCNCoupleDeConvTest(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride):
super().__init__()
self.scn_input = scn.InputLayer(ndim, shape, mode=0)
self.net = nn.Sequential(
scn.Convolution(
ndim,
in_channels,
out_channels,
kernel_size,
stride,
bias=False),
scn.Deconvolution(
ndim,
out_channels,
in_channels,
kernel_size,
stride,
bias=False),
scn.Convolution(ndim,
in_channels,
out_channels,
kernel_size,
stride,
bias=False),
scn.Deconvolution(ndim,
out_channels,
in_channels,
kernel_size,
stride,
bias=False),
scn.SparseToDense(ndim, in_channels),
)
......@@ -267,44 +268,44 @@ class SCNCoupleDeConvTest(nn.Module):
x = self.scn_input((coors, features))
return self.net(x)
class SparseCoupleDeConvTest(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride):
super().__init__()
self.net = spconv.SparseSequential(
spconv.SparseConv3d(
in_channels,
out_channels,
kernel_size,
stride,
indice_key="cp0",
bias=False),
spconv.SparseInverseConv3d(
out_channels,
in_channels,
kernel_size,
indice_key="cp0",
bias=False),
spconv.SparseConv3d(in_channels,
out_channels,
kernel_size,
stride,
indice_key="cp0",
bias=False),
spconv.SparseInverseConv3d(out_channels,
in_channels,
kernel_size,
indice_key="cp0",
bias=False),
)
self.todense = spconv.ToDense()
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size )
return self.todense(self.net(x))# .dense()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.todense(self.net(x)) # .dense()
def gather_nd(params, indices):
# this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + list(params.shape[indices.shape[-1]:])
output_shape = list(indices.shape[:-1]) + list(
params.shape[indices.shape[-1]:])
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
return params[slices].view(*output_shape)
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully
......@@ -322,7 +323,6 @@ def scatter_nd(indices, updates, shape):
class TestSpConv(TestCase):
def testSpConv3d(self):
np.random.seed(484)
devices = ["cpu:0"]
......@@ -337,36 +337,44 @@ class TestSpConv(TestCase):
dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
if all([s > 1, d > 1]):
continue # don't support this.
continue # don't support this.
device = torch.device(dev)
num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
filters_t = torch.from_numpy(filters).to(device)
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense()
dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t)
out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
......@@ -381,7 +389,7 @@ class TestSpConv(TestCase):
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpDeConv3d(self):
np.random.seed(484)
devices = ["cuda:0", "cpu:0"]
......@@ -396,36 +404,44 @@ class TestSpConv(TestCase):
dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
if all([s > 1, d > 1]):
continue # don't support this.
continue # don't support this.
device = torch.device(dev)
num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True
net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
filters_t = torch.from_numpy(filters).to(device)
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1, 2).contiguous()
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
2).contiguous()
net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense()
dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t)
out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
......@@ -440,7 +456,7 @@ class TestSpConv(TestCase):
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpCpConv3d(self):
np.random.seed(484)
devices = ["cuda:0", "cpu:0"]
......@@ -455,19 +471,23 @@ class TestSpConv(TestCase):
dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides):
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides):
device = torch.device(dev)
num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
indices_scn_t = torch.from_numpy(indices[:, [1, 2, 3, 0]]).int().to(device)
indices_scn_t = torch.from_numpy(
indices[:, [1, 2, 3, 0]]).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_ref_t = torch.from_numpy(features).to(device)
......@@ -475,11 +495,14 @@ class TestSpConv(TestCase):
net_ref = SCNCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
net = SparseCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
net_ref.net[0].weight.data[:] = net.net[0].weight.data[:].view(*net_ref.net[0].weight.shape)
net_ref.net[1].weight.data[:] = net.net[1].weight.data[:].view(*net_ref.net[1].weight.shape)
net_ref.net[0].weight.data[:] = net.net[0].weight.data[:].view(
*net_ref.net[0].weight.shape)
net_ref.net[1].weight.data[:] = net.net[1].weight.data[:].view(
*net_ref.net[1].weight.shape)
out_ref = net_ref(features_ref_t, indices_scn_t, bs)
out = net(features_t, indices_t, bs)
dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t)
out_ref.backward(dout_t)
......@@ -490,15 +513,14 @@ class TestSpConv(TestCase):
self.assertAllClose(din_ref_np, din_np, atol=1e-4)
for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().view(*dw.shape).numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().view(
*dw.shape).numpy()
self.assertAllClose(dw, dw_ref, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpMaxPool3d(self):
np.random.seed(485)
devices = ["cuda:0", "cpu:0"]
......@@ -513,19 +535,25 @@ class TestSpConv(TestCase):
dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
if all([s > 1, d > 1]):
continue # don't support this.
continue # don't support this.
device = torch.device(dev)
num_points = [1000] * bs
# when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict = generate_sparse_data(shape, num_points, IC, data_range=[0.1, 1])
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
sparse_dict = generate_sparse_data(shape,
num_points,
IC,
data_range=[0.1, 1])
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
......@@ -540,24 +568,27 @@ class TestSpConv(TestCase):
outfeatures = out.features
out_dense = out.dense(channels_first=False)
out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
dout_sparse = np.random.uniform(-0.2, 0.2, outfeatures.shape).astype(features.dtype)
dout_sparse = np.random.uniform(
-0.2, 0.2, outfeatures.shape).astype(features.dtype)
dout_sparse_t = torch.from_numpy(dout_sparse).to(device)
dout_t = scatter_nd(outids.long(), dout_sparse_t, list(out_dense.shape))
dout_t = scatter_nd(outids.long(), dout_sparse_t,
list(out_dense.shape))
dout_t = dout_t.permute(0, 4, 1, 2, 3).contiguous()
out.backward(dout_t)
out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy()
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def main():
# function for develop.
......@@ -567,7 +598,6 @@ def main():
shapes = [[50, 30, 30]]
batchsizes = [2]
in_channels = [256]
out_channels = [256]
ksizes = [(3, 1, 1)]
......@@ -576,8 +606,8 @@ def main():
dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
if all([s > 1, d > 1]):
continue
device = torch.device(dev)
......@@ -585,19 +615,25 @@ def main():
sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices)
filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC, OC]).astype(np.float32)
filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device).float()
features_t = torch.from_numpy(features).to(device).float()
features_dense_t = torch.from_numpy(features_dense).to(device).float()
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float()
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float()
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device).float()
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device).float()
filters_t = torch.from_numpy(filters).to(device).float()
net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
net.net[0].weight[:] = filters_t
out_ref = net_ref(features_dense_t)
times = []
......@@ -607,16 +643,16 @@ def main():
torch.cuda.synchronize()
times.append(time.time() - t)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
# print("spconv time", time.time() - t)
print("spconv time", np.mean(times[2:]))
out = net(features_t, indices_t, bs)
# print(out.indices)
out = out.dense()
print(np.linalg.norm(out.detach().cpu().numpy() - out_ref.detach().cpu().numpy()))
print(
np.linalg.norm(out.detach().cpu().numpy() -
out_ref.detach().cpu().numpy()))
if __name__ == '__main__':
main()
# unittest.main()
Subproject commit 085a29436a8c472caaaf7157aa644b571079bcaa
Subproject commit 3b1dbebabc801c9cf6f0953a4c20b904d444f879
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment