Commit d17a00e0 authored by yanyan's avatar yanyan
Browse files

sync some code

parent 11bcbbf6
...@@ -269,7 +269,7 @@ __global__ void getSubMIndicePairsKernel( ...@@ -269,7 +269,7 @@ __global__ void getSubMIndicePairsKernel(
template <typename Index, typename IndexGrid, unsigned K0, unsigned K1, template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
unsigned K2> unsigned K2>
__global__ void getSubMIndicePairsKernel3( __global__ void getSubMIndicePairsUnrollKernel3(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut, tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) { const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
...@@ -290,25 +290,26 @@ __global__ void getSubMIndicePairsKernel3( ...@@ -290,25 +290,26 @@ __global__ void getSubMIndicePairsKernel3(
#pragma unroll #pragma unroll
for (int k = 0; k < K2; ++k) { for (int k = 0; k < K2; ++k) {
offset = i * K1 * K2 + j * K2 + k; offset = i * K1 * K2 + j * K2 + k;
if (offset > center){ if (offset > center) {
continue; continue;
} }
if (center == offset){ if (center == offset) {
// center of subm indice pairs dont need atomicadd // center of subm indice pairs dont need atomicadd
indicePairs(1, offset, ix) = ix; indicePairs(1, offset, ix) = ix;
indicePairs(0, offset, ix) = ix; indicePairs(0, offset, ix) = ix;
}else{ } else {
point[2] = indice_data[3] - k + K2 / 2; point[2] = indice_data[3] - k + K2 / 2;
point[1] = indice_data[2] - j + K1 / 2; point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2; point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[2] >= 0 && if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
point[2] < outSpatialShape[2] && point[0] >= 0 && point[2] >= 0 && point[2] < outSpatialShape[2] &&
point[0] < outSpatialShape[0]) { point[0] >= 0 && point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<3, 3>::runPtrs( index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
point, outSpatialShape.data(), 0) + point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0]; spatialVolume * indice_data[0];
if (gridsOut[index] != -1) { if (gridsOut[index] != -1) {
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1] // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
// - 1]
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1)); atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index]; indicePairs(1, offset, oldNum) = gridsOut[index];
...@@ -325,7 +326,7 @@ __global__ void getSubMIndicePairsKernel3( ...@@ -325,7 +326,7 @@ __global__ void getSubMIndicePairsKernel3(
} }
template <typename Index, typename IndexGrid, unsigned K0, unsigned K1> template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
__global__ void getSubMIndicePairsKernel2( __global__ void getSubMIndicePairsUnrollKernel2(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut, tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume) { const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume) {
...@@ -344,14 +345,14 @@ __global__ void getSubMIndicePairsKernel2( ...@@ -344,14 +345,14 @@ __global__ void getSubMIndicePairsKernel2(
#pragma unroll #pragma unroll
for (int j = 0; j < K1; ++j) { for (int j = 0; j < K1; ++j) {
offset = i * K1 + j; offset = i * K1 + j;
if (offset > center){ if (offset > center) {
continue; continue;
} }
if (center == offset){ if (center == offset) {
// center of subm indice pairs dont need atomicadd // center of subm indice pairs dont need atomicadd
indicePairs(1, offset, ix) = ix; indicePairs(1, offset, ix) = ix;
indicePairs(0, offset, ix) = ix; indicePairs(0, offset, ix) = ix;
}else{ } else {
point[1] = indice_data[2] - j + K1 / 2; point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2; point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 && if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
...@@ -418,6 +419,130 @@ __global__ void getSubMIndicePairsHashKernel( ...@@ -418,6 +419,130 @@ __global__ void getSubMIndicePairsHashKernel(
} }
} }
template <typename Index, unsigned K0, unsigned K1, unsigned K2,
unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashUnrollKernel3(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume,
unsigned table_size, const cuhash::Entry *table,
cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
unsigned stash_count) {
auto numActIn = indicesIn.dim(0);
Index index = 0;
Index offset;
Index point[3];
constexpr unsigned KV = K0 * K1 * K2;
constexpr unsigned center = KV / 2;
*(indiceNum.data() + center) = numActIn;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (3 + 1);
#pragma unroll
for (int i = 0; i < K0; ++i) {
#pragma unroll
for (int j = 0; j < K1; ++j) {
#pragma unroll
for (int k = 0; k < K2; ++k) {
offset = i * K1 * K2 + j * K2 + k;
if (offset > center) {
continue;
}
if (center == offset) {
// center of subm indice pairs dont need atomicadd
indicePairs(1, offset, ix) = ix;
indicePairs(0, offset, ix) = ix;
} else {
point[2] = indice_data[3] - k + K2 / 2;
point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
point[2] >= 0 && point[2] < outSpatialShape[2] &&
point[0] >= 0 && point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
auto val =
cuhash::retrieve((unsigned)(index), table_size, table,
constants, stash_constants, stash_count);
if (val != cuhash::kNotFound) {
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
// - 1]
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = val;
indicePairs(0, offset, oldNum) = ix;
indicePairs(1, KV - offset - 1, oldNum) = ix;
indicePairs(0, KV - offset - 1, oldNum) = val;
}
}
}
}
}
}
}
}
template <typename Index, unsigned K0, unsigned K1,
unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashUnrollKernel2(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume,
unsigned table_size, const cuhash::Entry *table,
cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
unsigned stash_count) {
auto numActIn = indicesIn.dim(0);
Index index = 0;
Index offset;
Index point[2];
constexpr unsigned KV = K0 * K1;
constexpr unsigned center = KV / 2;
*(indiceNum.data() + center) = numActIn;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (2 + 1);
#pragma unroll
for (int i = 0; i < K0; ++i) {
#pragma unroll
for (int j = 0; j < K1; ++j) {
offset = i * K1 + j;
if (offset > center) {
continue;
}
if (center == offset) {
// center of subm indice pairs dont need atomicadd
indicePairs(1, offset, ix) = ix;
indicePairs(0, offset, ix) = ix;
} else {
point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
auto val =
cuhash::retrieve((unsigned)(index), table_size, table,
constants, stash_constants, stash_count);
if (val != cuhash::kNotFound) {
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i -
// 1]
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = val;
indicePairs(0, offset, oldNum) = ix;
indicePairs(1, KV - offset - 1, oldNum) = ix;
indicePairs(0, KV - offset - 1, oldNum) = val;
}
}
}
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique, __global__ void resetGridKernel(const Index *indicePairUnique,
tv::TensorView<IndexGrid> gridsOut, tv::TensorView<IndexGrid> gridsOut,
...@@ -437,22 +562,13 @@ template <typename Index, typename IndexGrid, unsigned NDim> ...@@ -437,22 +562,13 @@ template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void __global__ void
resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut, resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape, const tv::SimpleVector<Index, NDim> outSpatialShape,
int numAct) { int numAct, Index spatialVolume) {
Index outSpatialShapeReg[NDim];
for (int i = 0; i < NDim; ++i) {
outSpatialShapeReg[i] = outSpatialShape[i];
}
Index spatialVolume = 1;
auto indsPtr = indices; auto indsPtr = indices;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index index; Index index;
for (int ix : tv::KernelLoopX<int>(numAct)) { for (int ix : tv::KernelLoopX<int>(numAct)) {
indsPtr = indices + ix * (NDim + 1); indsPtr = indices + ix * (NDim + 1);
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indsPtr + 1, index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indsPtr + 1,
outSpatialShapeReg, 0); outSpatialShape.data(), 0);
gridsOut[index + spatialVolume * indsPtr[0]] = -1; gridsOut[index + spatialVolume * indsPtr[0]] = -1;
} }
} }
......
...@@ -27,7 +27,7 @@ enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 }; ...@@ -27,7 +27,7 @@ enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
// torch.jit's doc says only support int64, so we need to convert to int32. // torch.jit's doc says only support int64, so we need to convert to int32.
std::vector<torch::Tensor> std::vector<torch::Tensor>
getIndicePairs(torch::Tensor indices, int64_t batchSize, getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride, std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
......
#include <tensorview/tensor.h>
namespace spconv {
enum HashTypes {
kDenseMap = 0,
kCUDPPHash = 1
};
template <int Impl>
struct HashMap;
template<>
struct HashMap<kDenseMap>{
};
}
\ No newline at end of file
...@@ -510,6 +510,20 @@ struct DispatchInt<T<Args...>> { ...@@ -510,6 +510,20 @@ struct DispatchInt<T<Args...>> {
} }
}; };
template <class T> struct DispatchIntNoexcept;
template <template <class...> class T, class... Args>
struct DispatchIntNoexcept<T<Args...>> {
template <typename F> inline bool operator()(int t, F &&f) {
return dispatch_int_noexcept<Args::value...>(t, std::forward<F>(f));
}
template <typename F, typename BinaryPredicate>
inline bool operator()(int t, BinaryPredicate p, F &&f) {
return dispatch_int_noexcept<Args::value...>(t, p, std::forward<F>(f));
}
};
constexpr size_t kTensorMaxDim = 10; constexpr size_t kTensorMaxDim = 10;
using TensorShape = ShapeBase<kTensorMaxDim, int64_t>; using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;
......
...@@ -68,6 +68,8 @@ class SparseConvTensor(object): ...@@ -68,6 +68,8 @@ class SparseConvTensor(object):
self.spatial_shape = spatial_shape self.spatial_shape = spatial_shape
self.batch_size = batch_size self.batch_size = batch_size
self.indice_dict = {} self.indice_dict = {}
if grid is None:
grid = torch.Tensor() # empty tensor
self.grid = grid self.grid = grid
@classmethod @classmethod
......
...@@ -88,23 +88,13 @@ def get_indice_pairs(indices, ...@@ -88,23 +88,13 @@ def get_indice_pairs(indices,
else: else:
out_shape = spatial_shape out_shape = spatial_shape
if grid is None: if grid is None:
res = torch.ops.spconv.get_indice_pairs(indices, batch_size, out_shape, grid = torch.Tensor()
spatial_shape, ksize, stride, res = torch.ops.spconv.get_indice_pairs(indices, grid, batch_size, out_shape,
padding, dilation, out_padding, spatial_shape, ksize, stride,
int(subm), int(transpose), padding, dilation, out_padding,
int(use_hash)) int(subm), int(transpose),
return res int(use_hash))
else: return res
if ndim == 2:
get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_2d
elif ndim == 3:
get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_3d
else:
raise NotImplementedError
return get_indice_pairs_func(indices, grid, batch_size, out_shape,
spatial_shape, ksize, stride, padding,
dilation, out_padding, int(subm),
int(transpose), int(use_hash))
def indice_conv(features, def indice_conv(features,
......
...@@ -46,7 +46,6 @@ class SparseMaxPool(SparseModule): ...@@ -46,7 +46,6 @@ class SparseMaxPool(SparseModule):
padding = [padding] * ndim padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)): if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim dilation = [dilation] * ndim
self.ndim = ndim self.ndim = ndim
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.stride = stride self.stride = stride
...@@ -61,6 +60,8 @@ class SparseMaxPool(SparseModule): ...@@ -61,6 +60,8 @@ class SparseMaxPool(SparseModule):
indices = input.indices indices = input.indices
spatial_shape = input.spatial_shape spatial_shape = input.spatial_shape
batch_size = input.batch_size batch_size = input.batch_size
torch.cuda.synchronize()
t = time.time()
if not self.subm: if not self.subm:
out_spatial_shape = ops.get_conv_output_size( out_spatial_shape = ops.get_conv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding, spatial_shape, self.kernel_size, self.stride, self.padding,
...@@ -69,11 +70,14 @@ class SparseMaxPool(SparseModule): ...@@ -69,11 +70,14 @@ class SparseMaxPool(SparseModule):
out_spatial_shape = spatial_shape out_spatial_shape = spatial_shape
outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs( outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
indices, batch_size, spatial_shape, self.kernel_size, self.stride, indices, batch_size, spatial_shape, self.kernel_size, self.stride,
self.padding, self.dilation, 0, self.subm) self.padding, self.dilation, 0, self.subm, grid=input.grid)
out_features = Fsp.indice_maxpool(features, indice_pairs.to(device), out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
indice_pairs_num.to(device), indice_pairs_num.to(device),
outids.shape[0]) outids.shape[0])
torch.cuda.synchronize()
print("maxpool", spatial_shape, time.time() - t)
out_tensor = spconv.SparseConvTensor(out_features, outids, out_tensor = spconv.SparseConvTensor(out_features, outids,
out_spatial_shape, batch_size) out_spatial_shape, batch_size)
out_tensor.indice_dict = input.indice_dict out_tensor.indice_dict = input.indice_dict
......
...@@ -23,7 +23,6 @@ static auto registry = ...@@ -23,7 +23,6 @@ static auto registry =
torch::RegisterOperators() torch::RegisterOperators()
.op("spconv::get_indice_pairs", &spconv::getIndicePairs) .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
.op("spconv::indice_conv", &spconv::indiceConv) .op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_batch", &spconv::indiceConvBatch)
.op("spconv::indice_conv_backward", &spconv::indiceConvBackward) .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
.op("spconv::fused_indice_conv_bn", &spconv::fusedIndiceConvBatchNorm) .op("spconv::fused_indice_conv_bn", &spconv::fusedIndiceConvBatchNorm)
.op("spconv::indice_maxpool", &spconv::indiceMaxPool) .op("spconv::indice_maxpool", &spconv::indiceMaxPool)
......
...@@ -42,6 +42,8 @@ int create_conv_indice_pair_p1_cuda( ...@@ -42,6 +42,8 @@ int create_conv_indice_pair_p1_cuda(
auto ndim = kernelSize.size(); auto ndim = kernelSize.size();
auto numActIn = indicesIn.size(0); auto numActIn = indicesIn.size(0);
auto kernelVolume = indiceNum.size(0); auto kernelVolume = indiceNum.size(0);
// auto timer = spconv::CudaContextTimer<>();
if (numActIn == 0) if (numActIn == 0)
return 0; return 0;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) { tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
...@@ -77,6 +79,7 @@ int create_conv_indice_pair_p1_cuda( ...@@ -77,6 +79,7 @@ int create_conv_indice_pair_p1_cuda(
pa, di, ou); pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed"); TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
} }
// tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes( checkCudaErrors(cudaFuncGetAttributes(
...@@ -193,10 +196,6 @@ int create_conv_indice_pair_p2_cuda( ...@@ -193,10 +196,6 @@ int create_conv_indice_pair_p2_cuda(
return numAct; return numAct;
} }
template <typename T> struct is_valid {
__device__ __forceinline__ bool operator()(const T x) { return x != -1; }
};
int create_submconv_indice_pair_cuda( int create_submconv_indice_pair_cuda(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs, torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize, torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
...@@ -226,6 +225,8 @@ int create_submconv_indice_pair_cuda( ...@@ -226,6 +225,8 @@ int create_submconv_indice_pair_cuda(
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i]; spatialVolume *= outSpatialShape[i];
} }
auto dispatcher = tv::DispatchIntNoexcept<tv::mp_list_c<int, 1, 3, 5>>();
if (useHash) { if (useHash) {
auto table = cuhash::HashTable(); auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl; // std::cout << "create " << numAct << " size table..." << std::endl;
...@@ -252,18 +253,71 @@ int create_submconv_indice_pair_cuda( ...@@ -252,18 +253,71 @@ int create_submconv_indice_pair_cuda(
auto constants = table.get_constants_4(); auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants(); auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count(); auto stash_count = table.get_stash_count();
tv::DispatchInt<max_kernel_vol_t>()( bool dilation_one = true;
kernelVolume, std::less_equal<int>(), [&](auto I2) { for (int i = 0; i < NDim; ++i) {
constexpr int MaxKernelVolume = decltype(I2)::value; dilation_one &= di[i] == 1;
getSubMIndicePairsHashKernel<Index, NDim, MaxKernelVolume> }
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, auto found = false;
0, stream>>>(tv::torch2tv<Index>(indicesIn), if (dilation_one && (NDim == 2 || NDim == 3)) {
tv::torch2tv<Index>(indicePairs), auto indiceNumCpu = indiceNum.cpu();
tv::torch2tv<Index>(indiceNum), ks, st, pa, if (NDim == 2) {
di, ou, tableSize, tableData, constants, tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
stash_constants, stash_count); outSpatialShape.end());
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
dispatcher(kernelSize[0], [&](auto K0C) {
dispatcher(kernelSize[1], [&](auto K1C) {
constexpr int K0 = decltype(K0C)::value;
constexpr int K1 = decltype(K1C)::value;
found = true;
getSubMIndicePairsHashUnrollKernel2<Index, K0, K1>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume,
tableSize, tableData, constants, stash_constants,
stash_count);
});
}); });
} else if (NDim == 3) {
tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
outSpatialShape.end());
dispatcher(kernelSize[0], [&](auto K0C) {
dispatcher(kernelSize[1], [&](auto K1C) {
dispatcher(kernelSize[2], [&](auto K2C) {
constexpr int K0 = decltype(K0C)::value;
constexpr int K1 = decltype(K1C)::value;
constexpr int K2 = decltype(K2C)::value;
found = true;
getSubMIndicePairsHashUnrollKernel3<Index, K0, K1, K2>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume,
tableSize, tableData, constants, stash_constants,
stash_count);
});
});
});
}
}
if (!found) {
tv::DispatchInt<max_kernel_vol_t>()(
kernelVolume, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = decltype(I2)::value;
getSubMIndicePairsHashKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou,
tableSize, tableData, constants, stash_constants,
stash_count);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
});
}
} else { } else {
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel<Index, IndexGrid, NDim> prepareSubMGridKernel<Index, IndexGrid, NDim>
...@@ -280,16 +334,17 @@ int create_submconv_indice_pair_cuda( ...@@ -280,16 +334,17 @@ int create_submconv_indice_pair_cuda(
auto found = false; auto found = false;
if (dilation_one && (NDim == 2 || NDim == 3)) { if (dilation_one && (NDim == 2 || NDim == 3)) {
auto indiceNumCpu = indiceNum.cpu(); auto indiceNumCpu = indiceNum.cpu();
if (NDim == 2) { if (NDim == 2) {
tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(), tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
outSpatialShape.end()); outSpatialShape.end());
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) { dispatcher(kernelSize[0], [&](auto K0C) {
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) { dispatcher(kernelSize[1], [&](auto K1C) {
constexpr int K0 = decltype(K0C)::value; constexpr int K0 = decltype(K0C)::value;
constexpr int K1 = decltype(K1C)::value; constexpr int K1 = decltype(K1C)::value;
found = true; found = true;
getSubMIndicePairsKernel2<Index, IndexGrid, K0, K1> getSubMIndicePairsUnrollKernel2<Index, IndexGrid, K0, K1>
<<<tv::cuda::getBlocks(numActIn), <<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>( tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesIn),
...@@ -301,24 +356,21 @@ int create_submconv_indice_pair_cuda( ...@@ -301,24 +356,21 @@ int create_submconv_indice_pair_cuda(
} else if (NDim == 3) { } else if (NDim == 3) {
tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(), tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
outSpatialShape.end()); outSpatialShape.end());
dispatcher(kernelSize[0], [&](auto K0C) {
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) { dispatcher(kernelSize[1], [&](auto K1C) {
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) { dispatcher(kernelSize[2], [&](auto K2C) {
tv::dispatch_int_noexcept<1, 3, 5>( constexpr int K0 = decltype(K0C)::value;
kernelSize[2], [&](auto K2C) { constexpr int K1 = decltype(K1C)::value;
constexpr int K0 = decltype(K0C)::value; constexpr int K2 = decltype(K2C)::value;
constexpr int K1 = decltype(K1C)::value; found = true;
constexpr int K2 = decltype(K2C)::value; getSubMIndicePairsUnrollKernel3<Index, IndexGrid, K0, K1, K2>
found = true; <<<tv::cuda::getBlocks(numActIn),
getSubMIndicePairsKernel3<Index, IndexGrid, K0, K1, K2> tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
<<<tv::cuda::getBlocks(numActIn), tv::torch2tv<Index>(indicesIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>( tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
tv::torch2tv<Index>(indicePairs), });
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume);
});
}); });
}); });
} }
...@@ -344,7 +396,7 @@ int create_submconv_indice_pair_cuda( ...@@ -344,7 +396,7 @@ int create_submconv_indice_pair_cuda(
resetGridSubMKernel<Index, IndexGrid, NDim> resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(indicesIn.data_ptr<Index>(), stream>>>(indicesIn.data_ptr<Index>(),
tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn); tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn, spatialVolume);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed"); TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
} }
}); });
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
namespace spconv { namespace spconv {
std::vector<torch::Tensor> std::vector<torch::Tensor>
getIndicePairs(torch::Tensor indices, int64_t batchSize, getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride, std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
...@@ -47,8 +47,11 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize, ...@@ -47,8 +47,11 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
if (useHash) { if (useHash) {
gridSize = batchSize; gridSize = batchSize;
} }
torch::Tensor gridOut = torch::full( bool resetGrid = gridOut.numel() != 0;
{gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device())); if (!resetGrid){
gridOut = torch::full(
{gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
}
gridOut = gridOut.view({batchSize, -1}); gridOut = gridOut.view({batchSize, -1});
int64_t numActOut = -1; int64_t numActOut = -1;
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
...@@ -68,7 +71,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize, ...@@ -68,7 +71,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
else if (indices.device().type() == torch::kCUDA) { else if (indices.device().type() == torch::kCUDA) {
numActOut = create_submconv_indice_pair_cuda( numActOut = create_submconv_indice_pair_cuda(
indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding, indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
dilation, outSpatialShape, transpose, false, useHash); dilation, outSpatialShape, transpose, resetGrid, useHash);
if (numActOut == -1) { if (numActOut == -1) {
auto device = indices.device(); auto device = indices.device();
indicePairs = indicePairs.to({torch::kCPU}); indicePairs = indicePairs.to({torch::kCPU});
...@@ -98,10 +101,10 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize, ...@@ -98,10 +101,10 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
if (indices.device().type() == torch::kCPU) { if (indices.device().type() == torch::kCPU) {
numActOut = create_conv_indice_pair_cpu( numActOut = create_conv_indice_pair_cpu(
indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride, indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,
padding, dilation, outSpatialShape, transpose, false, useHash); padding, dilation, outSpatialShape, transpose, resetGrid, useHash);
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) { else if (indices.device().type() == torch::kCUDA) {
numActOut = create_conv_indice_pair_p1_cuda( numActOut = create_conv_indice_pair_p1_cuda(
indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride, indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape, transpose); padding, dilation, outSpatialShape, transpose);
...@@ -110,7 +113,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize, ...@@ -110,7 +113,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
indicePairUnique = std::get<0>(res); indicePairUnique = std::get<0>(res);
numActOut = create_conv_indice_pair_p2_cuda( numActOut = create_conv_indice_pair_p2_cuda(
indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique, indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique,
outSpatialShape, transpose, false, useHash); outSpatialShape, transpose, resetGrid, useHash);
if (numActOut == -1) { if (numActOut == -1) {
auto device = indices.device(); auto device = indices.device();
outInds = outInds.to({torch::kCPU}); outInds = outInds.to({torch::kCPU});
...@@ -188,7 +191,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -188,7 +191,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double totalGatherTime = 0; double totalGatherTime = 0;
double totalGEMMTime = 0; double totalGEMMTime = 0;
double totalSAddTime = 0; double totalSAddTime = 0;
// tv::ssprint("first subm gemm time", timer.report() / 1000.0); // tv::ssprint("first subm gemm time", timer.report() / 1000.0, std::vector<int>(indicePairNumCpu.data_ptr<int>(),
// indicePairNumCpu.data_ptr<int>() + kernelVolume));
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
...@@ -237,6 +241,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -237,6 +241,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
// totalSAddTime += timer.report() / 1000.0; // totalSAddTime += timer.report() / 1000.0;
} }
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime); // tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
// tv::ssprint("final subm gemm time", timer.report() / 1000.0);
return output; return output;
} }
......
import torch
import spconv
import numpy as np
from spconv.utils import VoxelGeneratorV2
from pathlib import Path
from torch import nn
import time
def waymo_data(batch_size=1):
gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1, 150000)
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
pc = data["pc"]
data = gen.generate(pc)
voxels = data["voxels"].reshape(-1, 3)
coors = data["coordinates"]
N = coors.shape[0]
coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
return voxels, coors, gen.grid_size
class Net(nn.Module):
def __init__(self,
shape):
super().__init__()
self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0"),
spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0"),
# nn.BatchNorm1d(32),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(64, 96, 3, bias=False, indice_key="c1"),
spconv.SubMConv3d(96, 96, 3, bias=False, indice_key="c1"),
# nn.BatchNorm1d(64),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(96, 128, 3, bias=False, indice_key="c2"),
spconv.SubMConv3d(128, 128, 3, bias=False, indice_key="c2"),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(128, 160, 3, bias=False, indice_key="c3"),
spconv.SubMConv3d(160, 160, 3, bias=False, indice_key="c3"),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(160, 192, 3, bias=False, indice_key="c4"),
spconv.SubMConv3d(192, 192, 3, bias=False, indice_key="c4"),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(192, 224, 3, bias=False, indice_key="c5"),
spconv.SubMConv3d(224, 224, 3, bias=False, indice_key="c5"),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(224, 256, 3, bias=False, indice_key="c6"),
spconv.SubMConv3d(256, 256, 3, bias=False, indice_key="c6"),
)
max_batch_size = 1
self.grid = torch.full([max_batch_size, *shape], -1, dtype=torch.int32).cuda()
# self.grid = None
self.shape = shape
def forward(self, features, coors, batch_size):
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
self.grid)
return self.net(x)
def main():
voxels, coors, spatial_shape = waymo_data()
voxels_th = torch.from_numpy(voxels).cuda().float()
coors_th = torch.from_numpy(coors).cuda()
net = Net(spatial_shape[::-1]).cuda().eval().float()
print(coors_th.shape)
out = net(voxels_th, coors_th, 1)
print(out.spatial_shape)
times = []
with torch.no_grad():
for i in range(20):
torch.cuda.synchronize()
t = time.time()
out = net(voxels_th, coors_th, 1)
torch.cuda.synchronize()
times.append(time.time() - t)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
print("spconv time", np.mean(times[10:]))
if __name__ == "__main__":
main()
\ No newline at end of file
...@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32): ...@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
if __name__ == '__main__': if __name__ == '__main__':
main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32) main(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.half) main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
# TestCase().assertAllClose(out_my, out_ref) # TestCase().assertAllClose(out_my, out_ref)
# unittest.main() # unittest.main()
# TestSpConv().testSpConv3d() # TestSpConv().testSpConv3d()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment